#!/usr/bin/env Rscript

### This script consolidates the whole pipeline: it fetches the articles,
### extracts information from every article and collects the results in a
### single data frame (`results`) that is meant to be formatted for rasa.

library(pbapply)
library(rvest)
library(wikiproc)

## Fetch data ----

cat("Starting data import...\n")
articles <- getData(
  use.cache = FALSE,
  write.cache = TRUE,
  data.dir = "../../data/"
)

## Data processing ----

cat("Processing data:\n")

# Iterate over row indices instead of calling apply() on the data frame:
# apply() first converts the data frame to a matrix, which coerces every
# column to one common type and hands the callback named vector elements
# whose names then leak into the per-row results.
results <- pblapply(seq_len(nrow(articles)), function(i) {
  # Columns are addressed by position: column 1 is the Title, column 2 the
  # PageID etc.; column 4 is the text handed to the extraction helpers.
  # as.character() keeps the values character, as they were under apply().
  title <- as.character(articles[[1]][i])
  content <- as.character(articles[[4]][i])

  ## Data cleaning
  cleaned.text <- cleanHtml(content)

  ## Data preprocessing/annotating
  # annotation <- createAnnotations(
  #   cleaned.text, articles[[2]][i], articles[[3]][i]
  # )

  ## Extract information from Text
  no.spouses <- getNoOfSpouses(content)

  ## Create Results (one single-row data frame per article)
  data.frame(Name = title, NoSpouses = no.spouses, stringsAsFactors = FALSE)
})

# Stack the per-article rows into one data frame.
results <- do.call(rbind, results)

# With zero articles do.call(rbind, list()) yields NULL; keep the result a
# data frame with the expected columns so later steps see a stable shape.
# NOTE(review): NoSpouses is assumed to be numeric -- confirm against
# getNoOfSpouses().
if (is.null(results)) {
  results <- data.frame(
    Name = character(0),
    NoSpouses = numeric(0),
    stringsAsFactors = FALSE
  )
}

cat("Data processing finished.\n")

## Results are now in results

## Format for rasa ----

cat("Writing rasa files to 'rasa/'...\n")

# someFormatFunction(results)

# Report completion of the rasa step (previously this repeated the
# "Data processing finished." message of the step above).
cat("Finished writing rasa files.\n")