#!/usr/bin/env Rscript

### Pipeline driver: imports the article data, extracts information from
### every article and collects the per-article results in `results`.
### The rasa export at the end is still a placeholder.

library(pbapply)
library(rvest)
library(wikiproc)
library(rprojroot)

## Fetch data ----

cat("Starting data import...\n")

# Locate the project root via its README.md marker file; the data directory
# is the "data" folder directly below it.
project_root <- find_root(has_file("README.md"))
data_dir <- file.path(project_root, "data")

articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)

## Data processing ----

cat("Processing data:\n")

# Iterate over row indices instead of apply()-ing over the data frame:
# apply() first coerces the whole table to a matrix and hands the function a
# named vector per row, so the column name would leak into the result as a
# row name. seq_len() also yields an empty sequence (and therefore no calls)
# when there are no articles.
results <- pblapply(seq_len(nrow(articles)), function(i) {
  # Columns are addressed by position: column 1 is the Title, column 2 the
  # PageID, and column 4 is the content passed to the wikiproc helpers below.
  title <- articles[[i, 1L]]
  article_content <- articles[[i, 4L]]

  ## Data cleaning
  # NOTE(review): the cleaned text is currently only consumed by the
  # commented-out annotation step below -- confirm whether
  # getNoOfSpouses() should receive it instead of the raw content.
  cleaned_text <- wikiproc:::cleanHtml(article_content)

  ## Data preprocessing/annotating
  # annotation <- createAnnotations(cleaned_text, articles[[i, 2L]],
  #                                 articles[[i, 3L]])

  ## Extract information from text
  no_spouses <- wikiproc:::getNoOfSpouses(article_content)

  ## Create one result row for this article
  data.frame(
    Name = title,
    NoSpouses = no_spouses,
    stringsAsFactors = FALSE
  )
})

# Stack the one-row data frames into a single table
# (do.call(rbind, list()) is NULL when there were no articles).
results <- do.call(rbind, results)

cat("Data processing finished.\n")

## Results are now in `results`

## Format for rasa ----

cat("Writing rasa files to 'rasa/'...\n")

# someFormatFunction(results)

# NOTE(review): this repeats the message printed after the processing step;
# it presumably was meant to announce the end of the rasa export -- confirm
# the intended wording once the export is implemented.
cat("Data processing finished.\n")