#!/usr/bin/env Rscript

### Pipeline driver script.
###
### Steps, in order:
###   1. Initialise the NLP backend via init_nlp().
###   2. Load the articles via get_data() (cache enabled) from <root>/data.
###   3. For every article row: clean the HTML, create annotations, and
###      count spouses.
###   4. Bind the per-article one-row data frames into `results`.
###   5. (Not implemented yet) export `results` for rasa.

library(pbapply)
library(wikiproc)
library(rprojroot)

## Set up nlp ----
init_nlp("conda", "spcy")

## Fetch data ----
cat("Starting data import...\n")

# Define paths: the project root is the directory that contains README.md.
project_root <- find_root(has_file("README.md"))
data_dir <- file.path(project_root, "data")

articles <- get_data(use.cache = TRUE, data.dir = data_dir)

## Data processing ----
cat("Processing data:\n")

# Within the callback, `article` is a vector representing a single row of
# `articles`: article[1] is the Title, article[2] the PageID, etc.
# NOTE(review): apply-style row iteration over a data frame goes through a
# matrix, so every field presumably arrives as the same atomic type
# (character for mixed columns) -- confirm the helpers below expect that.
results <- pbapply(articles, 1, function(article) {
  ## Data cleaning
  cleaned_text <- wikiproc:::cleanHtml(article[4])

  ## Data preprocessing/annotating
  # NOTE(review): `annotation` is not consumed anywhere in this script yet;
  # the call is kept because create_annotations() receives `data.dir` and may
  # persist its output there -- TODO confirm before removing.
  annotation <- create_annotations(
    cleaned_text,
    article[2],
    article[3],
    data.dir = data_dir
  )

  ## Extract information from text
  # Operates on the raw article[4], not on `cleaned_text`.
  no_spouses <- get_no_of_spouses(article[4])

  ## Create results: one row per article
  data.frame(
    Name = article[1],
    NoSpouses = no_spouses,
    stringsAsFactors = FALSE
  )
})

# Stack the per-article one-row data frames into a single data frame.
results <- do.call(rbind, results)
cat("Data processing finished.\n")

## Results are now in `results`

## Format for rasa ----
# NOTE(review): the export call below is still commented out, so nothing is
# written to 'rasa/' even though the message says so.
cat("Writing rasa files to 'rasa/'...\n")
# someFormatFunction(results)

# NOTE(review): this repeats the message printed after the processing step;
# it looks like a copy-paste leftover -- confirm the intended wording.
cat("Data processing finished.\n")