#!/usr/bin/env Rscript

### This script consolidates everything: it imports the articles, runs the
### cleaning / annotation / extraction steps on every article and writes the
### combined results as a TSV file into the project's rasa directory.

library(pbapply)
library(wikiproc)
library(rprojroot)

## Set up nlp
init_nlp("conda", "spcy")

## Fetch data
cat("Starting data import...\n")

# Define paths (all anchored at the project root, i.e. the directory that
# contains README.md, so the script does not depend on the working directory)
project_root <- find_root(has_file("README.md"))
data_dir <- file.path(project_root, "data")
rasa_dir <- file.path(project_root, "rasa")

articles <- get_data(use.cache = TRUE, data.dir = data_dir)

## Data processing
cat("Processing data:\n")

# NOTE(review): iterating with MARGIN = 1 hands each row to the function as a
# plain vector, so all fields presumably arrive coerced to a common type
# (likely character) - confirm the helpers below expect that.
results <- pbapply(articles, 1, function(article) {
  # Within this function article is a vector representing a single row of our
  # original data frame.
  # This means article[1] represents the Title, article[2] the PageID etc.

  ## Data cleaning
  cleaned_text <- clean_html(article[4])

  ## Data preprocessing/annotating
  annotation <- create_annotations(
    cleaned_text, article[2], article[3],
    data.dir = data_dir
  )

  ## Extract information from Text
  # NOTE(review): get_spouse() is given the uncleaned article[4] whereas the
  # other extractors receive cleaned_text - presumably intentional; confirm.
  spouse_found <- get_spouse(article[4], annotation)
  awards_found <- get_awards(annotation)
  university_found <- get_university(annotation)
  birthdate_found <- get_birthdate(cleaned_text, annotation)
  nationality_found <- get_nationality(cleaned_text, annotation)

  ## Create Results
  # One data frame per article; fields without an extractor yet stay NA.
  data.frame(
    name = article[1],
    spouse = spouse_found,
    nationality = nationality_found,
    birthdate = birthdate_found,
    day_of_death = NA,
    place_of_death = NA,
    is_alive = NA,
    primary_education = NA,
    university = university_found,
    area_of_research = NA,
    workplace = NA,
    awards = awards_found,
    stringsAsFactors = FALSE
  )
})

# Combine the per-article data frames into a single data frame
results <- do.call(rbind, results)
cat("Data processing finished.\n")

## Results are now in results

## Format for rasa
cat("Writing rasa files to 'rasa/'...\n")

# Create the output directory at the same location that is checked here and
# written to below (rasa_dir), not a "rasa" folder relative to the current
# working directory.
if (!dir.exists(rasa_dir)) {
  dir.create(rasa_dir)
}

# NA values are written as empty fields
write.table(
  results,
  file.path(rasa_dir, "data.tsv"),
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  na = ""
)
cat("Finished writing rasa files.\n")