Code owners
Assign users and groups as approvers for specific file changes. Learn more.
master.R 1.89 KiB
#!/usr/bin/env Rscript
### This script consolidates everything
library(pbapply)
library(wikiproc)
library(rprojroot)
## Set up nlp
# Initialise the NLP backend; both arguments are passed through unchanged.
# NOTE(review): presumably "conda" selects the environment type and "spcy" the
# environment name - confirm against the init_nlp() documentation.
init_nlp("conda", "spcy")

## Fetch data
cat("Starting data import...\n")

# Define paths
# The project root is located by rprojroot, using a README.md file as marker.
project_root <- find_root(has_file("README.md"))
# file.path() is the base R helper for joining path components; it produces
# the same "/"-separated strings as the previous paste(..., sep = file.sep).
data_dir <- file.path(project_root, "data")
rasa_dir <- file.path(project_root, "rasa")

# Load the articles, allowing get_data() to use its cache inside data_dir.
articles <- get_data(use.cache = TRUE, data.dir = data_dir)
## Data processing
cat("Processing data:\n")

# Run the extraction pipeline on every row of `articles` (with a progress bar)
# and collect one data frame per article.
# NOTE(review): apply-style iteration over a data frame converts it to a
# matrix first, so every field presumably arrives as character - confirm the
# wikiproc helpers are fine with that.
results <- pbapply(articles, 1, function(article) {
  # Within this function `article` is a vector representing a single row of
  # the original data frame: article[1] is the Title, article[2] the PageID,
  # and so on.

  ## Data cleaning
  # Fix: this value used to be stored as `cleaned.text` while the next step
  # read `cleaned_text`, i.e. an undefined name. One spelling is used now.
  cleaned_text <- clean_html(article[4])

  ## Data preprocessing/annotating
  annotation <- create_annotations(
    cleaned_text, article[2], article[3],
    data.dir = data_dir
  )

  ## Extract information from Text
  # NOTE(review): get_spouse() is handed the raw article[4], not
  # cleaned_text - confirm this is intended.
  spouse_found <- get_spouse(article[4], annotation)
  awards <- get_awards(annotation)

  ## Create Results
  # Only spouse and awards are extracted here; all other fields are NA.
  data.frame(
    Name = article[1],
    spouse = spouse_found,
    birthplace = NA,
    birthdate = NA,
    day_of_death = NA,
    place_of_death = NA,
    is_alive = NA,
    primary_education = NA,
    university = NA,
    area_of_research = NA,
    workplace = NA,
    awards = awards,
    stringsAsFactors = FALSE
  )
})

# Stack the per-article data frames into a single data frame.
results <- do.call(rbind, results)
cat("Data processing finished.\n")
## Results are now in results
## Format for rasa