Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
master.R 1.89 KiB
#!/usr/bin/env Rscript

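### Master script: sets up NLP, imports the article data, processes every
### article and collects the extracted facts in a single results data frame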

library(pbapply)
library(wikiproc)
library(rprojroot)

## Set up nlp

# Initialise the NLP backend up front; the annotation step further down
# depends on it.
# NOTE(review): "spcy" is passed through unchanged -- presumably the name of
# the conda environment; confirm it is not a typo.
init_nlp("conda", "spcy")

## Fetch data

cat("Starting data import...\n")

# Define paths
# The project root is the closest directory containing a README.md, so the
# script does not depend on the current working directory.
project_root <- find_root(has_file("README.md"))
# file.path() is the base-R way to join path components; it replaces the
# manual paste(..., sep = .Platform$file.sep) construction.
data_dir <- file.path(project_root, "data")
rasa_dir <- file.path(project_root, "rasa")

# Load the articles; use.cache = TRUE is forwarded to get_data() together
# with the data directory.
articles <- get_data(use.cache = TRUE, data.dir = data_dir)

## Data processing

cat("Processing data:\n")

# Run the extraction pipeline once per row of `articles`. Each call returns a
# one-row data frame; the rows are combined into one data frame afterwards.
results <- pbapply(articles, 1, function(article) {
  # Within this function `article` is a vector representing a single row of
  # our original data frame and is accessed by position.
  # This means article[1] represents the Title, article[2] the PageID etc.

  ## Data cleaning

  # Fix: the cleaned text used to be stored as `cleaned.text` but read back as
  # `cleaned_text`, which was never defined and aborted the run with
  # "object 'cleaned_text' not found". One name is used for both now.
  cleaned_text <- clean_html(article[4])

  ## Data preprocessing/annotating

  annotation <- create_annotations(cleaned_text, article[2], article[3],
                                   data.dir = data_dir)

  ## Extract information from Text

  # NOTE(review): get_spouse() receives the raw article[4], not the cleaned
  # text -- confirm that this is intended.
  spouse_found <- get_spouse(article[4], annotation)
  awards <- get_awards(annotation)

  ## Create Results

  # Fields without an extractor yet are filled with NA so that every row has
  # the same set of columns.
  data.frame(Name = article[1],
             spouse = spouse_found,
             birthplace = NA,
             birthdate = NA,
             day_of_death = NA,
             place_of_death = NA,
             is_alive = NA,
             primary_education = NA,
             university = NA,
             area_of_research = NA,
             workplace = NA,
             awards = awards,
             stringsAsFactors = FALSE)

})

# Stack the per-article one-row data frames into a single data frame.
results <- do.call(rbind, results)

cat("Data processing finished.\n")

## Results are now in results

## Format for rasa