Skip to content
Snippets Groups Projects
master.R 2.11 KiB
Newer Older
David Fuhry's avatar
David Fuhry committed
#!/usr/bin/env Rscript

David Fuhry's avatar
David Fuhry committed
### This script consolidates everything

David Fuhry's avatar
David Fuhry committed
library(pbapply)
library(wikiproc)
library(rprojroot)
## Set up nlp
David Fuhry's avatar
David Fuhry committed

# Initialise the NLP backend before any annotation is created.
# NOTE(review): the arguments look like an environment type ("conda") and an
# environment name ("spcy") - confirm "spcy" is the intended name and not a
# typo for "spacy".
init_nlp("conda", "spcy")
David Fuhry's avatar
David Fuhry committed

David Fuhry's avatar
David Fuhry committed
## Fetch data
David Fuhry's avatar
David Fuhry committed

David Fuhry's avatar
David Fuhry committed
cat("Starting data import...\n")

# Define paths
# The project root is the directory matched by the "contains README.md"
# criterion; every other directory is derived from it.
project_root <- find_root(has_file("README.md"))
# file.path() joins the components with the platform file separator, which is
# what the previous paste(..., sep = .Platform$file.sep) calls did by hand.
data_dir <- file.path(project_root, "data")
rasa_dir <- file.path(project_root, "rasa")

# Load the articles to process.
# NOTE(review): use.cache = TRUE presumably lets get_data() reuse data already
# stored under data_dir - confirm against the wikiproc documentation.
articles <- get_data(use.cache = TRUE, data.dir = data_dir)
David Fuhry's avatar
David Fuhry committed

## Data processing

David Fuhry's avatar
David Fuhry committed
cat("Processing data:\n")

# Process the articles one row at a time and collect one result data frame per
# article.
#
# The rows are visited by index via pblapply() instead of calling apply() on
# the data frame: apply() converts the data frame with as.matrix(), which runs
# format() on non-character columns and can therefore pad values such as
# numeric IDs with blanks before they reach the functions below.
results <- pblapply(seq_len(nrow(articles)), function(row) {
  # Fetch a single cell of the current row as a plain character value.
  # Columns are addressed by position: 1 is the Title, 2 the PageID.
  # NOTE(review): column 3 is presumably the revision ID and column 4 the raw
  # article HTML - confirm against get_data().
  field <- function(col) as.character(articles[[col]][row])

  ## Data cleaning
  cleaned_text <- clean_html(field(4L))

  ## Data preprocessing/annotating
  annotation <- create_annotations(cleaned_text, field(2L), field(3L),
                                   data.dir = data_dir)

  ## Extract information from Text
  # NOTE(review): spouse_found is computed but never written to the result
  # below, and get_spouse() is the only extractor that receives the raw text
  # of column 4 instead of cleaned_text - confirm whether both are intended.
  spouse_found <- get_spouse(field(4L), annotation)
  awards_found <- get_awards(annotation)
  university_found <- get_university(annotation)
  birthdate_found <- get_birthdate(cleaned_text, annotation)
  nationality_found <- get_nationality(cleaned_text, annotation)

  ## Create Results
  # Fields without an extractor yet are filled with NA.
  data.frame(name = field(1L),
             nationality = nationality_found,
             birthdate = birthdate_found,
             day_of_death = NA,
             place_of_death = NA,
             is_alive = NA,
             primary_education = NA,
             university = university_found,
             area_of_research = NA,
             workplace = NA,
             awards = awards_found,
             stringsAsFactors = FALSE)
})

# Stack the per-article data frames into a single data frame
results <- do.call(rbind, results)

cat("Data processing finished.\n")

David Fuhry's avatar
David Fuhry committed
## Results are now in results

## Format for rasa
David Fuhry's avatar
David Fuhry committed

cat("Writing rasa files to 'rasa/'...\n")

# Create the output directory at the very path that is checked and written to.
# A bare dir.create("rasa") would create it relative to the current working
# directory, which is a different place whenever the script is not started
# from the project root.
if (!dir.exists(rasa_dir)) {
  dir.create(rasa_dir, recursive = TRUE)
}

# Write the results as a tab-separated file without quoting and without row
# names; NA values become empty fields.
write.table(results,
            file.path(rasa_dir, "data.tsv"),
            quote = FALSE, sep = "\t", row.names = FALSE,
            na = "")

# Report the end of the write step (this used to repeat the
# "Data processing finished." message of the previous step).
cat("Rasa files written.\n")