#!/usr/bin/env Rscript

### This script consolidates everything:
### it sets up the NLP backend, imports the article data, processes every
### article and finally hands the results over to the rasa formatting step.

library(pbapply)
library(wikiproc)
library(rprojroot)

## Set up nlp

# NOTE(review): presumably "conda" selects the environment type and "spcy" is
# the name of the conda environment to use -- confirm against init_nlp().
init_nlp("conda", "spcy")

## Fetch data

cat("Starting data import...\n")

# Define paths
# The project root is located by searching for a directory that contains
# README.md, so the data directory does not depend on a hard-coded path.
project_root <- find_root(has_file("README.md"))
data_dir <- file.path(project_root, "data")

# Load the articles, allowing get_data() to use its cache in data_dir.
articles <- get_data(use.cache = TRUE, data.dir = data_dir)

## Data processing

cat("Processing data:\n")

# Process the articles one row at a time while showing a progress bar.
# Every call yields a one-row data frame; these are combined with rbind below.
results <- pbapply(articles, 1, function(article) {
  # Within this function article is a vector representing a single row of our
  # original data frame (apply-style iteration coerces the row to a vector, so
  # the original column types are not preserved).
  # This means article[1] represents the Title, article[2] the PageID etc.
  # NOTE(review): article[3] and article[4] are presumably the revision id and
  # the article HTML -- confirm against the columns returned by get_data().

  ## Data cleaning

  cleaned_text <- wikiproc:::cleanHtml(article[4])

  ## Data preprocessing/annotating

  # The return value is not used further down; the call is kept because it is
  # given data_dir and may persist its output there -- TODO confirm.
  annotation <- create_annotations(cleaned_text, article[2], article[3], data.dir = data_dir)

  ## Extract information from Text

  # Note: this works on the original text in article[4], not on cleaned_text.
  spouse_count <- get_no_of_spouses(article[4])

  ## Create Results

  data.frame(Name = article[1],
             NoSpouses = spouse_count,
             stringsAsFactors = FALSE)
})

# Collapse the per-article results into a single data frame.
results <- do.call(rbind, results)

cat("Data processing finished.\n")

## Results are now in results

## Format for rasa

cat("Writing rasa files to 'rasa/'...\n")

# TODO: the formatting step is still a placeholder; nothing is written yet.
# someFormatFunction(results)

# Report the end of the rasa stage (previously this repeated the
# "Data processing finished." message of the processing stage).
cat("Finished writing rasa files.\n")