#!/usr/bin/env Rscript

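### This script runs the whole pipeline: it imports the articles, extracts
### facts from each of them and writes the results as a TSV file for rasa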

library(pbapply)
library(wikiproc)
library(rprojroot)

## Set up nlp

# Initialise the NLP backend before any annotation is requested.
# NOTE(review): "spcy" is presumably the name of the conda environment that
# init_nlp() should use - confirm it matches the project's setup instructions.
init_nlp("conda", "spcy")

## Fetch data

cat("Starting data import...\n")

# Define paths
# The project root is the directory found by rprojroot that contains a
# README.md; the data and rasa directories are resolved relative to it.
project_root <- find_root(has_file("README.md"))
data_dir <- file.path(project_root, "data")
rasa_dir <- file.path(project_root, "rasa")

# Load the articles, allowing get_data() to use its cache in data_dir.
articles <- get_data(use.cache = TRUE, data.dir = data_dir)

## Data processing

cat("Processing data:\n")

# Process one article per row of `articles` and collect one single-row data
# frame per article.
#
# The rows are visited by index rather than with apply(): apply() coerces a
# data frame to a character matrix first, which format()s every non-character
# column and can pad its values with blanks (e.g. numeric ids of different
# lengths). Reading each field directly from its column avoids that, and an
# empty `articles` simply yields no results instead of one call on dummy data.
results <- pblapply(seq_len(nrow(articles)), function(i) {
  # Columns are addressed by position: column 1 is the Title, column 2 the
  # PageID. All fields are passed on as character values, as before.
  title <- as.character(articles[[1]][i])
  page_id <- as.character(articles[[2]][i])
  # NOTE(review): column 3 is forwarded to create_annotations() unchanged;
  # presumably a revision id - confirm against get_data().
  third_field <- as.character(articles[[3]][i])
  raw_content <- as.character(articles[[4]][i])

  ## Data cleaning

  cleaned_text <- clean_html(raw_content)

  ## Data preprocessing/annotating

  annotation <- create_annotations(cleaned_text, page_id, third_field,
                                   data.dir = data_dir)

  ## Extract information from Text

  # NOTE(review): get_spouse() receives the uncleaned content while the other
  # text-based extractors receive cleaned_text - confirm this is intended.
  spouse_found <- get_spouse(raw_content, annotation)
  awards_found <- get_awards(annotation)
  university_found <- get_university(annotation)
  birthdate_found <- get_birthdate(cleaned_text, annotation)
  nationality_found <- get_nationality(cleaned_text, annotation)

  ## Create Results

  # Fields without an extractor yet are emitted as NA so that the output
  # always has the full set of columns.
  data.frame(name = title,
             spouse = spouse_found,
             nationality = nationality_found,
             birthdate = birthdate_found,
             day_of_death = NA,
             place_of_death = NA,
             is_alive = NA,
             primary_education = NA,
             university = university_found,
             area_of_research = NA,
             workplace = NA,
             awards = awards_found,
             stringsAsFactors = FALSE)

})

# Stack the per-article rows into a single data frame.
results <- do.call(rbind, results)

cat("Data processing finished.\n")

## Results are now in results

## Format for rasa

cat("Writing rasa files to 'rasa/'...\n")

# Create the output directory at the very path that is checked here and
# written to below. Creating a relative "rasa" directory instead would only
# be correct when the script is started from the project root.
if (!dir.exists(rasa_dir)) {
  dir.create(rasa_dir, recursive = TRUE)
}

# Tab-separated, unquoted, without row names; missing values become empty
# fields.
write.table(results,
            file.path(rasa_dir, "data.tsv"),
            quote = FALSE, sep = "\t", row.names = FALSE,
            na = "")

cat("Writing rasa files finished.\n")