#!/usr/bin/env Rscript

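### This script runs the whole pipeline: NLP setup, data import,
### per-article processing and (eventually) export of the results for rasa.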

library(pbapply)
library(wikiproc)
library(rprojroot)

## Set up nlp

# Initialise the NLP backend before any annotation is requested.
# NOTE(review): the arguments presumably select a conda environment named
# "spcy" -- confirm against the init_nlp() documentation.
init_nlp("conda", "spcy")

## Fetch data

cat("Starting data import...\n")

# Define paths: the project root is located via its README.md, and all data
# lives in the "data" directory directly below it.
project_root <- find_root(has_file("README.md"))
data_dir <- file.path(project_root, "data")

# Load the articles; use.cache = TRUE is passed so get_data() may reuse data
# already stored under data_dir (exact caching behaviour is defined by
# get_data() -- TODO confirm).
articles <- get_data(use.cache = TRUE, data.dir = data_dir)

## Data processing

cat("Processing data:\n")

# Process the articles one row at a time and collect one result row per
# article. We iterate over row indices instead of calling apply() on the data
# frame: apply() coerces the frame to a character matrix, which pads numeric
# columns (e.g. the PageID) with blanks and hands every cell over as a *named*
# vector, so the result rows ended up carrying the first column's name as
# their row name.
results <- pblapply(seq_len(nrow(articles)), function(i) {
  # Column order of `articles`: 1 = Title, 2 = PageID, etc.
  # Every cell is read as a plain, unnamed character scalar, which is the type
  # the original matrix-based code passed on (minus padding and names).
  field <- function(j) as.character(articles[[i, j]])

  ## Data cleaning

  cleaned.text <- wikiproc:::cleanHtml(field(4))

  ## Data preprocessing/annotating

  # NOTE(review): `annotation` is not used further down yet; the call is kept
  # because create_annotations() receives data.dir and may persist its output
  # there -- confirm before removing.
  annotation <- create_annotations(cleaned.text, field(2), field(3),
                                   data.dir = data_dir)

  ## Extract information from Text

  # NOTE(review): this passes the raw text (column 4), not cleaned.text or the
  # annotation -- verify that this is what get_no_of_spouses() expects.
  no.spouses <- get_no_of_spouses(field(4))

  ## Create Results

  data.frame(Name = field(1),
             NoSpouses = no.spouses,
             stringsAsFactors = FALSE)
})

# Stack the per-article rows. With no articles there is nothing to bind, so
# fall back to an empty frame with the same columns instead of NULL.
# NOTE(review): NoSpouses is assumed to be an integer count -- confirm.
results <- if (length(results) > 0L) {
  do.call(rbind, results)
} else {
  data.frame(Name = character(0),
             NoSpouses = integer(0),
             stringsAsFactors = FALSE)
}

cat("Data processing finished.\n")

## Results are now in results

## Format for rasa

cat("Writing rasa files to 'rasa/'...\n")

# TODO: the export call below is still commented out, so nothing is written to
# 'rasa/' yet.
# someFormatFunction(results)

# Report the end of the rasa step; the previous text repeated the
# "Data processing finished." message of the processing stage.
cat("Finished writing rasa files.\n")