#!/usr/bin/env Rscript

### Master.R: this script consolidates everything -- it fetches the data,
### processes every article and formats the results for rasa.

David Fuhry's avatar
David Fuhry committed
library(pbapply)
Lucas Schons's avatar
Lucas Schons committed
library(rvest)
library(wikiproc)
David Fuhry's avatar
David Fuhry committed

David Fuhry's avatar
David Fuhry committed
## Fetch data

cat("Starting data import...\n")

# Load the articles into `articles`; every later step reads from it.
# NOTE(review): judging by the argument names, this ignores any existing
# cache but writes a fresh one below `data.dir` -- confirm against getData().
# The data directory is given relative to the current working directory.
articles <- getData(
  use.cache = FALSE,
  write.cache = TRUE,
  data.dir = "../../data/"
)
David Fuhry's avatar
David Fuhry committed

## Data processing

cat("Processing data:\n")

# Process one article at a time and collect one single-row data frame
# (columns Name, NoSpouses) per article, showing a progress bar meanwhile.
#
# Rows are addressed by index instead of `pbapply(articles, 1, ...)`:
# apply() first coerces the whole data frame to a character matrix, which
# reformats non-character columns and hands the callback a named vector
# whose element names leak into the row names of the result.
#
# Column positions: 1 = Title, 2 = PageID; column 4 is the value handed to
# cleanHtml() and getNoOfSpouses().
results <- pblapply(seq_len(nrow(articles)), function(i) {
  # as.character() keeps the values plain character, as they were after the
  # matrix coercion done by apply().
  title <- as.character(articles[[1L]][i])
  text <- as.character(articles[[4L]][i])

  ## Data cleaning

  # Currently only consumed by the (disabled) annotation step below.
  cleaned.text <- cleanHtml(text)

  ## Data preprocessing/annotating

  # annotation <- createAnnotations(cleaned.text, articles[[2L]][i], articles[[3L]][i])

  ## Extract information from Text

  no.spouses <- getNoOfSpouses(text)

  ## Create Results

  data.frame(Name = title,
             NoSpouses = no.spouses,
             stringsAsFactors = FALSE)
})

# Stack the per-article rows into a single data frame.
results <- do.call(rbind, results)

cat("Data processing finished.\n")

David Fuhry's avatar
David Fuhry committed
## Results are now in `results`

## Format for rasa

cat("Writing rasa files to 'rasa/'...\n")

# TODO: placeholder -- no formatting/export function is wired in yet, so
# nothing is written at this point.
# someFormatFunction(results)

# Report the end of the export step. The previous text repeated the
# "Data processing finished." message that is already printed right after
# the processing step, which made the log misleading.
cat("Rasa export finished.\n")