Newer
Older
library(wikiproc)
library(rprojroot)
# Define paths
# The project root is resolved by rprojroot using the README.md criterion.
project_root <- find_root(has_file("README.md"))
# file.path() is the base-R helper for joining path components; it replaces
# the manual paste(..., sep = .Platform$file.sep) construction.
data_dir <- file.path(project_root, "data")
rasa_dir <- file.path(project_root, "rasa")
# Fetch the articles, passing data_dir as the data directory and enabling
# the cache option (use.cache = TRUE).
articles <- get_data(use.cache = TRUE, data.dir = data_dir)
# Within this function, `article` is a vector representing a single row of our original data frame.
# This means article[1] represents the Title, article[2] the PageID etc.
# NOTE(review): the enclosing function header is not visible in this excerpt.
# Clean the fourth field of the row (presumably the raw HTML text; TODO confirm).
cleaned_text <- clean_html(article[4])
# Build annotations from the cleaned text; article[2] (PageID) and article[3]
# (meaning not shown here; TODO confirm) are passed along, with data_dir as data.dir.
annotation <- create_annotations(cleaned_text, article[2], article[3], data.dir = data_dir)
# Run the individual fact extractors on the annotations.
# NOTE(review): get_spouse receives the uncleaned article[4], whereas
# get_birthdate and get_nationality receive cleaned_text; confirm this is intentional.
spouse_found <- get_spouse(article[4], annotation)
awards_found <- get_awards(annotation)
university_found <- get_university(annotation)
birthdate_found <- get_birthdate(cleaned_text, annotation)
nationality_found <- get_nationality(cleaned_text, annotation)
spouse = spouse_found,
nationality = nationality_found,
day_of_death = NA,
place_of_death = NA,
is_alive = NA,
primary_education = NA,
# Export `results` as a tab-separated file inside rasa_dir.
cat("Writing rasa files to 'rasa/'...\n")
# Create exactly the directory that is checked and written to. A bare "rasa"
# would be resolved against the current working directory, which is not
# necessarily the project root, leaving rasa_dir missing for write.table().
if (!dir.exists(rasa_dir)) {
  dir.create(rasa_dir, recursive = TRUE)
}
# Unquoted, tab-separated, without row names; NA values become empty fields.
write.table(results,
  file.path(rasa_dir, "data.tsv"),
  quote = FALSE, sep = "\t", row.names = FALSE,
  na = ""
)