diff --git a/processing/bin/ProcessNER.R b/processing/bin/ProcessNER.R
deleted file mode 100644
index 775ffb17ffb198b7707bfd51b8e23880cbf2cddd..0000000000000000000000000000000000000000
--- a/processing/bin/ProcessNER.R
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env Rscript
-
-### Provides functionality to use NER, POS and Dependency Grammars
-
-## Author: David
-
-cat("Initializing spacy backend...\n")
-
-# It's important to do this prior to loading any python related stuff
-
-reticulate::use_condaenv("spcy", required = TRUE)
-
-# Load librarys
-
-library(cleanNLP)
-
-# Init nlp models
-
-cnlp_init_spacy(entity_flag = TRUE)
-
-cat("Done.\n")
diff --git a/processing/bin/Master.R b/processing/script/master.R
similarity index 81%
rename from processing/bin/Master.R
rename to processing/script/master.R
index 0f476101b3c5fffeff755cd0623afd3947df3141..6e8121bef24989d341c1abf34c0017dd079a992c 100755
--- a/processing/bin/Master.R
+++ b/processing/script/master.R
@@ -3,10 +3,13 @@
 ### This script consolidates everything
 
 library(pbapply)
-library(rvest)
 library(wikiproc)
 library(rprojroot)
 
+## Set up nlp
+
+init_nlp("conda", "spcy")
+
 ## Fetch data
 
 cat("Starting data import...\n")
@@ -15,7 +18,7 @@ cat("Starting data import...\n")
 project_root <- find_root(has_file("README.md"))
 data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
 
-articles <- wikiproc::getData(use.cache = TRUE, data.dir = data_dir)
+articles <- get_data(use.cache = TRUE, data.dir = data_dir)
 
 ## Data processing
 
@@ -31,11 +34,11 @@ results <- pbapply(articles, 1, function(article) {
 
   ## Data preprocessing/annotating
 
-  # annotation <- createAnnotations(cleaned.text, article[2], article[3])
+  annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)
 
   ## Extract information from Text
 
-  no.spouses <- wikiproc::getNoOfSpouses(article[4])
+  no.spouses <- get_no_of_spouses(article[4])
 
   ## Create Results
 
diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
index f5ebafbf596ac4f41f745b7c83baff9b7edd80d3..5e2056db2f27b4dfd24924442b85fe72621177c8 100644
--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -1,8 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
-export(cleanHtml)
-export(createAnnotations)
-export(getBirthdate)
-export(getBirthplace)
-export(getData)
-export(getNoOfSpouses)
+export(create_annotations)
+export(get_data)
+export(get_no_of_spouses)
+export(init_nlp)
+import(rvest)
+importFrom(data.table,"%like%")
+importFrom(xml2,read_html)
+importFrom(xml2,xml_add_sibling)
+importFrom(xml2,xml_find_all)
+importFrom(xml2,xml_remove)
diff --git a/processing/wikiproc/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R
index b78b74f8e8a0b0badfc313e729e39659236f6463..e541a1d86aa0aac0e65e806ca0d5a4e5ff1c98cc 100644
--- a/processing/wikiproc/R/CleanHtml.R
+++ b/processing/wikiproc/R/CleanHtml.R
@@ -2,11 +2,6 @@
 
 # Author: Lucas
 
-library(rvest)
-library(stringi)
-library(textclean)
-library(xml2)
-
 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
diff --git a/processing/wikiproc/R/GetData.R b/processing/wikiproc/R/get_data.R
similarity index 95%
rename from processing/wikiproc/R/GetData.R
rename to processing/wikiproc/R/get_data.R
index 2300cb676ad71130d2b789bd97cf58803a513453..d5dc1411e7b139563145a729371efb746d66b622 100644
--- a/processing/wikiproc/R/GetData.R
+++ b/processing/wikiproc/R/get_data.R
@@ -2,10 +2,6 @@
 
 # Author: David
 
-library(WikipediR) # For querying wikipedia
-library(rvest) # For getting the list of physicits
-library(xml2)
-
 ## Though we could get the pages within the category 'physicists' with something like this
 ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
 ## this gives us only about 50 pages.
@@ -19,7 +15,8 @@
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+#' @export
+get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 
   dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
   dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
diff --git a/processing/wikiproc/R/get_no_of_spouses.R b/processing/wikiproc/R/get_no_of_spouses.R
new file mode 100755
index 0000000000000000000000000000000000000000..c0fb31eeec13d9e4724da79a6ec7a3a35253e40b
--- /dev/null
+++ b/processing/wikiproc/R/get_no_of_spouses.R
@@ -0,0 +1,43 @@
+### get_no_of_spouses.R
+### Extracts the number of spouses from the infobox.
+### If no infobox or no information about spouses is found, it assumes there are none.
+### Not for use in production; this does not actually get information from text.
+
+# Author: David
+
+#' Reads the number of spouses from the infobox of a Wikipedia article
+#'
+#' @param article Wikipedia article in html format
+#'
+#' @return Integer indicating the number of spouses
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' no.spouses <- get_no_of_spouses(articles$Text[54])
+#'
+#' no.spouses
+#' }
+get_no_of_spouses <- function(article) {
+
+  # If there is no infobox we assume there were no spouses
+  if (!grepl("vcard", article)) {
+    return(0)
+  }
+
+  infoBox <- get_infobox(article)
+
+  # Get the spouse field
+  spouses <- infoBox[infoBox$Desc %like% "Spouse", ]$Content
+  # Remove everything in parentheses
+  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
+  # Split the strings by newlines to get one spouse per line
+  spouses <- strsplit(spouses, "\n")
+  spouses <- unlist(spouses)
+  if (length(spouses) > 0) {
+    return(length(spouses))
+  }
+  return(0)
+}
diff --git a/processing/wikiproc/R/import_packages.R b/processing/wikiproc/R/import_packages.R
new file mode 100644
index 0000000000000000000000000000000000000000..03025a3e2cbb9f89380516124e74c48f8b4b95c2
--- /dev/null
+++ b/processing/wikiproc/R/import_packages.R
@@ -0,0 +1,11 @@
+### File used to automatically create package imports with roxygen2
+### Note that it is discouraged to import many packages fully, to avoid name conflicts
+### If possible, reference functions directly, e.g. reshape2::melt()
+### There is a (very) minor performance penalty for ::;
+### if some functions are used frequently you may just import them
+### with something like @importFrom reshape2 melt cast
+
+#' @import rvest
+#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
+#' @importFrom data.table %like%
+NULL
\ No newline at end of file
diff --git a/processing/wikiproc/R/nlp_annotate.R b/processing/wikiproc/R/nlp_annotate.R
new file mode 100644
index 0000000000000000000000000000000000000000..c83921ec93dcc21b20cbebec03c1b458a2bd8609
--- /dev/null
+++ b/processing/wikiproc/R/nlp_annotate.R
@@ -0,0 +1,74 @@
+#' Initialize the nlp backend
+#'
+#' A wrapper used to set the python environment and call cnlp_init
+#'
+#' @param type Type of python env to use, either "conda" or "python"
+#' @param value Connection string: if using a conda environment, its name;
+#'   if using python directly, the path to the python executable
+#'
+#' @return Does not return data
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' init_nlp("conda", "spcy")
+#' }
+init_nlp <- function(type, value) {
+  if (type == "conda") {
+    reticulate::use_condaenv(value, required = TRUE)
+  } else if (type == "python") {
+    reticulate::use_python(value, required = TRUE)
+  }
+  cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
+}
+
+#' Create annotations for the given text
+#'
+#' @param text Text to annotate
+#' @param article.id Article ID used for caching
+#' @param article.rev.id Article revision ID used for caching
+#' @param use.cache Should cached data be used
+#' @param write.cache Should the generated annotations be cached
+#' @param data.dir Directory the data should be read from and/or written to
+#'
+#' @return Annotation object for use with cleanNLP methods
+#' @export
+create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+
+  # Generate the cache filename; paste0 may pad the article id with leading whitespace
+  # (likely because apply() format-pads numbers when coercing rows), so strip spaces again
+
+  filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
+
+  # Check if there is a cached version of the annotations for this article in this specific revision
+
+  if (use.cache && file.exists(filename)) {
+    res <- tryCatch({
+      readRDS(filename)
+    }, error = function(e) {
+      cat("Cached data seems to be corrupted, redoing annotation.\n")
+      NULL
+    })
+    if (!is.null(res)) return(res)
+  }
+
+  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
+
+  # Write cache if desired
+
+  if (write.cache) {
+    if (!dir.exists(data.dir)) {
+      dir.create(data.dir)
+    }
+    if (!dir.exists(paste(data.dir, "annotations", sep = .Platform$file.sep))) {
+      dir.create(paste(data.dir, "annotations", sep = .Platform$file.sep))
+    }
+    saveRDS(annotation, filename)
+  }
+
+  # Return data
+  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
+  # But then again, it suggests snake case for variables...
+
+  return(annotation)
+}
\ No newline at end of file
diff --git a/processing/wikiproc/R/utils.R b/processing/wikiproc/R/utils.R
new file mode 100644
index 0000000000000000000000000000000000000000..a518f16f9fa62497250c8d64f87cf9dc2bb012ab
--- /dev/null
+++ b/processing/wikiproc/R/utils.R
@@ -0,0 +1,43 @@
+### Utility functions used internally
+
+
+#' Extract the infobox contents from Wikipedia articles
+#'
+#' @param article Character vector containing the contents of a Wikipedia
+#'   article as html
+#'
+#' @return Data frame holding the contents of the table
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' infobox <- get_infobox(articles$Text[54])
+#'
+#' infobox[3:4, ]
+#' }
+get_infobox <- function(article) {
+  # Read page as html
+  page <- read_html(article)
+
+  # Extracting text from the html will erase all <br> tags;
+  # this replaces them with line breaks
+
+  xml_find_all(page, ".//br") %>%
+    xml_add_sibling("p", "\n")
+
+  xml_find_all(page, ".//br") %>%
+    xml_remove()
+
+  # Get the info box
+  # Will throw an error if there isn't any, so that should be checked beforehand
+
+  table <- page %>%
+    html_nodes("table.vcard") %>%
+    html_table(fill = TRUE) %>%
+    .[[1]]
+
+  colnames(table) <- c("Desc", "Content")
+
+  return(table)
+}
diff --git a/processing/wikiproc/man/create_annotations.Rd b/processing/wikiproc/man/create_annotations.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..305b279d56dea9811b9c147f0912a005bb1f808e
--- /dev/null
+++ b/processing/wikiproc/man/create_annotations.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\name{create_annotations}
+\alias{create_annotations}
+\title{Create annotations for the given text}
+\usage{
+create_annotations(text, article.id, article.rev.id, use.cache = TRUE,
+  write.cache = FALSE, data.dir = "data")
+}
+\arguments{
+\item{text}{Text to annotate}
+
+\item{article.id}{Article ID used for caching}
+
+\item{article.rev.id}{Article revision ID used for caching}
+
+\item{use.cache}{Should cached data be used}
+
+\item{write.cache}{Should the generated annotations be cached}
+
+\item{data.dir}{Directory the data should be read from and/or written to}
+}
+\value{
+Annotation object for use with cleanNLP methods
+}
+\description{
+Create annotations for the given text
+}
diff --git a/processing/wikiproc/man/getData.Rd b/processing/wikiproc/man/get_data.Rd
similarity index 77%
rename from processing/wikiproc/man/getData.Rd
rename to processing/wikiproc/man/get_data.Rd
index ec865807ac52c5cc079ccda9998e01632f97e969..cec7d173ad8abf28d0811a9f6ed2a00cc3f77b13 100644
--- a/processing/wikiproc/man/getData.Rd
+++ b/processing/wikiproc/man/get_data.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/GetData.R
-\name{getData}
-\alias{getData}
+% Please edit documentation in R/get_data.R
+\name{get_data}
+\alias{get_data}
 \title{Retrieve wikipedia articles about physicists}
 \usage{
-getData(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
+get_data(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
 }
 \arguments{
 \item{use.cache}{Use cached data if it exists over downloading new data}
diff --git a/processing/wikiproc/man/get_infobox.Rd b/processing/wikiproc/man/get_infobox.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..ef8d03180df7bce6c794ee1f7377e22de96c06af
--- /dev/null
+++ b/processing/wikiproc/man/get_infobox.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{get_infobox}
+\alias{get_infobox}
+\title{Extract the infobox contents from Wikipedia articles}
+\usage{
+get_infobox(article)
+}
+\arguments{
+\item{article}{Character vector containing the contents of a Wikipedia
+article as html}
+}
+\value{
+Data frame holding the contents of the table
+}
+\description{
+Extract the infobox contents from Wikipedia articles
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+infobox <- get_infobox(articles$Text[54])
+
+infobox[3:4, ]
+}
+}
diff --git a/processing/wikiproc/man/get_no_of_spouses.Rd b/processing/wikiproc/man/get_no_of_spouses.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..131c526cb56e28994b3295be2ada26343ecfb103
--- /dev/null
+++ b/processing/wikiproc/man/get_no_of_spouses.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_no_of_spouses.R
+\name{get_no_of_spouses}
+\alias{get_no_of_spouses}
+\title{Reads the number of spouses from the infobox of a Wikipedia article}
+\usage{
+get_no_of_spouses(article)
+}
+\arguments{
+\item{article}{Wikipedia article in html format}
+}
+\value{
+Integer indicating the number of spouses
+}
+\description{
+Reads the number of spouses from the infobox of a Wikipedia article
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+no.spouses <- get_no_of_spouses(articles$Text[54])
+
+no.spouses
+}
+}
diff --git a/processing/wikiproc/man/init_nlp.Rd b/processing/wikiproc/man/init_nlp.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..47644aaed461189eefcea0cfaf43dfcbcb1f045e
--- /dev/null
+++ b/processing/wikiproc/man/init_nlp.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\name{init_nlp}
+\alias{init_nlp}
+\title{Initialize the nlp backend}
+\usage{
+init_nlp(type, value)
+}
+\arguments{
+\item{type}{Type of python env to use, either "conda" or "python"}
+
+\item{value}{Connection string: if using a conda environment, its name;
+if using python directly, the path to the python executable}
+}
+\value{
+Does not return data
+}
+\description{
+A wrapper used to set the python environment and call cnlp_init
+}
+\examples{
+\dontrun{
+init_nlp("conda", "spcy")
+}
+}
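
Taken together, the patch replaces the old ProcessNER.R bootstrap with the exported init_nlp() and renames the camelCase API to snake_case. Below is a minimal end-to-end usage sketch of the refactored calls, assuming the "spcy" conda environment from the diff exists; the $Text column is taken from the roxygen examples, while the id and revision-id columns are addressed positionally (columns 2 and 3), mirroring the article[2]/article[3] indices in master.R:

    library(wikiproc)

    # Point reticulate at the conda env and initialize the cleanNLP/spacy backend
    init_nlp("conda", "spcy")

    # Fetch articles (title, id, revision id, html full text), using the cache if present
    articles <- get_data(use.cache = TRUE, data.dir = "data")

    # Annotate one article and cache the result under <data.dir>/annotations/
    annotation <- create_annotations(articles$Text[54], articles[54, 2],
                                     articles[54, 3], write.cache = TRUE)

    # Rough spouse count read from the infobox (explicitly not text-based extraction)
    no.spouses <- get_no_of_spouses(articles$Text[54])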