diff --git a/processing/bin/ProcessNER.R b/processing/bin/ProcessNER.R
deleted file mode 100644
index 775ffb17ffb198b7707bfd51b8e23880cbf2cddd..0000000000000000000000000000000000000000
--- a/processing/bin/ProcessNER.R
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env Rscript
-
-### Provides functionality to use NER, POS and Dependency Grammars
-
-## Author: David
-
-cat("Initializing spacy backend...\n")
-
-# It's important to do this prior to loading any python related stuff
-
-reticulate::use_condaenv("spcy", required = TRUE)
-
-# Load librarys
-
-library(cleanNLP)
-
-# Init nlp models
-
-cnlp_init_spacy(entity_flag = TRUE)
-
-cat("Done.\n")
diff --git a/processing/bin/Master.R b/processing/script/master.R
similarity index 81%
rename from processing/bin/Master.R
rename to processing/script/master.R
index 8e338d5d65a3ada707fbabdd63d3e6f8b8a620fe..22cd34728e20a5eb1906d2a562431af670a3c2ca 100755
--- a/processing/bin/Master.R
+++ b/processing/script/master.R
@@ -3,10 +3,13 @@
 ### This script consolidates everything
 
 library(pbapply)
-library(rvest)
 library(wikiproc)
 library(rprojroot)
 
+## Set up nlp
+
+init_nlp("conda", "spcy")
+
 ## Fetch data
 
 cat("Starting data import...\n")
@@ -15,7 +18,7 @@ cat("Starting data import...\n")
 
 project_root <- find_root(has_file("README.md"))
 data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
 
-articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)
+articles <- get_data(use.cache = TRUE, data.dir = data_dir)
 
 ## Data processing
@@ -31,11 +34,11 @@ results <- pbapply(articles, 1, function(article) {
 
   ## Data preprocessing/annotating
 
-  # annotation <- createAnnotations(cleaned.text, article[2], article[3])
+  annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)
 
   ## Extract information from Text
 
-  no.spouses <- wikiproc:::getNoOfSpouses(article[4])
+  no.spouses <- get_no_of_spouses(article[4])
 
   ## Create Results
diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
index 6ae926839dd1829f1016a96f766d970ff184ad97..5e2056db2f27b4dfd24924442b85fe72621177c8 100644
--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -1,2 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
+export(create_annotations)
+export(get_data)
+export(get_no_of_spouses)
+export(init_nlp)
+import(rvest)
+importFrom(data.table,"%like%")
+importFrom(xml2,read_html)
+importFrom(xml2,xml_add_sibling)
+importFrom(xml2,xml_find_all)
+importFrom(xml2,xml_remove)
diff --git a/processing/wikiproc/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R
index 182e9c839e512b15475b51821304eafec72cf959..0421cd0bd08aa5bd7c7259072a7267dcb0435cd4 100644
--- a/processing/wikiproc/R/CleanHtml.R
+++ b/processing/wikiproc/R/CleanHtml.R
@@ -2,10 +2,6 @@
 
 # Author: Lucas
 
-library(rvest)
-library(stringi)
-library(textclean)
-
 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
@@ -24,8 +20,8 @@ cleanHtml <- function(html) {
   # - replace multiple newlines with single newline
   result <- read_html(html) %>%
     html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
-    stri_flatten(collapse = " ") %>%
-    replace_html() %>%
+    stringi::stri_flatten(collapse = " ") %>%
+    textclean::replace_html() %>%
     gsub("\\[\\d*\\]", "", .) %>%
     gsub(" +", " ", .) %>%
     gsub("\n ", "\n", .) %>%
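
The rewritten cleanHtml() above now calls stringi and textclean through :: instead of attaching them. A minimal sketch of exercising the function from the console, assuming an article frame from get_data() with a Text column; cleanHtml() is not exported, hence the ::: access:

articles <- wikiproc::get_data(use.cache = TRUE)

# Strips tags, [n] footnote markers and redundant whitespace from one article
plain <- wikiproc:::cleanHtml(articles$Text[1])
substr(plain, 1, 200)
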
diff --git a/processing/wikiproc/R/GetNoOfSpouses.R b/processing/wikiproc/R/GetNoOfSpouses.R
deleted file mode 100755
index 5190edab023ba3063d2afe1bc4f67f85f7cf4e36..0000000000000000000000000000000000000000
--- a/processing/wikiproc/R/GetNoOfSpouses.R
+++ /dev/null
@@ -1,62 +0,0 @@
-### GetNoOfSpouses.R
-### This extracts the number of spouses from the infobox
-### If no infobox or no information about spouses is found assumes there are none
-### Not for use in production, this does not actually get information from text
-
-# Author: David
-
-## Librarys
-
-library(rvest)
-library(data.table)
-
-### Get number of spouses
-
-getNoOfSpouses <- function(article) {
-
-  # If there is no infobox we assume there were no spouses
-  if(!grepl("vcard", article)) {
-    return(0)
-  }
-
-  infoBox <- getInfoBox(article)
-
-  # Get the spouse field
-  spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
-  # Remove everything in parentheses
-  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
-  # Split the strings by newlines to get one spouse per line
-  spouses <- strsplit(spouses, "\n")
-  spouses <- unlist(spouses)
-  if(length(spouses) > 0) {
-    return(length(spouses))
-  }
-  return(0)
-}
-
-### Converts info box to table
-getInfoBox <- function(article) {
-  # Read page as html
-  page <- read_html(article)
-
-  # Extracting text from the html will erase all <br> tags,
-  # this will replace them with line breaks
-
-  xml_find_all(page, ".//br") %>%
-    xml_add_sibling("p", "\n")
-
-  xml_find_all(page, ".//br") %>%
-    xml_remove()
-
-  # Get the info box
-  # Will throw an error if there isnt any, so that should be checked beforehand
-
-  table <- page %>%
-    html_nodes("table.vcard") %>%
-    html_table(fill = TRUE) %>%
-    .[[1]]
-
-  colnames(table) <- c("Desc", "Content")
-
-  return(table)
-}
diff --git a/processing/wikiproc/R/createAnnotations.R b/processing/wikiproc/R/createAnnotations.R
deleted file mode 100644
index b9ca6ebea7029055fc484efb723beb4605a607c9..0000000000000000000000000000000000000000
--- a/processing/wikiproc/R/createAnnotations.R
+++ /dev/null
@@ -1,41 +0,0 @@
-library(cleanNLP)
-
-createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
-
-  # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces
-  # To prevent this we stip 'em again
-
-  filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
-
-  # Check if there is a cached version of the annotations for this article in this specific revision
-
-  if(use.cache & file.exists(filename)) {
-    res <- tryCatch({
-      data <- readRDS(filename)
-      data
-    }, error = function (e) {
-      cat("Cached data seems to be corrupted, redoing annotation.\n")
-    })
-    return(res)
-  }
-
-  annotation <- cnlp_annotate(text, as_strings = TRUE)
-
-  # Write cache if desired
-
-  if(write.cache) {
-    if (!dir.exists("data")) {
-      dir.create("data")
-    }
-    if (!dir.exists("data/annotations")) {
-      dir.create("data/annotations")
-    }
-    saveRDS(annotation, filename)
-  }
-
-  # Return data
-  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
-  # But then again, it suggests snake case for variables...
-
-  return(annotation)
-}
\ No newline at end of file
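
Both deletions above are moves, not removals: getNoOfSpouses() and createAnnotations() resurface below as the exported get_no_of_spouses() and create_annotations(). Because the NAMESPACE shown earlier is generated, exports and imports are refreshed from the roxygen tags rather than edited by hand; a sketch of that step, assuming devtools is installed:

# run with processing/wikiproc/ as the working directory
devtools::document()   # regenerates NAMESPACE and man/*.Rd from the roxygen comments
devtools::check()      # optionally verify the package still builds
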
diff --git a/processing/wikiproc/R/GetData.R b/processing/wikiproc/R/get_data.R
similarity index 90%
rename from processing/wikiproc/R/GetData.R
rename to processing/wikiproc/R/get_data.R
index ef8713e866678a1313db70927bc63216683f2d79..abcf0094a43ee04b8385e6e978258170777b0f7b 100644
--- a/processing/wikiproc/R/GetData.R
+++ b/processing/wikiproc/R/get_data.R
@@ -2,10 +2,6 @@
 
 # Author: David
 
-library(WikipediR) # For querying wikipedia
-library(rvest) # For getting the list of physicits
-library(xml2)
-
 ## Though we could get the pages within the category 'physicists' with something like this
 ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
 ## this gives us only about 50 pages.
@@ -18,7 +14,8 @@ library(xml2)
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+#' @export
+get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 
   dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
   dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
@@ -81,9 +78,9 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 
   # Call the wikipedia api for each entry in our list
 
-  articles <- pblapply(physicists, function(x) {
+  articles <- pbapply::pblapply(physicists, function(x) {
     res <- tryCatch({
-      article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
+      article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
       # Check if the article is a redirect page
       if (grepl(".redirectText", article$parse$text$`*`)) {
         # Get the real article name
@@ -101,7 +98,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
         Encoding(tmp) <- "UTF-8"
         pname <- tmp
 
-        article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
+        article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
       }
       data.frame(Title = article$parse$title,
                  PageID = article$parse$pageid,
diff --git a/processing/wikiproc/R/get_no_of_spouses.R b/processing/wikiproc/R/get_no_of_spouses.R
new file mode 100755
index 0000000000000000000000000000000000000000..c0fb31eeec13d9e4724da79a6ec7a3a35253e40b
--- /dev/null
+++ b/processing/wikiproc/R/get_no_of_spouses.R
@@ -0,0 +1,43 @@
+### get_no_of_spouses.R
+### This extracts the number of spouses from the infobox
+### If no infobox or no information about spouses is found, assumes there are none
+### Not for use in production, this does not actually get information from text
+
+# Author: David
+
+#' Reads the number of spouses from the infobox of a wikipedia article
+#'
+#' @param article Wikipedia article in html format
+#'
+#' @return Integer indicating the number of spouses
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' no.spouses <- get_no_of_spouses(articles$Text[54])
+#'
+#' no.spouses
+#' }
+get_no_of_spouses <- function(article) {
+
+  # If there is no infobox we assume there were no spouses
+  if (!grepl("vcard", article)) {
+    return(0)
+  }
+
+  infoBox <- get_infobox(article)
+
+  # Get the spouse field
+  spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
+  # Remove everything in parentheses
+  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
+  # Split the strings by newlines to get one spouse per line
+  spouses <- strsplit(spouses, "\n")
+  spouses <- unlist(spouses)
+  if (length(spouses) > 0) {
+    return(length(spouses))
+  }
+  return(0)
+}
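
A quick sanity check of the fallback behaviour above; both branches that return 0 are easy to hit from the console. The article index is an arbitrary pick:

articles <- get_data(use.cache = TRUE)

get_no_of_spouses(articles$Text[54])     # one count per newline-separated Spouse entry
get_no_of_spouses("<p>no infobox</p>")   # no vcard table at all, so 0
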
diff --git a/processing/wikiproc/R/import_packages.R b/processing/wikiproc/R/import_packages.R
new file mode 100644
index 0000000000000000000000000000000000000000..03025a3e2cbb9f89380516124e74c48f8b4b95c2
--- /dev/null
+++ b/processing/wikiproc/R/import_packages.R
@@ -0,0 +1,11 @@
+### File used to automatically create package imports with roxygen2
+### Note that importing many packages fully is discouraged, to avoid name conflicts
+### If possible reference functions directly e.g. reshape2::melt()
+### There is a (very) minor performance penalty for ::,
+### if some functions are used frequently you may just import them
+### with something like @importFrom reshape2 melt cast
+
+#' @import rvest
+#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
+#' @importFrom data.table %like%
+NULL
\ No newline at end of file
diff --git a/processing/wikiproc/R/nlp_annotate.R b/processing/wikiproc/R/nlp_annotate.R
new file mode 100644
index 0000000000000000000000000000000000000000..c83921ec93dcc21b20cbebec03c1b458a2bd8609
--- /dev/null
+++ b/processing/wikiproc/R/nlp_annotate.R
@@ -0,0 +1,74 @@
+#' Initialize the nlp backend
+#'
+#' A wrapper used to set the python environment and call cnlp_init
+#'
+#' @param type Type of python env to use, either "conda" or "python"
+#' @param value Connection string: if using a conda environment, its name;
+#'   if using python directly, the path to the python executable
+#'
+#' @return Does not return data
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' init_nlp("conda", "spcy")
+#' }
+init_nlp <- function(type, value) {
+  if (type == "conda") {
+    reticulate::use_condaenv(value, required = TRUE)
+  } else if (type == "python") {
+    reticulate::use_python(value, required = TRUE)
+  }
+  cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
+}
+
+#' Create annotations for the given text
+#'
+#' @param text Text to annotate
+#' @param article.id ArticleID used for caching
+#' @param article.rev.id ArticleRevisionID used for caching
+#' @param use.cache Should cached data be used
+#' @param write.cache Should the generated annotations be cached
+#' @param data.dir Directory the data should be read from and/or written to
+#'
+#' @return Annotation object for use with cleanNLP methods
+#' @export
+create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+
+  # Generate filename; for some reason paste0 will pad the article id with leading whitespace,
+  # so we strip it again
+
+  filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
+
+  # Check if there is a cached version of the annotations for this article in this specific revision
+
+  if (use.cache && file.exists(filename)) {
+    res <- tryCatch({
+      data <- readRDS(filename)
+      data
+    }, error = function (e) {
+      cat("Cached data seems to be corrupted, redoing annotation.\n")
+    })
+    if (!is.null(res)) {
+      return(res)
+    }
+  }
+
+  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
+
+  # Write cache if desired, creating the annotations directory under data.dir if needed
+
+  if (write.cache) {
+    annotations.dir <- paste(data.dir, "annotations", sep = .Platform$file.sep)
+    if (!dir.exists(annotations.dir)) {
+      dir.create(annotations.dir, recursive = TRUE)
+    }
+    saveRDS(annotation, filename)
+  }
+
+  # Return data
+  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
+  # But then again, it suggests snake case for variables...
+
+  return(annotation)
+}
\ No newline at end of file
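
Annotations are keyed on the articleID-revisionID pair, so an already-annotated revision is read from disk instead of being re-annotated. A minimal round trip, assuming the "spcy" conda environment from above, the default data directory and the cleanNLP 2.x accessors used in this diff; the sample sentence is made up:

init_nlp("conda", "spcy")

ann <- create_annotations("Marie Curie was a physicist.",
                          article.id = 1, article.rev.id = 1, write.cache = TRUE)

# Same id and revision: served from data/annotations/1-1.RDS, the text argument is ignored
ann2 <- create_annotations("different text, same revision",
                           article.id = 1, article.rev.id = 1)

cleanNLP::cnlp_get_entity(ann)   # entity table produced by the spacy backend
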
diff --git a/processing/wikiproc/R/utils.R b/processing/wikiproc/R/utils.R
new file mode 100644
index 0000000000000000000000000000000000000000..a518f16f9fa62497250c8d64f87cf9dc2bb012ab
--- /dev/null
+++ b/processing/wikiproc/R/utils.R
@@ -0,0 +1,43 @@
+### Utility functions used internally
+
+
+#' Extract the infobox contents from wikipedia articles
+#'
+#' @param article Character vector containing the contents of a wikipedia
+#'   article as html
+#'
+#' @return Data frame holding the contents of the table
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' infobox <- get_infobox(articles$Text[54])
+#'
+#' infobox[3:4,]
+#' }
+get_infobox <- function(article) {
+  # Read page as html
+  page <- read_html(article)
+
+  # Extracting text from the html will erase all <br> tags,
+  # this will replace them with line breaks
+
+  xml_find_all(page, ".//br") %>%
+    xml_add_sibling("p", "\n")
+
+  xml_find_all(page, ".//br") %>%
+    xml_remove()
+
+  # Get the info box
+  # Will throw an error if there isn't any, so that should be checked beforehand
+
+  table <- page %>%
+    html_nodes("table.vcard") %>%
+    html_table(fill = TRUE) %>%
+    .[[1]]
+
+  colnames(table) <- c("Desc", "Content")
+
+  return(table)
+}
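
get_infobox() stays internal; the <br>-to-newline replacement above is what lets get_no_of_spouses() split multiple spouses on "\n". A sketch via ::: since it is not exported, with a plain grepl() filter standing in for the %like% lookup used above:

articles <- get_data(use.cache = TRUE)

ib <- wikiproc:::get_infobox(articles$Text[54])
ib[grepl("Spouse", ib$Desc), ]   # the Desc/Content row the spouse counter parses
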
diff --git a/processing/wikiproc/man/cleanHtml.Rd b/processing/wikiproc/man/cleanHtml.Rd
index 56994f44d9eadcd9f8cb1fee71bd54d91e518629..7247852e83e59f5f6c7ba54df2f90692b7f98a9c 100644
--- a/processing/wikiproc/man/cleanHtml.Rd
+++ b/processing/wikiproc/man/cleanHtml.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/CleanHtml.R
 \name{cleanHtml}
 \alias{cleanHtml}
-\title{Clean a html formatted wikipedia page. 
+\title{Clean a html formatted wikipedia page.
 Nodes of interest from the DOM are extracted and then cleaned from all html
 tags and annotations.}
 \usage{
@@ -15,7 +15,7 @@ cleanHtml(html)
 Plaintext document containing only the maintext of the give wikipedia page.
 }
 \description{
-Clean a html formatted wikipedia page. 
+Clean a html formatted wikipedia page.
 Nodes of interest from the DOM are extracted and then cleaned from all html
 tags and annotations.
 }
diff --git a/processing/wikiproc/man/create_annotations.Rd b/processing/wikiproc/man/create_annotations.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..305b279d56dea9811b9c147f0912a005bb1f808e
--- /dev/null
+++ b/processing/wikiproc/man/create_annotations.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\name{create_annotations}
+\alias{create_annotations}
+\title{Create annotations for the given text}
+\usage{
+create_annotations(text, article.id, article.rev.id, use.cache = TRUE,
+  write.cache = FALSE, data.dir = "data")
+}
+\arguments{
+\item{text}{Text to annotate}
+
+\item{article.id}{ArticleID used for caching}
+
+\item{article.rev.id}{ArticleRevisionID used for caching}
+
+\item{use.cache}{Should cached data be used}
+
+\item{write.cache}{Should the generated annotations be cached}
+
+\item{data.dir}{Directory the data should be read from and/or written to}
+}
+\value{
+Annotation object for use with cleanNLP methods
+}
+\description{
+Create annotations for the given text
+}
diff --git a/processing/wikiproc/man/getData.Rd b/processing/wikiproc/man/get_data.Rd
similarity index 65%
rename from processing/wikiproc/man/getData.Rd
rename to processing/wikiproc/man/get_data.Rd
index 13e362d15d94d684eb53fe4c6a8001bc4b89949e..cec7d173ad8abf28d0811a9f6ed2a00cc3f77b13 100644
--- a/processing/wikiproc/man/getData.Rd
+++ b/processing/wikiproc/man/get_data.Rd
@@ -1,15 +1,17 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/GetData.R
-\name{getData}
-\alias{getData}
+% Please edit documentation in R/get_data.R
+\name{get_data}
+\alias{get_data}
 \title{Retrieve wikipedia articles about physicists}
 \usage{
-getData(use.cache = TRUE, write.cache = FALSE)
+get_data(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
 }
 \arguments{
 \item{use.cache}{Use cached data if it exists over downloading new data}
 
 \item{write.cache}{Write downloaded results into cache for use on future calls}
+
+\item{data.dir}{Directory the data should be read from and/or written to}
 }
 \value{
 data.frame containing the title, id, revisionID and html-formatted full text
diff --git a/processing/wikiproc/man/get_infobox.Rd b/processing/wikiproc/man/get_infobox.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..ef8d03180df7bce6c794ee1f7377e22de96c06af
--- /dev/null
+++ b/processing/wikiproc/man/get_infobox.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{get_infobox}
+\alias{get_infobox}
+\title{Extract the infobox contents from wikipedia articles}
+\usage{
+get_infobox(article)
+}
+\arguments{
+\item{article}{Character vector containing the contents of a wikipedia
+article as html}
+}
+\value{
+Data frame holding the contents of the table
+}
+\description{
+Extract the infobox contents from wikipedia articles
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+infobox <- get_infobox(articles$Text[54])
+
+infobox[3:4,]
+}
+}
diff --git a/processing/wikiproc/man/get_no_of_spouses.Rd b/processing/wikiproc/man/get_no_of_spouses.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..131c526cb56e28994b3295be2ada26343ecfb103
--- /dev/null
+++ b/processing/wikiproc/man/get_no_of_spouses.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_no_of_spouses.R
+\name{get_no_of_spouses}
+\alias{get_no_of_spouses}
+\title{Reads the number of spouses from the infobox of a wikipedia article}
+\usage{
+get_no_of_spouses(article)
+}
+\arguments{
+\item{article}{Wikipedia article in html format}
+}
+\value{
+Integer indicating the number of spouses
+}
+\description{
+Reads the number of spouses from the infobox of a wikipedia article
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+no.spouses <- get_no_of_spouses(articles$Text[54])
+
+no.spouses
+}
+}
diff --git a/processing/wikiproc/man/init_nlp.Rd b/processing/wikiproc/man/init_nlp.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..47644aaed461189eefcea0cfaf43dfcbcb1f045e
--- /dev/null
+++ b/processing/wikiproc/man/init_nlp.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\name{init_nlp}
+\alias{init_nlp}
+\title{Initialize the nlp backend}
+\usage{
+init_nlp(type, value)
+}
+\arguments{
+\item{type}{Type of python env to use, either "conda" or "python"}
+
+\item{value}{Connection string: if using a conda environment, its name;
+if using python directly, the path to the python executable}
+}
+\value{
+Does not return data
+}
+\description{
+A wrapper used to set the python environment and call cnlp_init
+}
+\examples{
+\dontrun{
+init_nlp("conda", "spcy")
+}
+}
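
Taken together, the diff turns the former bin/ scripts into a package API and leaves master.R as glue. An end-to-end sketch of the resulting flow; the cleaning step via the internal cleanHtml() and the shape of the per-article result are assumptions, since those lines sit outside the hunks shown above:

library(pbapply)
library(wikiproc)
library(rprojroot)

init_nlp("conda", "spcy")   # must run before anything else touches python

project_root <- find_root(has_file("README.md"))
data_dir <- paste(project_root, "data", sep = .Platform$file.sep)

articles <- get_data(use.cache = TRUE, data.dir = data_dir)

results <- pbapply(articles, 1, function(article) {
  cleaned.text <- wikiproc:::cleanHtml(article[4])           # assumed cleaning step
  annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)
  no.spouses <- get_no_of_spouses(article[4])
  # hypothetical result frame; the real columns are built further down in master.R
  data.frame(Name = article[1], NoSpouses = no.spouses, stringsAsFactors = FALSE)
})
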