From 1c9037f0dbd2e9204f63eea823951e87253edaaf Mon Sep 17 00:00:00 2001
From: David Fuhry <david@129a-records.de>
Date: Sat, 12 Jan 2019 16:00:20 +0100
Subject: [PATCH] Refactoring

* Added roxygen comments
* Moved nlp init into package
* Fixed various bugs
* Added import_packages.R and replaced all other imports with explicit ones
* Converted my methods to snake case
* Some minor stuff
---
 processing/bin/ProcessNER.R                   | 21 ------
 processing/{bin/Master.R => script/master.R}  | 11 ++-
 processing/wikiproc/NAMESPACE                 | 10 +++
 processing/wikiproc/R/CleanHtml.R             |  8 +-
 processing/wikiproc/R/GetNoOfSpouses.R        | 62 ----------------
 processing/wikiproc/R/createAnnotations.R     | 41 ----------
 .../wikiproc/R/{GetData.R => get_data.R}      | 13 ++--
 processing/wikiproc/R/get_no_of_spouses.R     | 43 +++++++++++
 processing/wikiproc/R/import_packages.R       | 11 +++
 processing/wikiproc/R/nlp_annotate.R          | 74 +++++++++++++++++++
 processing/wikiproc/R/utils.R                 | 43 +++++++++++
 processing/wikiproc/man/cleanHtml.Rd          |  4 +-
 processing/wikiproc/man/create_annotations.Rd | 28 +++++++
 .../wikiproc/man/{getData.Rd => get_data.Rd}  | 10 ++-
 processing/wikiproc/man/get_infobox.Rd        | 27 +++++++
 processing/wikiproc/man/get_no_of_spouses.Rd  | 26 +++++++
 processing/wikiproc/man/init_nlp.Rd           | 25 +++++++
 17 files changed, 309 insertions(+), 148 deletions(-)
 delete mode 100644 processing/bin/ProcessNER.R
 rename processing/{bin/Master.R => script/master.R} (81%)
 delete mode 100755 processing/wikiproc/R/GetNoOfSpouses.R
 delete mode 100644 processing/wikiproc/R/createAnnotations.R
 rename processing/wikiproc/R/{GetData.R => get_data.R} (90%)
 create mode 100755 processing/wikiproc/R/get_no_of_spouses.R
 create mode 100644 processing/wikiproc/R/import_packages.R
 create mode 100644 processing/wikiproc/R/nlp_annotate.R
 create mode 100644 processing/wikiproc/R/utils.R
 create mode 100644 processing/wikiproc/man/create_annotations.Rd
 rename processing/wikiproc/man/{getData.Rd => get_data.Rd} (65%)
 create mode 100644 processing/wikiproc/man/get_infobox.Rd
 create mode 100644 processing/wikiproc/man/get_no_of_spouses.Rd
 create mode 100644 processing/wikiproc/man/init_nlp.Rd

diff --git a/processing/bin/ProcessNER.R b/processing/bin/ProcessNER.R
deleted file mode 100644
index 775ffb1..0000000
--- a/processing/bin/ProcessNER.R
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env Rscript
-
-### Provides functionality to use NER, POS and Dependency Grammars
-
-## Author: David
-
-cat("Initializing spacy backend...\n")
-
-# It's important to do this prior to loading any python related stuff
-
-reticulate::use_condaenv("spcy", required = TRUE)
-
-# Load librarys
-
-library(cleanNLP)
-
-# Init nlp models
-
-cnlp_init_spacy(entity_flag = TRUE)
-
-cat("Done.\n")
diff --git a/processing/bin/Master.R b/processing/script/master.R
similarity index 81%
rename from processing/bin/Master.R
rename to processing/script/master.R
index 8e338d5..22cd347 100755
--- a/processing/bin/Master.R
+++ b/processing/script/master.R
@@ -3,10 +3,13 @@
 ### This script consolidates everything
 
 library(pbapply)
-library(rvest)
 library(wikiproc)
 library(rprojroot)
 
+## Set up nlp
+
+init_nlp("conda", "spcy")
+
 ## Fetch data
 
 cat("Starting data import...\n")
@@ -15,7 +18,7 @@ project_root <- find_root(has_file("README.md"))
 data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
 
-articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)
+articles <- get_data(use.cache = TRUE, data.dir = data_dir)
 
 ## Data processing
 
 cat("Processing data...\n")
@@ -31,11 +34,11 @@ results <- pbapply(articles, 1, function(article) {
 
   ## Data preprocessing/annotating
 
-  # annotation <- createAnnotations(cleaned.text, article[2], article[3])
+  annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)
 
   ## Extract information from Text
 
-  no.spouses <- wikiproc:::getNoOfSpouses(article[4])
+  no.spouses <- get_no_of_spouses(article[4])
 
   ## Create Results
 
diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
index 6ae9268..5e2056d 100644
--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -1,2 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
+export(create_annotations)
+export(get_data)
+export(get_no_of_spouses)
+export(init_nlp)
+import(rvest)
+importFrom(data.table,"%like%")
+importFrom(xml2,read_html)
+importFrom(xml2,xml_add_sibling)
+importFrom(xml2,xml_find_all)
+importFrom(xml2,xml_remove)
diff --git a/processing/wikiproc/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R
index 182e9c8..0421cd0 100644
--- a/processing/wikiproc/R/CleanHtml.R
+++ b/processing/wikiproc/R/CleanHtml.R
@@ -2,10 +2,6 @@
 
 # Author: Lucas
 
-library(rvest)
-library(stringi)
-library(textclean)
-
 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
@@ -24,8 +20,8 @@ cleanHtml <- function(html) {
   # - replace multiple newlines with single newline
   result <- read_html(html) %>%
     html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
-    stri_flatten(collapse = " ") %>%
-    replace_html() %>%
+    stringi::stri_flatten(collapse = " ") %>%
+    textclean::replace_html() %>%
     gsub("\\[\\d*\\]", "", .) %>%
     gsub(" +", " ", .) %>%
     gsub("\n ", "\n", .) %>%
diff --git a/processing/wikiproc/R/GetNoOfSpouses.R b/processing/wikiproc/R/GetNoOfSpouses.R
deleted file mode 100755
index 5190eda..0000000
--- a/processing/wikiproc/R/GetNoOfSpouses.R
+++ /dev/null
@@ -1,62 +0,0 @@
-### GetNoOfSpouses.R
-### This extracts the number of spouses from the infobox
-### If no infobox or no information about spouses is found assumes there are none
-### Not for use in production, this does not actually get information from text
-
-# Author: David
-
-## Librarys
-
-library(rvest)
-library(data.table)
-
-### Get number of spouses
-
-getNoOfSpouses <- function(article) {
-
-  # If there is no infobox we assume there were no spouses
-  if(!grepl("vcard", article)) {
-    return(0)
-  }
-
-  infoBox <- getInfoBox(article)
-
-  # Get the spouse field
-  spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
-  # Remove everything in parentheses
-  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
-  # Split the strings by newlines to get one spouse per line
-  spouses <- strsplit(spouses, "\n")
-  spouses <- unlist(spouses)
-  if(length(spouses) > 0) {
-    return(length(spouses))
-  }
-  return(0)
-}
-
-### Converts info box to table
-getInfoBox <- function(article) {
-  # Read page as html
-  page <- read_html(article)
-
-  # Extracting text from the html will erase all <br> tags,
-  # this will replace them with line breaks
-
-  xml_find_all(page, ".//br") %>%
-    xml_add_sibling("p", "\n")
-
-  xml_find_all(page, ".//br") %>%
-    xml_remove()
-
-  # Get the info box
-  # Will throw an error if there isnt any, so that should be checked beforehand
-
-  table <- page %>%
-    html_nodes("table.vcard") %>%
-    html_table(fill = TRUE) %>%
-    .[[1]]
-
-  colnames(table) <- c("Desc", "Content")
-
-  return(table)
-}
diff --git a/processing/wikiproc/R/createAnnotations.R b/processing/wikiproc/R/createAnnotations.R
deleted file mode 100644
index b9ca6eb..0000000
--- a/processing/wikiproc/R/createAnnotations.R
+++ /dev/null
@@ -1,41 +0,0 @@
-library(cleanNLP)
-
-createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
-
-  # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces
-  # To prevent this we stip 'em again
-
-  filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
-
-  # Check if there is a cached version of the annotations for this article in this specific revision
-
-  if(use.cache & file.exists(filename)) {
-    res <- tryCatch({
-      data <- readRDS(filename)
-      data
-    }, error = function (e) {
-      cat("Cached data seems to be corrupted, redoing annotation.\n")
-    })
-    return(res)
-  }
-
-  annotation <- cnlp_annotate(text, as_strings = TRUE)
-
-  # Write cache if desired
-
-  if(write.cache) {
-    if (!dir.exists("data")) {
-      dir.create("data")
-    }
-    if (!dir.exists("data/annotations")) {
-      dir.create("data/annotations")
-    }
-    saveRDS(annotation, filename)
-  }
-
-  # Return data
-  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
-  # But then again, it suggests snake case for variables...
-
-  return(annotation)
-}
\ No newline at end of file
diff --git a/processing/wikiproc/R/GetData.R b/processing/wikiproc/R/get_data.R
similarity index 90%
rename from processing/wikiproc/R/GetData.R
rename to processing/wikiproc/R/get_data.R
index ef8713e..abcf009 100644
--- a/processing/wikiproc/R/GetData.R
+++ b/processing/wikiproc/R/get_data.R
@@ -2,10 +2,6 @@
 
 # Author: David
 
-library(WikipediR) # For querying wikipedia
-library(rvest) # For getting the list of physicits
-library(xml2)
-
 ## Though we could get the pages within the category 'physicists' with something like this
 ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
 ## this gives us only about 50 pages.
@@ -18,7 +14,8 @@ library(xml2)
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+#' @export
+get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 
   dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
   dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
@@ -81,9 +78,9 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 
   # Call the wikipedia api for each entry in our list
 
-  articles <- pblapply(physicists, function(x) {
+  articles <- pbapply::pblapply(physicists, function(x) {
     res <- tryCatch({
-      article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
+      article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
       # Check if the article is a redirect page
       if (grepl(".redirectText", article$parse$text$`*`)) {
         # Get the real article name
@@ -101,7 +98,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
         Encoding(tmp) <- "UTF-8"
 
         pname <- tmp
-        article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
+        article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
       }
       data.frame(Title = article$parse$title,
                  PageID = article$parse$pageid,
diff --git a/processing/wikiproc/R/get_no_of_spouses.R b/processing/wikiproc/R/get_no_of_spouses.R
new file mode 100755
index 0000000..c0fb31e
--- /dev/null
+++ b/processing/wikiproc/R/get_no_of_spouses.R
@@ -0,0 +1,43 @@
+### GetNoOfSpouses.R
+### This extracts the number of spouses from the infobox
+### If no infobox or no information about spouses is found assumes there are none
+### Not for use in production, this does not actually get information from text
+
+# Author: David
+
+#' Reads the number of spouses from the infobox of a wikipedia article
+#'
+#' @param article Wikipedia article in html format
+#'
+#' @return Integer indicating the number of spouses
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' no.spouses <- get_no_of_spouses(articles$Text[54])
+#'
+#' no.spouses
+#' }
+get_no_of_spouses <- function(article) {
+
+  # If there is no infobox we assume there were no spouses
+  if(!grepl("vcard", article)) {
+    return(0)
+  }
+
+  infoBox <- get_infobox(article)
+
+  # Get the spouse field
+  spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
+  # Remove everything in parentheses
+  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
+  # Split the strings by newlines to get one spouse per line
+  spouses <- strsplit(spouses, "\n")
+  spouses <- unlist(spouses)
+  if(length(spouses) > 0) {
+    return(length(spouses))
+  }
+  return(0)
+}
diff --git a/processing/wikiproc/R/import_packages.R b/processing/wikiproc/R/import_packages.R
new file mode 100644
index 0000000..03025a3
--- /dev/null
+++ b/processing/wikiproc/R/import_packages.R
@@ -0,0 +1,11 @@
+### File used to automatically create package imports with roxygen2
+### Note that it is discouraged to import many packages fully to avoid name conflicts
+### If possible reference functions directly e.g. reshape2::melt()
+### There is a (very) minor performance penalty for ::,
+### if some functions are used frequently you may just import them
+### with something like @importFrom reshape2 melt cast
+
+#' @import rvest
+#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
+#' @importFrom data.table %like%
+NULL
\ No newline at end of file
diff --git a/processing/wikiproc/R/nlp_annotate.R b/processing/wikiproc/R/nlp_annotate.R
new file mode 100644
index 0000000..c83921e
--- /dev/null
+++ b/processing/wikiproc/R/nlp_annotate.R
@@ -0,0 +1,74 @@
+#' Initialize the nlp backend
+#'
+#' A wrapper used to set the python environment and call cnlp_init
+#'
+#' @param type Type of python env to use, either "conda" or "python"
+#' @param value Connection string, if using a conda environment the name of it,
+#' if using python directly the path to the python executable
+#'
+#' @return Does not return data
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' init_nlp("conda", "spcy")
+#' }
+init_nlp <- function(type, value) {
+  if (type == "conda") {
+    reticulate::use_condaenv(value, required = TRUE)
+  } else if (type == "python") {
+    reticulate::use_python(value, required = TRUE)
+  }
+  cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
+}
+
+#' Create annotations for the given text
+#'
+#' @param text Text to annotate
+#' @param article.id ArticleID used for caching
+#' @param article.rev.id ArticleRevisionID used for caching
+#' @param use.cache Should cached data be used
+#' @param write.cache Should the generated annotations be cached
+#' @param data.dir Directory the data should be read from and/or written to
+#'
+#' @return Annotation object for use with cleanNLP methods
+#' @export
+create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+
+  # Generate filename, for some reason paste0 will pad the article id with leading whitespaces
+  # To prevent this we strip them again
+
+  filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
+
+  # Check if there is a cached version of the annotations for this article in this specific revision
+
+  if(use.cache & file.exists(filename)) {
+    res <- tryCatch({
+      data <- readRDS(filename)
+      data
+    }, error = function (e) {
+      cat("Cached data seems to be corrupted, redoing annotation.\n")
+    })
+    return(res)
+  }
+
+  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
+
+  # Write cache if desired
+
+  if(write.cache) {
+    if (!dir.exists("data")) {
+      dir.create("data")
+    }
+    if (!dir.exists("data/annotations")) {
+      dir.create("data/annotations")
+    }
+    saveRDS(annotation, filename)
+  }
+
+  # Return data
+  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
+  # But then again, it suggests snake case for variables...
+
+  return(annotation)
+}
\ No newline at end of file
diff --git a/processing/wikiproc/R/utils.R b/processing/wikiproc/R/utils.R
new file mode 100644
index 0000000..a518f16
--- /dev/null
+++ b/processing/wikiproc/R/utils.R
@@ -0,0 +1,43 @@
+### Utility functions used internally
+
+
+#' Extract the infobox contents from wikipedia articles
+#'
+#' @param article Character vector containing the contents of a wikipedia
+#' article as html
+#'
+#' @return Data frame holding the contents of the table
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' infobox <- get_infobox(articles$Text[54])
+#'
+#' infobox[3:4,]
+#' }
+get_infobox <- function(article) {
+  # Read page as html
+  page <- read_html(article)
+
+  # Extracting text from the html will erase all <br> tags,
+  # this will replace them with line breaks
+
+  xml_find_all(page, ".//br") %>%
+    xml_add_sibling("p", "\n")
+
+  xml_find_all(page, ".//br") %>%
+    xml_remove()
+
+  # Get the info box
+  # Will throw an error if there isn't any, so that should be checked beforehand
+
+  table <- page %>%
+    html_nodes("table.vcard") %>%
+    html_table(fill = TRUE) %>%
+    .[[1]]
+
+  colnames(table) <- c("Desc", "Content")
+
+  return(table)
+}
diff --git a/processing/wikiproc/man/cleanHtml.Rd b/processing/wikiproc/man/cleanHtml.Rd
index 56994f4..7247852 100644
--- a/processing/wikiproc/man/cleanHtml.Rd
+++ b/processing/wikiproc/man/cleanHtml.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/CleanHtml.R
 \name{cleanHtml}
 \alias{cleanHtml}
-\title{Clean a html formatted wikipedia page. 
+\title{Clean a html formatted wikipedia page.
 Nodes of interest from the DOM are extracted and then cleaned from all html
 tags and annotations.}
 \usage{
@@ -15,7 +15,7 @@ cleanHtml(html)
 Plaintext document containing only the maintext of the give wikipedia page.
 }
 \description{
-Clean a html formatted wikipedia page. 
+Clean a html formatted wikipedia page.
 Nodes of interest from the DOM are extracted and then cleaned from all html
 tags and annotations.
 }
diff --git a/processing/wikiproc/man/create_annotations.Rd b/processing/wikiproc/man/create_annotations.Rd
new file mode 100644
index 0000000..305b279
--- /dev/null
+++ b/processing/wikiproc/man/create_annotations.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\name{create_annotations}
+\alias{create_annotations}
+\title{Create annotations for the given text}
+\usage{
+create_annotations(text, article.id, article.rev.id, use.cache = TRUE,
+  write.cache = FALSE, data.dir = "data")
+}
+\arguments{
+\item{text}{Text to annotate}
+
+\item{article.id}{ArticleID used for caching}
+
+\item{article.rev.id}{ArticleRevisionID used for caching}
+
+\item{use.cache}{Should cached data be used}
+
+\item{write.cache}{Should the generated annotations be cached}
+
+\item{data.dir}{Directory the data should be read from and/or written to}
+}
+\value{
+Annotation object for use with cleanNLP methods
+}
+\description{
+Create annotations for the given text
+}
diff --git a/processing/wikiproc/man/getData.Rd b/processing/wikiproc/man/get_data.Rd
similarity index 65%
rename from processing/wikiproc/man/getData.Rd
rename to processing/wikiproc/man/get_data.Rd
index 13e362d..cec7d17 100644
--- a/processing/wikiproc/man/getData.Rd
+++ b/processing/wikiproc/man/get_data.Rd
@@ -1,15 +1,17 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/GetData.R
-\name{getData}
-\alias{getData}
+% Please edit documentation in R/get_data.R
+\name{get_data}
+\alias{get_data}
 \title{Retrieve wikipedia articles about physicists}
 \usage{
-getData(use.cache = TRUE, write.cache = FALSE)
+get_data(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
 }
 \arguments{
 \item{use.cache}{Use cached data if it exists over downloading new data}
 
 \item{write.cache}{Write downloaded results into cache for use on future calls}
+
+\item{data.dir}{Directory the data should be read from and/or written to}
 }
 \value{
 data.frame containing the title, id, revisionID and html-formatted full text
diff --git a/processing/wikiproc/man/get_infobox.Rd b/processing/wikiproc/man/get_infobox.Rd
new file mode 100644
index 0000000..ef8d031
--- /dev/null
+++ b/processing/wikiproc/man/get_infobox.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{get_infobox}
+\alias{get_infobox}
+\title{Extract the infobox contents from wikipedia articles}
+\usage{
+get_infobox(article)
+}
+\arguments{
+\item{article}{Character vector containing the contents of a wikipedia
+article as html}
+}
+\value{
+Data frame holding the contents of the table
+}
+\description{
+Extract the infobox contents from wikipedia articles
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+infobox <- get_infobox(articles$Text[54])
+
+infobox[3:4,]
+}
+}
diff --git a/processing/wikiproc/man/get_no_of_spouses.Rd b/processing/wikiproc/man/get_no_of_spouses.Rd
new file mode 100644
index 0000000..131c526
--- /dev/null
+++ b/processing/wikiproc/man/get_no_of_spouses.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_no_of_spouses.R
+\name{get_no_of_spouses}
+\alias{get_no_of_spouses}
+\title{Reads the number of spouses from the infobox of a wikipedia article}
+\usage{
+get_no_of_spouses(article)
+}
+\arguments{
+\item{article}{Wikipedia article in html format}
+}
+\value{
+Integer indicating the number of spouses
+}
+\description{
+Reads the number of spouses from the infobox of a wikipedia article
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+no.spouses <- get_no_of_spouses(articles$Text[54])
+
+no.spouses
+}
+}
diff --git a/processing/wikiproc/man/init_nlp.Rd b/processing/wikiproc/man/init_nlp.Rd
new file mode 100644
index 0000000..47644aa
--- /dev/null
+++ b/processing/wikiproc/man/init_nlp.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\name{init_nlp}
+\alias{init_nlp}
+\title{Initialize the nlp backend}
+\usage{
+init_nlp(type, value)
+}
+\arguments{
+\item{type}{Type of python env to use, either "conda" or "python"}
+
+\item{value}{Connection string, if using a conda environment the name of it,
+if using python directly the path to the python executable}
+}
+\value{
+Does not return data
+}
+\description{
+A wrapper used to set the python environment and call cnlp_init
+}
+\examples{
+\dontrun{
+init_nlp("conda", "spcy")
+}
+}
-- 
GitLab