From 1c9037f0dbd2e9204f63eea823951e87253edaaf Mon Sep 17 00:00:00 2001
From: David Fuhry <>
Date: Sat, 12 Jan 2019 16:00:20 +0100
Subject: [PATCH] Refactoring

* Added roxygen comments
* Moved nlp init into package
* Fixed various bugs
* Added import_packages.R and replaced all other imports with explicit ones
* Converted my methods to snake case
* Some minor stuff
 processing/bin/ProcessNER.R                   | 21 ------
 processing/{bin/Master.R => script/master.R}  | 11 ++-
 processing/wikiproc/NAMESPACE                 | 10 +++
 processing/wikiproc/R/CleanHtml.R             |  8 +-
 processing/wikiproc/R/GetNoOfSpouses.R        | 62 ----------------
 processing/wikiproc/R/createAnnotations.R     | 41 ----------
 .../wikiproc/R/{GetData.R => get_data.R}      | 13 ++--
 processing/wikiproc/R/get_no_of_spouses.R     | 43 +++++++++++
 processing/wikiproc/R/import_packages.R       | 11 +++
 processing/wikiproc/R/nlp_annotate.R          | 74 +++++++++++++++++++
 processing/wikiproc/R/utils.R                 | 43 +++++++++++
 processing/wikiproc/man/cleanHtml.Rd          |  4 +-
 processing/wikiproc/man/create_annotations.Rd | 28 +++++++
 .../wikiproc/man/{getData.Rd => get_data.Rd}  | 10 ++-
 processing/wikiproc/man/get_infobox.Rd        | 27 +++++++
 processing/wikiproc/man/get_no_of_spouses.Rd  | 26 +++++++
 processing/wikiproc/man/init_nlp.Rd           | 25 +++++++
 17 files changed, 309 insertions(+), 148 deletions(-)
 delete mode 100644 processing/bin/ProcessNER.R
 rename processing/{bin/Master.R => script/master.R} (81%)
 delete mode 100755 processing/wikiproc/R/GetNoOfSpouses.R
 delete mode 100644 processing/wikiproc/R/createAnnotations.R
 rename processing/wikiproc/R/{GetData.R => get_data.R} (90%)
 create mode 100755 processing/wikiproc/R/get_no_of_spouses.R
 create mode 100644 processing/wikiproc/R/import_packages.R
 create mode 100644 processing/wikiproc/R/nlp_annotate.R
 create mode 100644 processing/wikiproc/R/utils.R
 create mode 100644 processing/wikiproc/man/create_annotations.Rd
 rename processing/wikiproc/man/{getData.Rd => get_data.Rd} (65%)
 create mode 100644 processing/wikiproc/man/get_infobox.Rd
 create mode 100644 processing/wikiproc/man/get_no_of_spouses.Rd
 create mode 100644 processing/wikiproc/man/init_nlp.Rd

diff --git a/processing/bin/ProcessNER.R b/processing/bin/ProcessNER.R
deleted file mode 100644
index 775ffb1..0000000
--- a/processing/bin/ProcessNER.R
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env Rscript
-### Provides functionality to use NER, POS and Dependency Grammars
-## Author: David
-cat("Initializing spacy backend...\n")
-# It's important to do this prior to loading any python related stuff
-reticulate::use_condaenv("spcy", required = TRUE)
-# Load librarys
-# Init nlp models
-cnlp_init_spacy(entity_flag = TRUE)
diff --git a/processing/bin/Master.R b/processing/script/master.R
similarity index 81%
rename from processing/bin/Master.R
rename to processing/script/master.R
index 8e338d5..22cd347 100755
--- a/processing/bin/Master.R
+++ b/processing/script/master.R
@@ -3,10 +3,13 @@
 ### This script consolidates everything
+## Set up nlp
+init_nlp("conda", "spcy")
 ## Fetch data
 cat("Starting data import...\n")
@@ -15,7 +18,7 @@ cat("Starting data import...\n")
 project_root <- find_root(has_file(""))
 data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
-articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)
+articles <- get_data(use.cache = TRUE, data.dir = data_dir)
 ## Data processing
@@ -31,11 +34,11 @@ results <- pbapply(articles, 1, function(article) {
   ## Data preprocessing/annotating
-  # annotation <- createAnnotations(cleaned.text, article[2], article[3])
+  annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)
   ## Extract information from Text
-  no.spouses <- wikiproc:::getNoOfSpouses(article[4])
+  no.spouses <- get_no_of_spouses(article[4])
   ## Create Results
diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
index 6ae9268..5e2056d 100644
--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -1,2 +1,12 @@
 # Generated by roxygen2: do not edit by hand
diff --git a/processing/wikiproc/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R
index 182e9c8..0421cd0 100644
--- a/processing/wikiproc/R/CleanHtml.R
+++ b/processing/wikiproc/R/CleanHtml.R
@@ -2,10 +2,6 @@
 # Author: Lucas
 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
@@ -24,8 +20,8 @@ cleanHtml <- function(html) {
   #     - replace multiple newlines with single newline
   result <- read_html(html) %>%
     html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
-    stri_flatten(collapse = " ") %>%
-    replace_html() %>%
+    stringi::stri_flatten(collapse = " ") %>%
+    textclean::replace_html() %>%
     gsub("\\[\\d*\\]", "", .) %>%
     gsub("  +", " ", .) %>%
     gsub("\n ", "\n", .) %>%
diff --git a/processing/wikiproc/R/GetNoOfSpouses.R b/processing/wikiproc/R/GetNoOfSpouses.R
deleted file mode 100755
index 5190eda..0000000
--- a/processing/wikiproc/R/GetNoOfSpouses.R
+++ /dev/null
@@ -1,62 +0,0 @@
-### GetNoOfSpouses.R
-### This extracts the number of spouses from the infobox
-### If no infobox or no information about spouses is found assumes there are none
-### Not for use in production, this does not actually get information from text
-# Author: David
-## Librarys
-### Get number of spouses
-getNoOfSpouses <- function(article) {
-  # If there is no infobox we assume there were no spouses
-  if(!grepl("vcard", article)) {
-    return(0)
-  }
-  infoBox <- getInfoBox(article)
-  # Get the spouse field
-  spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
-  # Remove everything in parentheses
-  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
-  # Split the strings by newlines to get one spouse per line
-  spouses <- strsplit(spouses, "\n")
-  spouses <- unlist(spouses)
-  if(length(spouses) > 0) {
-    return(length(spouses))
-  }
-  return(0)
-### Converts info box to table
-getInfoBox <- function(article) {
-  # Read page as html
-  page <- read_html(article)
-  # Extracting text from the html will erase all <br> tags,
-  # this will replace them with line breaks
-  xml_find_all(page, ".//br") %>%
-    xml_add_sibling("p", "\n")
-  xml_find_all(page, ".//br") %>%
-    xml_remove()
-  # Get the info box
-  # Will throw an error if there isnt any, so that should be checked beforehand
-  table <- page %>%
-    html_nodes("table.vcard") %>%
-    html_table(fill = TRUE) %>%
-    .[[1]]
-  colnames(table) <- c("Desc", "Content")
-  return(table)
diff --git a/processing/wikiproc/R/createAnnotations.R b/processing/wikiproc/R/createAnnotations.R
deleted file mode 100644
index b9ca6eb..0000000
--- a/processing/wikiproc/R/createAnnotations.R
+++ /dev/null
@@ -1,41 +0,0 @@
-createAnnotations <- function(text,,, use.cache = TRUE, write.cache = FALSE) {
-  # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces
-  # To prevent this we stip 'em again
-  filename <- gsub(" ", "", paste0("data/annotations/",, "-",, ".RDS"), fixed = TRUE)
-  # Check if there is a cached version of the annotations for this article in this specific revision
-  if(use.cache & file.exists(filename)) {
-    res <- tryCatch({
-      data <- readRDS(filename)
-      data
-    }, error = function (e) {
-      cat("Cached data seems to be corrupted, redoing annotation.\n")
-    })
-    return(res)
-  }
-  annotation <- cnlp_annotate(text, as_strings = TRUE)
-  # Write cache if desired
-  if(write.cache) {
-    if (!dir.exists("data")) {
-      dir.create("data")
-    }
-    if (!dir.exists("data/annotations")) {
-      dir.create("data/annotations")
-    }
-    saveRDS(annotation, filename)
-  }
-  # Return data
-  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
-  # But then again, it suggests snake case for variables...
-  return(annotation)
\ No newline at end of file
diff --git a/processing/wikiproc/R/GetData.R b/processing/wikiproc/R/get_data.R
similarity index 90%
rename from processing/wikiproc/R/GetData.R
rename to processing/wikiproc/R/get_data.R
index ef8713e..abcf009 100644
--- a/processing/wikiproc/R/GetData.R
+++ b/processing/wikiproc/R/get_data.R
@@ -2,10 +2,6 @@
 # Author: David
-library(WikipediR) # For querying wikipedia
-library(rvest) # For getting the list of physicits
 ## Though we could get the pages within the category 'physicists' with something like this
 ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
 ## this gives us only about 50 pages.
@@ -18,7 +14,8 @@ library(xml2)
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+#' @export
+get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
   dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
   dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
@@ -81,9 +78,9 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
   # Call the wikipedia api for each entry in our list
-  articles <- pblapply(physicists, function(x) {
+  articles <- pbapply::pblapply(physicists, function(x) {
     res <- tryCatch({
-      article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
+      article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
       # Check if the article is a redirect page
       if (grepl(".redirectText", article$parse$text$`*`)) {
         # Get the real article name
@@ -101,7 +98,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
         Encoding(tmp) <- "UTF-8"
         pname <- tmp
-        article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
+        article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
       data.frame(Title = article$parse$title,
                  PageID = article$parse$pageid,
diff --git a/processing/wikiproc/R/get_no_of_spouses.R b/processing/wikiproc/R/get_no_of_spouses.R
new file mode 100755
index 0000000..c0fb31e
--- /dev/null
+++ b/processing/wikiproc/R/get_no_of_spouses.R
@@ -0,0 +1,43 @@
+### get_no_of_spouses.R
+### This extracts the number of spouses from the infobox
+### If no infobox or no information about spouses is found assumes there are none
+### Not for use in production, this does not actually get information from text
+# Author: David
+#' Reads the number of spouses from the infobox of a wikipedia article
+#' @param article Wikipedia article in html format
+#' @return Integer indicating the number of spouses
+#' @export
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#' no.spouses <- get_no_of_spouses(articles$Text[54])
+#' no.spouses
+#' }
+get_no_of_spouses <- function(article) {
+  # If there is no infobox we assume there were no spouses
+  if(!grepl("vcard", article)) {
+    return(0)
+  }
+  infoBox <- get_infobox(article)
+  # Get the spouse field
+  spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
+  # Remove everything in parentheses
+  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
+  # Split the strings by newlines to get one spouse per line
+  spouses <- strsplit(spouses, "\n")
+  spouses <- unlist(spouses)
+  if(length(spouses) > 0) {
+    return(length(spouses))
+  }
+  return(0)
diff --git a/processing/wikiproc/R/import_packages.R b/processing/wikiproc/R/import_packages.R
new file mode 100644
index 0000000..03025a3
--- /dev/null
+++ b/processing/wikiproc/R/import_packages.R
@@ -0,0 +1,11 @@
+### File used to automatically create package imports with roxygen2
+### Note that it is discouraged to import many packages fully to avoid name conflicts
+### If possible reference functions directly e.g. reshape2::melt()
+### There is a (very) minor performance penalty for ::,
+### if some functions are used frequently you may just import them
+### with something like @importFrom reshape2 melt cast
+#' @import rvest
+#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
+#' @importFrom data.table %like%
\ No newline at end of file
diff --git a/processing/wikiproc/R/nlp_annotate.R b/processing/wikiproc/R/nlp_annotate.R
new file mode 100644
index 0000000..c83921e
--- /dev/null
+++ b/processing/wikiproc/R/nlp_annotate.R
@@ -0,0 +1,74 @@
+#' Initialize the nlp backend
+#' A wrapper used to set the python environment and call cnlp_init
+#' @param type Type of python env to use, either "conda" or "python"
+#' @param value Connection string, if using a conda environment the name of it
+#' if using python directly the path to the python executable
+#' @return Does not return data
+#' @export
+#' @examples
+#' \dontrun{
+#' init_nlp("conda", "spcy")
+#' }
+init_nlp <- function(type, value) {
+  if (type == "conda") {
+    reticulate::use_condaenv(value, required = TRUE)
+  } else if (type == "python") {
+    reticulate::use_python(value, required = TRUE)
+  }
+  cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
+#' Create annotations for the given text
+#' @param text Text to annotate
+#' @param ArticleID used for caching
+#' @param ArticleRevisionID used for caching
+#' @param use.cache Should cached data be used
+#' @param write.cache Should the generated annotations be cached
+#' @param data.dir Directory the data should be read from and/or written to
+#' @return Annotation object for use with cleanNLP methods
+#' @export
+create_annotations <- function(text,,, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+  # Generate filename; for some reason paste0 pads the article id with leading whitespace.
+  # To prevent this we strip all spaces from the resulting path again
+  filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(, "-",, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
+  # Check if there is a cached version of the annotations for this article in this specific revision
+  if(use.cache & file.exists(filename)) {
+    res <- tryCatch({
+      data <- readRDS(filename)
+      data
+    }, error = function (e) {
+      cat("Cached data seems to be corrupted, redoing annotation.\n")
+    })
+    return(res)
+  }
+  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
+  # Write cache if desired
+  if(write.cache) {
+    if (!dir.exists("data")) {
+      dir.create("data")
+    }
+    if (!dir.exists("data/annotations")) {
+      dir.create("data/annotations")
+    }
+    saveRDS(annotation, filename)
+  }
+  # Return data
+  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
+  # But then again, it suggests snake case for variables...
+  return(annotation)
\ No newline at end of file
diff --git a/processing/wikiproc/R/utils.R b/processing/wikiproc/R/utils.R
new file mode 100644
index 0000000..a518f16
--- /dev/null
+++ b/processing/wikiproc/R/utils.R
@@ -0,0 +1,43 @@
+### Utility functions used internally
+#' Extract the infobox contents from wikipedia articles
+#' @param article Character vector containing the contents of a wikipedia
+#' article as html
+#' @return Data frame holding the contents of the table
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#' infobox <- get_infobox(articles$Text[54])
+#' infobox[3:4,]
+#' }
+get_infobox <- function(article) {
+  # Read page as html
+  page <- read_html(article)
+  # Extracting text from the html will erase all <br> tags,
+  # this will replace them with line breaks
+  xml_find_all(page, ".//br") %>%
+    xml_add_sibling("p", "\n")
+  xml_find_all(page, ".//br") %>%
+    xml_remove()
+  # Get the info box
+  # Will throw an error if there isnt any, so that should be checked beforehand
+  table <- page %>%
+    html_nodes("table.vcard") %>%
+    html_table(fill = TRUE) %>%
+    .[[1]]
+  colnames(table) <- c("Desc", "Content")
+  return(table)
diff --git a/processing/wikiproc/man/cleanHtml.Rd b/processing/wikiproc/man/cleanHtml.Rd
index 56994f4..7247852 100644
--- a/processing/wikiproc/man/cleanHtml.Rd
+++ b/processing/wikiproc/man/cleanHtml.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/CleanHtml.R
-\title{Clean a html formatted wikipedia page. 
+\title{Clean a html formatted wikipedia page.
 Nodes of interest from the DOM are extracted and then cleaned from all html
 tags and annotations.}
@@ -15,7 +15,7 @@ cleanHtml(html)
 Plaintext document containing only the maintext of the give wikipedia page.
-Clean a html formatted wikipedia page. 
+Clean a html formatted wikipedia page.
 Nodes of interest from the DOM are extracted and then cleaned from all html
 tags and annotations.
diff --git a/processing/wikiproc/man/create_annotations.Rd b/processing/wikiproc/man/create_annotations.Rd
new file mode 100644
index 0000000..305b279
--- /dev/null
+++ b/processing/wikiproc/man/create_annotations.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\title{Create annotations for the given text}
+create_annotations(text,,, use.cache = TRUE,
+  write.cache = FALSE, data.dir = "data")
+\item{text}{Text to annotate}
+\item{}{ArticleID used for caching}
+\item{}{ArticleRevisionID used for caching}
+\item{use.cache}{Should cached data be used}
+\item{write.cache}{Should the generated annotations be cached}
+\item{data.dir}{Directory the data should be read from and/or written to}
+Annotation object for use with cleanNLP methods
+Create annotations for the given text
diff --git a/processing/wikiproc/man/getData.Rd b/processing/wikiproc/man/get_data.Rd
similarity index 65%
rename from processing/wikiproc/man/getData.Rd
rename to processing/wikiproc/man/get_data.Rd
index 13e362d..cec7d17 100644
--- a/processing/wikiproc/man/getData.Rd
+++ b/processing/wikiproc/man/get_data.Rd
@@ -1,15 +1,17 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/GetData.R
+% Please edit documentation in R/get_data.R
 \title{Retrieve wikipedia articles about physicists}
-getData(use.cache = TRUE, write.cache = FALSE)
+get_data(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
 \item{use.cache}{Use cached data if it exists over downloading new data}
 \item{write.cache}{Write downloaded results into cache for use on future calls}
+\item{data.dir}{Directory the data should be read from and/or written to}
 data.frame containing the title, id, revisionID and html-formatted full text
diff --git a/processing/wikiproc/man/get_infobox.Rd b/processing/wikiproc/man/get_infobox.Rd
new file mode 100644
index 0000000..ef8d031
--- /dev/null
+++ b/processing/wikiproc/man/get_infobox.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\title{Extract the infobox contents from wikipedia articles}
+\item{article}{Character vector containing the contents of a wikipedia
+article as html}
+Data frame holding the contents of the table
+Extract the infobox contents from wikipedia articles
+articles <- get_data()
+infobox <- get_infobox(articles$Text[54])
diff --git a/processing/wikiproc/man/get_no_of_spouses.Rd b/processing/wikiproc/man/get_no_of_spouses.Rd
new file mode 100644
index 0000000..131c526
--- /dev/null
+++ b/processing/wikiproc/man/get_no_of_spouses.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_no_of_spouses.R
+\title{Reads the number of spouses from the infobox of a wikipedia article}
+\item{article}{Wikipedia article in html format}
+Integer indicating the number of spouses
+Reads the number of spouses from the infobox of a wikipedia article
+articles <- get_data()
+no.spouses <- get_no_of_spouses(articles$Text[54])
diff --git a/processing/wikiproc/man/init_nlp.Rd b/processing/wikiproc/man/init_nlp.Rd
new file mode 100644
index 0000000..47644aa
--- /dev/null
+++ b/processing/wikiproc/man/init_nlp.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\title{Initialize the nlp backend}
+init_nlp(type, value)
+\item{type}{Type of python env to use, either "conda" or "python"}
+\item{value}{Connection string, if using a conda environment the name of it
+if using python directly the path to the python executable}
+Does not return data
+A wrapper used to set the python environment and call cnlp_init
+init_nlp("conda", "spcy")