Commit 80c1245f authored by Lucas Schons


Merge branch '32-add-unit-tests-for-cleanhtml-r' of git.informatik.uni-leipzig.de:text-mining-chatbot/wiki-rasa into 32-add-unit-tests-for-cleanhtml-r
parents 002a88f1 1c9037f0
3 merge requests: !34 Resolve "Add unit tests for clean_html.R", !28 WIP: Resolve "Create pattern matching function", !27 Resolve "Add unit tests for cleanHtml.R"
Showing with 300 additions and 45 deletions
#!/usr/bin/env Rscript
### Provides functionality to use NER, POS and Dependency Grammars
## Author: David
cat("Initializing spacy backend...\n")
# It's important to do this prior to loading any python related stuff
reticulate::use_condaenv("spcy", required = TRUE)
# Load libraries
library(cleanNLP)
# Init nlp models
cnlp_init_spacy(entity_flag = TRUE)
cat("Done.\n")
@@ -3,10 +3,13 @@
 ### This script consolidates everything
 library(pbapply)
+library(rvest)
 library(wikiproc)
 library(rprojroot)

+## Set up nlp
+init_nlp("conda", "spcy")

 ## Fetch data
 cat("Starting data import...\n")
@@ -15,7 +18,7 @@ cat("Starting data import...\n")
 project_root <- find_root(has_file("README.md"))
 data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
-articles <- wikiproc::getData(use.cache = TRUE, data.dir = data_dir)
+articles <- get_data(use.cache = TRUE, data.dir = data_dir)

 ## Data processing
@@ -31,11 +34,11 @@ results <- pbapply(articles, 1, function(article) {
 ## Data preprocessing/annotating
-# annotation <- createAnnotations(cleaned.text, article[2], article[3])
+annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)

 ## Extract information from Text
-no.spouses <- wikiproc::getNoOfSpouses(article[4])
+no.spouses <- get_no_of_spouses(article[4])

 ## Create Results
...
 # Generated by roxygen2: do not edit by hand
-export(cleanHtml)
-export(createAnnotations)
-export(getBirthdate)
-export(getBirthplace)
-export(getData)
-export(getNoOfSpouses)
+export(create_annotations)
+export(get_data)
+export(get_no_of_spouses)
+export(init_nlp)
+import(rvest)
+importFrom(data.table,"%like%")
+importFrom(xml2,read_html)
+importFrom(xml2,xml_add_sibling)
+importFrom(xml2,xml_find_all)
+importFrom(xml2,xml_remove)
@@ -2,11 +2,6 @@
 # Author: Lucas

-library(rvest)
-library(stringi)
-library(textclean)
-library(xml2)

 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
...
@@ -2,10 +2,6 @@
 # Author: David

-library(WikipediR) # For querying wikipedia
-library(rvest) # For getting the list of physicists
-library(xml2)

 ## Though we could get the pages within the category 'physicists' with something like this
 ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
 ## this gives us only about 50 pages.
@@ -19,7 +15,8 @@ library(xml2)
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+#' @export
+get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
 dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
...
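
For orientation, a hypothetical call mirroring the processing script; the cache flags are illustrative, and the return columns are the ones documented above:

articles <- get_data(use.cache = TRUE, write.cache = TRUE, data.dir = "data")
str(articles)  # title, id, revision id and html-formatted full text per article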
### get_no_of_spouses.R
### This extracts the number of spouses from the infobox.
### If no infobox or no information about spouses is found, it assumes there are none.
### Not for use in production, this does not actually get information from text.
# Author: David
#' Reads the number of spouses from the infobox of a Wikipedia article
#'
#' @param article Wikipedia article in html format
#'
#' @return Integer indicating the number of spouses
#' @export
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' no.spouses <- get_no_of_spouses(articles$Text[54])
#'
#' no.spouses
#' }
get_no_of_spouses <- function(article) {
  # If there is no infobox we assume there were no spouses
  if (!grepl("vcard", article)) {
    return(0)
  }
  infoBox <- get_infobox(article)
  # Get the spouse field
  spouses <- infoBox[infoBox$Desc %like% "Spouse", ]$Content
  # Remove everything in parentheses
  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
  # Split the strings by newlines to get one spouse per line
  spouses <- strsplit(spouses, "\n")
  spouses <- unlist(spouses)
  if (length(spouses) > 0) {
    return(length(spouses))
  }
  return(0)
}
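
To make the counting concrete, here is a small illustration of the parenthesis-stripping and splitting steps on a made-up infobox value (the names are hypothetical, not taken from the data):

spouses <- "Jane Doe\n(m. 1903; div. 1919)\nErika Mustermann\n(m. 1920)"
spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)  # drops the "(m. ...)" annotations
unlist(strsplit(spouses, "\n"))                    # c("Jane Doe", "Erika Mustermann"), so 2 spouses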
### File used to automatically create package imports with roxygen2
### Note that it is discouraged to import many packages fully, to avoid name conflicts
### If possible, reference functions directly, e.g. reshape2::melt()
### There is a (very) minor performance penalty for ::;
### if some functions are used frequently you may just import them
### with something like @importFrom reshape2 melt cast
#' @import rvest
#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
#' @importFrom data.table %like%
NULL
\ No newline at end of file
#' Initialize the nlp backend
#'
#' A wrapper used to set the python environment and call cnlp_init
#'
#' @param type Type of python env to use, either "conda" or "python"
#' @param value Connection string: if using a conda environment, the name of it;
#'   if using python directly, the path to the python executable
#'
#' @return Does not return data
#' @export
#'
#' @examples
#' \dontrun{
#' init_nlp("conda", "spcy")
#' }
init_nlp <- function(type, value) {
  if (type == "conda") {
    reticulate::use_condaenv(value, required = TRUE)
  } else if (type == "python") {
    reticulate::use_python(value, required = TRUE)
  }
  cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
}
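
As the processing script notes, this must run before anything else initializes python. A hypothetical session start, using the "spcy" environment name from this repository:

library(wikiproc)
init_nlp("conda", "spcy")  # bind reticulate to the conda env, then init the spacy backend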
#' Create annotations for the given text
#'
#' @param text Text to annotate
#' @param article.id ArticleID used for caching
#' @param article.rev.id ArticleRevisionID used for caching
#' @param use.cache Should cached data be used
#' @param write.cache Should the generated annotations be cached
#' @param data.dir Directory the data should be read from and/or written to
#'
#' @return Annotation object for use with cleanNLP methods
#' @export
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
  # Generate filename; for some reason paste0 will pad the article id with leading whitespace,
  # so we strip it again
  filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
  # Check if there is a cached version of the annotations for this article in this specific revision
  if (use.cache & file.exists(filename)) {
    res <- tryCatch({
      readRDS(filename)
    }, error = function(e) {
      # Return NULL so we fall through and redo the annotation
      cat("Cached data seems to be corrupted, redoing annotation.\n")
      NULL
    })
    if (!is.null(res)) {
      return(res)
    }
  }
  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
  # Write cache if desired, creating the cache directories under data.dir if necessary
  if (write.cache) {
    annotations.dir <- paste(data.dir, "annotations", sep = .Platform$file.sep)
    if (!dir.exists(data.dir)) {
      dir.create(data.dir)
    }
    if (!dir.exists(annotations.dir)) {
      dir.create(annotations.dir)
    }
    saveRDS(annotation, filename)
  }
  # Return data
  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
  # But then again, it suggests snake case for variables...
  return(annotation)
}
\ No newline at end of file
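
A sketch of a typical call, assuming init_nlp() has already been run; the ids are made up:

ann <- create_annotations("Some article text.", article.id = 42,
                          article.rev.id = 123456, write.cache = TRUE)
# A second call with the same ids would be served from data/annotations/42-123456.RDS
tokens <- cleanNLP::cnlp_get_token(ann)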
### Utility functions used internally
#' Extract the infobox contents from wikipedia articles
#'
#' @param article Character vector containing the contents of a Wikipedia
#'   article as html
#'
#' @return Data frame holding the contents of the table
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4,]
#' }
get_infobox <- function(article) {
  # Read page as html
  page <- read_html(article)
  # Extracting text from the html will erase all <br> tags,
  # this will replace them with line breaks
  xml_find_all(page, ".//br") %>%
    xml_add_sibling("p", "\n")
  xml_find_all(page, ".//br") %>%
    xml_remove()
  # Get the info box
  # Will throw an error if there isn't any, so that should be checked beforehand
  table <- page %>%
    html_nodes("table.vcard") %>%
    html_table(fill = TRUE) %>%
    .[[1]]
  colnames(table) <- c("Desc", "Content")
  return(table)
}
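
To see why the <br> handling matters, a minimal hypothetical vcard table; without the inserted line breaks, html_table() would run multi-value cells together:

library(xml2)
library(rvest)  # for html_nodes/html_table and %>%

html <- "<table class='vcard'><tr><td>Spouse</td><td>Jane Doe<br>Erika Mustermann</td></tr></table>"
page <- read_html(html)
xml_find_all(page, ".//br") %>% xml_add_sibling("p", "\n")
xml_find_all(page, ".//br") %>% xml_remove()
html_table(html_nodes(page, "table.vcard"), fill = TRUE)[[1]]
# The spouse cell now holds "Jane Doe\nErika Mustermann" instead of "Jane DoeErika Mustermann"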
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nlp_annotate.R
\name{create_annotations}
\alias{create_annotations}
\title{Create annotations for the given text}
\usage{
create_annotations(text, article.id, article.rev.id, use.cache = TRUE,
write.cache = FALSE, data.dir = "data")
}
\arguments{
\item{text}{Text to annotate}
\item{article.id}{ArticleID used for caching}
\item{article.rev.id}{ArticleRevisionID used for caching}
\item{use.cache}{Should cached data be used}
\item{write.cache}{Should the generated annotations be cached}
\item{data.dir}{Directory the data should be read from and/or written to}
}
\value{
Annotation object for use with cleanNLP methods
}
\description{
Create annotations for the given text
}
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/GetData.R
+% Please edit documentation in R/get_data.R
-\name{getData}
+\name{get_data}
-\alias{getData}
+\alias{get_data}
 \title{Retrieve wikipedia articles about physicists}
 \usage{
-getData(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
+get_data(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
 }
 \arguments{
 \item{use.cache}{Use cached data if it exists over downloading new data}
...
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{get_infobox}
\alias{get_infobox}
\title{Extract the infobox contents from wikipedia articles}
\usage{
get_infobox(article)
}
\arguments{
\item{article}{Character vector containing the contents of a Wikipedia
article as html}
}
\value{
Data frame holding the contents of the table
}
\description{
Extract the infobox contents from wikipedia articles
}
\examples{
\dontrun{
articles <- get_data()
infobox <- get_infobox(articles$Text[54])
infobox[3:4,]
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_no_of_spouses.R
\name{get_no_of_spouses}
\alias{get_no_of_spouses}
\title{Reads the number of spouses from the infobox of a Wikipedia article}
\usage{
get_no_of_spouses(article)
}
\arguments{
\item{article}{Wikipedia article in html format}
}
\value{
Integer indicating the number of spouses
}
\description{
Reads the number of spouses from the infobox of a Wikipedia article
}
\examples{
\dontrun{
articles <- get_data()
no.spouses <- get_no_of_spouses(articles$Text[54])
no.spouses
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nlp_annotate.R
\name{init_nlp}
\alias{init_nlp}
\title{Initialize the nlp backend}
\usage{
init_nlp(type, value)
}
\arguments{
\item{type}{Type of python env to use, either "conda" or "python"}
\item{value}{Connection string: if using a conda environment, the name of it;
if using python directly, the path to the python executable}
}
\value{
Does not return data
}
\description{
A wrapper used to set the python environment and call cnlp_init
}
\examples{
\dontrun{
init_nlp("conda", "spcy")
}
}