From 815047b47e401546d03ab2c0157e6a3f3f5860c4 Mon Sep 17 00:00:00 2001
From: Lulu Roth <ls80zyse@studserv.uni-leipzig.de>
Date: Fri, 11 Jan 2019 01:09:40 +0100
Subject: [PATCH] Refactor text processing

* Create R package containing function definitions
* Create directory processing containing scripts
* fix some typos
* general cleanup
---
 {R => processing/bin}/Master.R                | 27 +++----------------
 processing/bin/ProcessNER.R                   | 21 +++++++++++++++
 packages.list => processing/packages.list     |  0
 processing/wikiproc/.Rbuildignore             |  2 ++
 processing/wikiproc/DESCRIPTION               | 12 +++++++++
 processing/wikiproc/NAMESPACE                 |  2 ++
 {R => processing/wikiproc/R}/CleanHtml.R      |  0
 {R => processing/wikiproc/R}/GetBirthdate.R   |  0
 {R => processing/wikiproc/R}/GetBirthplace.R  |  0
 {R => processing/wikiproc/R}/GetData.R        | 19 ++++++++-----
 {R => processing/wikiproc/R}/GetNoOfSpouses.R |  0
 .../wikiproc/R/createAnnotations.R            | 22 +--------------
 processing/wikiproc/man/cleanHtml.Rd          | 21 +++++++++++++++
 processing/wikiproc/man/getBirthdate.Rd       | 25 +++++++++++++++++
 processing/wikiproc/man/getBirthplace.Rd      | 23 ++++++++++++++++
 processing/wikiproc/man/getData.Rd            | 19 +++++++++++++
 processing/wikiproc/man/getIntroduction.Rd    | 17 ++++++++++++
 processing/wikiproc/tests/testthat.R          |  5 ++++
 .../wikiproc/tests/testthat/test-cleanhtml.R  |  5 ++++
 tests/testthat/testCleanHtml.R                |  1 -
 20 files changed, 169 insertions(+), 52 deletions(-)
 rename {R => processing/bin}/Master.R (67%)
 create mode 100644 processing/bin/ProcessNER.R
 rename packages.list => processing/packages.list (100%)
 create mode 100644 processing/wikiproc/.Rbuildignore
 create mode 100644 processing/wikiproc/DESCRIPTION
 create mode 100644 processing/wikiproc/NAMESPACE
 rename {R => processing/wikiproc/R}/CleanHtml.R (100%)
 rename {R => processing/wikiproc/R}/GetBirthdate.R (100%)
 rename {R => processing/wikiproc/R}/GetBirthplace.R (100%)
 rename {R => processing/wikiproc/R}/GetData.R (87%)
 rename {R => processing/wikiproc/R}/GetNoOfSpouses.R (100%)
 rename R/ProcessNER.R => processing/wikiproc/R/createAnnotations.R (77%)
 create mode 100644 processing/wikiproc/man/cleanHtml.Rd
 create mode 100644 processing/wikiproc/man/getBirthdate.Rd
 create mode 100644 processing/wikiproc/man/getBirthplace.Rd
 create mode 100644 processing/wikiproc/man/getData.Rd
 create mode 100644 processing/wikiproc/man/getIntroduction.Rd
 create mode 100644 processing/wikiproc/tests/testthat.R
 create mode 100644 processing/wikiproc/tests/testthat/test-cleanhtml.R
 delete mode 100644 tests/testthat/testCleanHtml.R

diff --git a/R/Master.R b/processing/bin/Master.R
similarity index 67%
rename from R/Master.R
rename to processing/bin/Master.R
index fe4ae64..53de2c0 100755
--- a/R/Master.R
+++ b/processing/bin/Master.R
@@ -1,31 +1,16 @@
 #!/usr/bin/env Rscript
 
-
 ### This script consolidates everything
 
-## Librarys
-
 library(pbapply)
-
-#library(SomeLibrary)
-
-## Load Scripts
-
-cat("Sourcing R scripts... ")
-
-source("r/GetData.R")
-source("r/GetNoOfSpouses.R")
-source("r/CleanHtml.R")
-source("r/ProcessNER.R")
-#source("r/getSomethingElse.R")
-
-cat("Done.\n")
+library(rvest)
+library(wikiproc)
 
 ## Fetch data
 
 cat("Starting data import...\n")
 
-articles <- getData(use.cache = TRUE)
+articles <- getData(use.cache = FALSE, write.cache = TRUE, data.dir = "../../data/")
 
 ## Data processing
 
@@ -41,16 +26,12 @@ results <- pbapply(articles, 1, function(article) {
   
   ## Data preprocessing/annotating
   
-  annotation <- createAnnotations(cleaned.text, article[2], article[3])
+  # annotation <- createAnnotations(cleaned.text, article[2], article[3])
   
   ## Extract information from Text
   
   no.spouses <- getNoOfSpouses(article[4])
   
-  # someFact <- getFactFromTextFunctioN(annotated.text)
-  
-  # someOtherFact <- getOtherFactFromText(data$Text)
-  
   ## Create Results
   
   data.frame(Name = article[1],
diff --git a/processing/bin/ProcessNER.R b/processing/bin/ProcessNER.R
new file mode 100644
index 0000000..775ffb1
--- /dev/null
+++ b/processing/bin/ProcessNER.R
@@ -0,0 +1,21 @@
+#!/usr/bin/env Rscript
+
+### Provides functionality to use NER, POS and Dependency Grammars
+
+## Author: David
+
+cat("Initializing spacy backend...\n")
+
+# It's important to do this prior to loading any python related stuff
+
+reticulate::use_condaenv("spcy", required = TRUE)
+
+# Load libraries
+
+library(cleanNLP)
+
+# Init nlp models
+
+cnlp_init_spacy(entity_flag = TRUE)
+
+cat("Done.\n")
diff --git a/packages.list b/processing/packages.list
similarity index 100%
rename from packages.list
rename to processing/packages.list
diff --git a/processing/wikiproc/.Rbuildignore b/processing/wikiproc/.Rbuildignore
new file mode 100644
index 0000000..3b7c4e3
--- /dev/null
+++ b/processing/wikiproc/.Rbuildignore
@@ -0,0 +1,2 @@
+^wikiproc\.Rproj$
+^\.Rproj\.user$
diff --git a/processing/wikiproc/DESCRIPTION b/processing/wikiproc/DESCRIPTION
new file mode 100644
index 0000000..52400db
--- /dev/null
+++ b/processing/wikiproc/DESCRIPTION
@@ -0,0 +1,12 @@
+Package: wikiproc
+Title: Process Wikipedia Articles About Physicists
+Version: 0.0.0.9000
+Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre"))
+Description: Retrieves Wikipedia articles about physicists, cleans their HTML and extracts facts such as birthdate and birthplace.
+Depends: R (>= 3.5.2)
+License: What license is it under?
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.1.1
+Suggests: 
+    testthat
diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
new file mode 100644
index 0000000..6ae9268
--- /dev/null
+++ b/processing/wikiproc/NAMESPACE
@@ -0,0 +1,2 @@
+# Generated by roxygen2: do not edit by hand
+
diff --git a/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R
similarity index 100%
rename from R/CleanHtml.R
rename to processing/wikiproc/R/CleanHtml.R
diff --git a/R/GetBirthdate.R b/processing/wikiproc/R/GetBirthdate.R
similarity index 100%
rename from R/GetBirthdate.R
rename to processing/wikiproc/R/GetBirthdate.R
diff --git a/R/GetBirthplace.R b/processing/wikiproc/R/GetBirthplace.R
similarity index 100%
rename from R/GetBirthplace.R
rename to processing/wikiproc/R/GetBirthplace.R
diff --git a/R/GetData.R b/processing/wikiproc/R/GetData.R
similarity index 87%
rename from R/GetData.R
rename to processing/wikiproc/R/GetData.R
index d48a0c6..c529fbe 100644
--- a/R/GetData.R
+++ b/processing/wikiproc/R/GetData.R
@@ -4,6 +4,7 @@
 
 library(WikipediR) # For querying wikipedia
 library(rvest) # For getting the list of physicits
+library(xml2)
 
 ## Though we could get the pages within the category 'physicists' with something like this
 ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
@@ -15,12 +16,16 @@ library(rvest) # For getting the list of physicits
 #' 
 #' @param use.cache Use cached data if it exists over downloading new data
 #' @param write.cache Write downloaded results into cache for use on future calls
+#' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE) {
+getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
+  
+  dest.articlesRDS <- paste0(data.dir, "articles.RDS") # paste0: no separator, so data.dir must end in "/"
+  dest.articlesCSV <- paste0(data.dir, "articles.csv") # paste() would give "data/ articles.csv " (sep = " ")
   ### First we check if the data already exists and try to load it if it does
-  if(file.exists("data/articles.RDS") & use.cache ) {
+  if(file.exists(dest.articlesRDS) && use.cache ) {
     res <- tryCatch({
-      data <- readRDS("data/articles.RDS")
+      data <- readRDS(dest.articlesRDS)
       cat("Found chached data to use, import finished.\n")
       data
     }, error = function (e) {
@@ -118,12 +123,12 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
   # Write result if desired
   
   if(write.cache) {
-    if (!dir.exists("data")) {
-      dir.create("data")
+    if (!dir.exists(data.dir)) {
+      dir.create(data.dir)
     }
     cat("Writing data to files... ")
-    write.table(articles, "data/articles.csv")
-    saveRDS(articles, "data/articles.RDS")
+    write.table(articles, dest.articlesCSV)
+    saveRDS(articles, dest.articlesRDS)
     cat("Done.\n")
   }
   
diff --git a/R/GetNoOfSpouses.R b/processing/wikiproc/R/GetNoOfSpouses.R
similarity index 100%
rename from R/GetNoOfSpouses.R
rename to processing/wikiproc/R/GetNoOfSpouses.R
diff --git a/R/ProcessNER.R b/processing/wikiproc/R/createAnnotations.R
similarity index 77%
rename from R/ProcessNER.R
rename to processing/wikiproc/R/createAnnotations.R
index 661d4a9..b9ca6eb 100644
--- a/R/ProcessNER.R
+++ b/processing/wikiproc/R/createAnnotations.R
@@ -1,32 +1,12 @@
-#!/usr/bin/env Rscript
-
-### Provides functionality to use NER, POS and Dependency Grammers
-
-## Author: David
-
-cat("Initializing spacy backend...\n")
-
-# It's important to do this prior to loading any python related stuff
-
-reticulate::use_condaenv("spcy", required = TRUE)
-
-# Load librarys
-
 library(cleanNLP)
 
-# Init nlp models
-
-cnlp_init_spacy(entity_flag = TRUE)
-
-cat("Done.\n")
-
 createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
   
   # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces
   # To prevent this we stip 'em again
   
   filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
-
+  
   # Check if there is a cached version of the annotations for this article in this specific revision
   
   if(use.cache & file.exists(filename)) {
diff --git a/processing/wikiproc/man/cleanHtml.Rd b/processing/wikiproc/man/cleanHtml.Rd
new file mode 100644
index 0000000..56994f4
--- /dev/null
+++ b/processing/wikiproc/man/cleanHtml.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/CleanHtml.R
+\name{cleanHtml}
+\alias{cleanHtml}
+\title{Clean a html formatted wikipedia page. 
+Nodes of interest from the DOM are extracted and then cleaned from all html
+tags and annotations.}
+\usage{
+cleanHtml(html)
+}
+\arguments{
+\item{html}{Url linking to a wikipedia webpage or a html formatted document.}
+}
+\value{
+Plaintext document containing only the maintext of the give wikipedia page.
+}
+\description{
+Clean a html formatted wikipedia page. 
+Nodes of interest from the DOM are extracted and then cleaned from all html
+tags and annotations.
+}
diff --git a/processing/wikiproc/man/getBirthdate.Rd b/processing/wikiproc/man/getBirthdate.Rd
new file mode 100644
index 0000000..a614ade
--- /dev/null
+++ b/processing/wikiproc/man/getBirthdate.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/GetBirthdate.R
+\name{getBirthdate}
+\alias{getBirthdate}
+\title{Extract birthdate from infobox
+Will try to get infobox as table and extract birthdate
+from 'Born'-entry
+If there is no infobox, first paragraph of the article
+will be checked for birthdate}
+\usage{
+getBirthdate(article)
+}
+\arguments{
+\item{article}{Article in HTML-format}
+}
+\value{
+String birthdate as string|NULL
+}
+\description{
+Extract birthdate from infobox
+Will try to get infobox as table and extract birthdate
+from 'Born'-entry
+If there is no infobox, first paragraph of the article
+will be checked for birthdate
+}
diff --git a/processing/wikiproc/man/getBirthplace.Rd b/processing/wikiproc/man/getBirthplace.Rd
new file mode 100644
index 0000000..8dd69d7
--- /dev/null
+++ b/processing/wikiproc/man/getBirthplace.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/GetBirthplace.R
+\name{getBirthplace}
+\alias{getBirthplace}
+\title{This script extracts Birthplace from physicist texts
+Try to get the infobox and extract the birthplace
+If there is no infobox, 0 will be returned as 
+birthplace is hard to extract from text}
+\usage{
+getBirthplace(article)
+}
+\arguments{
+\item{article}{Article in HTML-format}
+}
+\value{
+String with birthplace of the physicist|0
+}
+\description{
+This script extracts Birthplace from physicist texts
+Try to get the infobox and extract the birthplace
+If there is no infobox, 0 will be returned as 
+birthplace is hard to extract from text
+}
diff --git a/processing/wikiproc/man/getData.Rd b/processing/wikiproc/man/getData.Rd
new file mode 100644
index 0000000..13e362d
--- /dev/null
+++ b/processing/wikiproc/man/getData.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/GetData.R
+\name{getData}
+\alias{getData}
+\title{Retrieve wikipedia articles about physicists}
+\usage{
+getData(use.cache = TRUE, write.cache = FALSE)
+}
+\arguments{
+\item{use.cache}{Use cached data if it exists over downloading new data}
+
+\item{write.cache}{Write downloaded results into cache for use on future calls}
+}
+\value{
+data.frame containing the title, id, revisionID and html-formatted full text
+}
+\description{
+Retrieve wikipedia articles about physicists
+}
diff --git a/processing/wikiproc/man/getIntroduction.Rd b/processing/wikiproc/man/getIntroduction.Rd
new file mode 100644
index 0000000..3dfe196
--- /dev/null
+++ b/processing/wikiproc/man/getIntroduction.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/GetBirthdate.R
+\name{getIntroduction}
+\alias{getIntroduction}
+\title{Get Introduction Text from Wikipedia page that contains birthdate}
+\usage{
+getIntroduction(article)
+}
+\arguments{
+\item{article}{article in HTML-format}
+}
+\value{
+string introduction text from wikipedia article
+}
+\description{
+Get Introduction Text from Wikipedia page that contains birthdate
+}
diff --git a/processing/wikiproc/tests/testthat.R b/processing/wikiproc/tests/testthat.R
new file mode 100644
index 0000000..3425482
--- /dev/null
+++ b/processing/wikiproc/tests/testthat.R
@@ -0,0 +1,5 @@
+library(testthat)
+library(wikiproc)
+
+test_check("wikiproc")
+
diff --git a/processing/wikiproc/tests/testthat/test-cleanhtml.R b/processing/wikiproc/tests/testthat/test-cleanhtml.R
new file mode 100644
index 0000000..9390f44
--- /dev/null
+++ b/processing/wikiproc/tests/testthat/test-cleanhtml.R
@@ -0,0 +1,5 @@
+context("test-cleanhtml")
+
+test_that("multiplication works", {
+  expect_equal(2 * 2, 4)
+})
diff --git a/tests/testthat/testCleanHtml.R b/tests/testthat/testCleanHtml.R
deleted file mode 100644
index 1342ffa..0000000
--- a/tests/testthat/testCleanHtml.R
+++ /dev/null
@@ -1 +0,0 @@
-# Test cleanHtml function.
\ No newline at end of file
-- 
GitLab