Minor tweeks

* Fixed a few function calls * Deleted unneeded r script * Deleted unneeded function declarations * Some cleanup

Minor tweeks
f3165856 · David Fuhry · 83529e59 · f3165856 · f3165856 · 83529e59
Commit f3165856 authored 6 years ago by David Fuhry
--- a/processing/script/master.R
+++ b/processing/script/master.R
@@ -30,7 +30,7 @@ results <- pbapply(articles, 1, function(article) {
  ## Data cleaning
-  cleaned.text <- wikiproc::cleanHtml(article[4])
+  cleaned.text <- wikiproc::clean_html(article[4])
  ## Data preprocessing/annotating

--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -7,9 +7,4 @@ export(get_birthplace)
 export(get_data)
 export(get_no_of_spouses)
 export(init_nlp)
-import(rvest)
 importFrom(data.table,"%like%")
-importFrom(xml2,read_html)
-importFrom(xml2,xml_add_sibling)
-importFrom(xml2,xml_find_all)
-importFrom(xml2,xml_remove)
--- a/processing/wikiproc/R/create_annotations.R
+++ b/processing/wikiproc/R/create_annotations.R
-library(cleanNLP)
-#' @export
-create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
-  # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces
-  # To prevent this we stip 'em again
-  filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
-  # Check if there is a cached version of the annotations for this article in this specific revision
-  if(use.cache & file.exists(filename)) {
-    res <- tryCatch({
-      data <- readRDS(filename)
-      data
-    }, error = function (e) {
-      cat("Cached data seems to be corrupted, redoing annotation.\n")
-    })
-    return(res)
-  }
-  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
-  # Write cache if desired
-  if(write.cache) {
-    if (!dir.exists("data")) {
-      dir.create("data")
-    }
-    if (!dir.exists("data/annotations")) {
-      dir.create("data/annotations")
-    }
-    saveRDS(annotation, filename)
-  }
-  # Return data
-  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
-  # But then again, it suggests snake case for variables...
-  return(annotation)
-}
--- a/processing/wikiproc/R/get_birthdate.R
+++ b/processing/wikiproc/R/get_birthdate.R
@@ -21,7 +21,7 @@ get_birthdate <- function(article) {
  if(grepl("vcard", article)) {
    # Check if there is an infobox
-    infoBox <- getInfoBox(article)
+    infoBox <- get_infobox(article)
    # Get the Born field
    birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
@@ -53,33 +53,6 @@ get_birthdate <- function(article) {
  }
 }
-### Converts info box to table
-getInfoBox <- function(article) {
-  # Read page as html
-  page <- xml2::read_html(article)
-  # Extracting text from the html will erase all <br> tags,
-  # This will replace them with line breaks
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_add_sibling("p", "\n")
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_remove()
-  # Get the info box
-  # Will throw an error if there isnt any, so that should be checked beforehand
-  table <- page %>%
-    rvest::html_nodes("table.vcard") %>%
-    rvest::html_table(fill = TRUE) %>%
-    .[[1]]
-  colnames(table) <- c("Desc", "Content")
-  return(table)
-}
 #' Get Introduction Text from Wikipedia page that contains birthdate
 #'
 #' @param article article in HTML-format

--- a/processing/wikiproc/R/get_birthplace.R
+++ b/processing/wikiproc/R/get_birthplace.R
@@ -24,7 +24,7 @@ get_birthplace <- function(article) {
  }
  # Use infobox to get Birthplace
-  infoBox <- getInfoBox(article)
+  infoBox <- get_infobox(article)
  # Get 'Born' field
  birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
@@ -36,30 +36,3 @@ get_birthplace <- function(article) {
  # return birthplace
  return(birthplace)
 }
-### Converts info box to table
-getInfoBox <- function(article) {
-  # Read page as html
-  page <- xml2::read_html(article)
-  # Extracting text from the html will erase all <br> tags,
-  # this will replace them with line breaks
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_add_sibling("p", "\n")
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_remove()
-  # Get the info box
-  # Will throw an error if there isnt any, so that should be checked beforehand
-  table <- page %>%
-    rvest::html_nodes("table.vcard") %>%
-    rvest::tml_table(fill = TRUE) %>%
-    .[[1]]
-  colnames(table) <- c("Desc", "Content")
-  return(table)
-}
--- a/processing/wikiproc/R/get_data.R
+++ b/processing/wikiproc/R/get_data.R
@@ -10,7 +10,6 @@
 #' Retrieve wikipedia articles about physicists
 #'
-#' @export
 #' @param use.cache Use cached data if it exists over downloading new data
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
@@ -38,7 +37,7 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
  cat("Downloading list from wikipedia... ")
-  page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists")
+  page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
  cat("Done.\n")
@@ -47,8 +46,8 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
  cat("Processing data:\n")
  physicists <- page %>%
-    rvest::html_nodes(".mw-parser-output li a") %>%
+    html_nodes(".mw-parser-output li a") %>%
-    rvest::html_attr("href")
+    html_attr("href")
  # Clean the list
@@ -85,9 +84,9 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
      # Check if the article is a redirect page
      if (grepl(".redirectText", article$parse$text$`*`)) {
        # Get the real article name
-        pname <- xml2::read_html(article$parse$text$`*`) %>%
+        pname <- read_html(article$parse$text$`*`) %>%
-          rvest::html_nodes(".redirectText a") %>%
+          html_nodes(".redirectText a") %>%
-          rvest::html_attr("href")
+          html_attr("href")
        panme <- gsub("_", " ", pname)

--- a/processing/wikiproc/R/import_packages.R
+++ b/processing/wikiproc/R/import_packages.R
@@ -5,7 +5,5 @@
 ### if some functions are used frequently you may just import them
 ### with something like @importFrom reshape2 melt cast
-#' @import rvest
-#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
 #' @importFrom data.table %like%
 NULL
\ No newline at end of file
--- a/processing/wikiproc/R/utils.R
+++ b/processing/wikiproc/R/utils.R
@@ -18,16 +18,16 @@
 #' }
 get_infobox <- function(article) {
  # Read page as html
-  page <- read_html(article)
+  page <- xml2::read_html(article)
  # Extracting text from the html will erase all <br> tags,
  # this will replace them with line breaks
-  xml_find_all(page, ".//br") %>%
+  xml2::xml_find_all(page, ".//br") %>%
-    xml_add_sibling("p", "\n")
+    xml2::xml_add_sibling("p", "\n")
-  xml_find_all(page, ".//br") %>%
+  xml2::xml_find_all(page, ".//br") %>%
-    xml_remove()
+    xml2::xml_remove()
  # Get the info box
  # Will throw an error if there isnt any, so that should be checked beforehand