Commit c886643b authored by Lucas Schons

Enhance R module

* resolve paths from project root
* modify DESCRIPTION
parent 815047b4
3 merge requests: !34 Resolve "Add unit tests for clean_html.R", !28 WIP: Resolve "Create pattern matching function", !27 Resolve "Add unit tests for cleanHtml.R"
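Note: the central change in this commit replaces relative "../../data/" paths with paths resolved from the project root via rprojroot. A minimal, standalone sketch of that approach (assuming, as in this repository, that the root is marked by a README.md):

library(rprojroot)

# Locate the repository root by looking for a marker file (here: README.md).
project_root <- find_root(has_file("README.md"))

# Build the data directory path relative to that root; file.path(project_root, "data")
# would be an equivalent, slightly more idiomatic alternative to paste().
data_dir <- paste(project_root, "data", sep = .Platform$file.sep)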
@@ -5,12 +5,17 @@
 library(pbapply)
 library(rvest)
 library(wikiproc)
+library(rprojroot)
 ## Fetch data
 cat("Starting data import...\n")
-articles <- getData(use.cache = FALSE, write.cache = TRUE, data.dir = "../../data/")
+# Define paths
+project_root <- find_root(has_file("README.md"))
+data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
+articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)
 ## Data processing
@@ -19,25 +24,25 @@ cat("Processing data:\n")
 results <- pbapply(articles, 1, function(article) {
   # Within this function article is a vector representing a single row of our original data frame
   # This means article[1] represents the Title, article[2] the PageID etc.
   ## Data cleaning
-  cleaned.text <- cleanHtml(article[4])
+  cleaned.text <- wikiproc:::cleanHtml(article[4])
   ## Data preprocessing/annotating
   # annotation <- createAnnotations(cleaned.text, article[2], article[3])
   ## Extract information from Text
-  no.spouses <- getNoOfSpouses(article[4])
+  no.spouses <- wikiproc:::getNoOfSpouses(article[4])
   ## Create Results
   data.frame(Name = article[1],
              NoSpouses = no.spouses,
              stringsAsFactors = FALSE)
 })
 results <- do.call(rbind, results)
@@ -46,7 +51,7 @@ cat("Data processing finished.\n")
 ## Results are now in results
 ## Format for rasa
 cat("Writing rasa files to 'rasa/'...\n")
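For reference, the per-article processing above follows a row-wise apply-then-rbind pattern. A self-contained toy example of that pattern (the input data frame below is made up; the real script uses the downloaded articles):

library(pbapply)

# Toy input standing in for the articles data frame.
articles <- data.frame(Title = c("Marie Curie", "Niels Bohr"),
                       Text  = c("some html", "more html"),
                       stringsAsFactors = FALSE)

# Each call sees one row as a character vector and returns a one-row data.frame.
results <- pbapply(articles, 1, function(article) {
  data.frame(Name = article[1],
             TextLength = nchar(article[2]),
             stringsAsFactors = FALSE)
})

# Combine the list of one-row data.frames into a single result data frame.
results <- do.call(rbind, results)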
@@ -7,4 +7,5 @@ data.table
 xml2
 WikipediR
 reticulate
-cleanNLP
\ No newline at end of file
+cleanNLP
+rprojroot
 Package: wikiproc
 Title: What the Package Does (one line, title case)
 Version: 0.0.0.9000
-Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre"))
+Authors@R: c(
+    person("David", "Fuhry"),
+    person("Lukas", "Gehrke"),
+    person("Lucas", "Schons")
+    )
 Description: What the package does (one paragraph).
 Depends: R (>= 3.5.2)
-License: What license is it under?
+License: GPL-2
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 6.1.1
+Imports:
+    pbapply,
+    rvest,
+    stringi,
+    textclean,
+    stringr,
+    data.table,
+    xml2,
+    WikipediR,
+    reticulate,
+    cleanNLP,
 Suggests:
     testthat
@@ -6,15 +6,14 @@ library(rvest)
 library(stringi)
 library(textclean)
 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
 #'
 #' @param html Url linking to a wikipedia webpage or a html formatted document.
 #' @return Plaintext document containing only the main text of the given wikipedia page.
 cleanHtml <- function(html) {
   # 1. read data from url or html-formatted text
   # 2. extract nodes containing main information (ignore infoboxes, list of literature, ...)
   # 3. collapse vectors into a single one
@@ -33,4 +32,4 @@ cleanHtml <- function(html) {
     gsub(" *([.!?:,'’])", "\\1", .) %>%
     gsub("\n *\n+", "\n", .) %>%
     sub(" ", "", .)
-}
\ No newline at end of file
+}
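A short usage sketch for cleanHtml, based on its documentation above (the article URL is only an example; any Wikipedia page URL or html-formatted string should work):

# cleanHtml accepts either a URL or an html-formatted document and returns
# the article's plain text with tags and annotations stripped.
text <- wikiproc:::cleanHtml("https://en.wikipedia.org/wiki/Niels_Bohr")
cat(substr(text, 1, 300))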
@@ -13,15 +13,15 @@ library(xml2)
 ## which gives us something short of a thousand articles
 #' Retrieve wikipedia articles about physicists
 #'
 #' @param use.cache Use cached data if it exists over downloading new data
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
-  dest.articlesRDS <- paste(data.dir, "articles.RDS", "")
-  dest.articlesCSV <- paste(data.dir, "articles.csv", "")
+getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+  dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
+  dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
   ### First we check if the data already exists and try to load it if it does
   if(file.exists(dest.articlesRDS) && use.cache ) {
     res <- tryCatch({
@@ -33,54 +33,54 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
     })
     return(res)
   }
   ### Get the list of names
   # Download page
   cat("Downloading list from wikipedia... ")
   page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
   cat("Done.\n")
   # Extract links as the names given here are not the article names in about 20 cases
   cat("Processing data:\n")
   physicists <- page %>%
     html_nodes(".mw-parser-output li a") %>%
     html_attr("href")
   # Clean the list
   physicists <- physicists[nchar(physicists) > 5]
   length(physicists) <- length(physicists) - 3
   physicists <- gsub("_", " ", physicists)
   physicists <- gsub("/wiki/", "", physicists)
   physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
   # This is probably only needed on windows (and perhaps os x) as R on windows messes quite a bit with the encoding
   # On linux `physicists <- URLdecode(physicists)` should do the trick
   physicists <- sapply(physicists, function(x) {
     tmp <- URLdecode(x)
     Encoding(tmp) <- "UTF-8"
     tmp
   })
   names(physicists) <- NULL
   cat("Done.\nDownloading articles now. This might take a while.\n")
   ### Get articles
   # Call the wikipedia api for each entry in our list
   articles <- pblapply(physicists, function(x) {
     res <- tryCatch({
       article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
@@ -90,17 +90,17 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
       pname <- read_html(article$parse$text$`*`) %>%
         html_nodes(".redirectText a") %>%
         html_attr("href")
       pname <- gsub("_", " ", pname)
       pname <- gsub("/wiki/", "", pname)
       pname <- gsub("\\s*\\([^\\)]+\\)", "", pname)
       tmp <- URLdecode(pname)
       Encoding(tmp) <- "UTF-8"
       pname <- tmp
       article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
     }
     data.frame(Title = article$parse$title,
@@ -111,17 +111,17 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
     }, error = function(e) {
       cat("Error: Crawling failed for article", x, "with error message: ", conditionMessage(e),"\n")
     })
   })
   # Bind it all together
   articles <- do.call(rbind, articles)
   cat("Download finished.\n")
   # Write result if desired
   if(write.cache) {
     if (!dir.exists(data.dir)) {
       dir.create(data.dir)
@@ -131,8 +131,8 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
     saveRDS(articles, dest.articlesRDS)
     cat("Done.\n")
   }
   cat("Data import finished.\n")
   return(articles)
-}
\ No newline at end of file
+}
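Putting the pieces together, getData can now be called with a cache directory resolved from the project root. A usage sketch combining the changes above (paths are illustrative; write.cache is shown explicitly for clarity):

library(rprojroot)

data_dir <- paste(find_root(has_file("README.md")), "data",
                  sep = .Platform$file.sep)

# Reads articles.RDS from the cache if present; otherwise downloads the
# articles and, because write.cache = TRUE, stores them for future runs.
articles <- wikiproc:::getData(use.cache = TRUE, write.cache = TRUE,
                               data.dir = data_dir)
str(articles)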