From 815047b47e401546d03ab2c0157e6a3f3f5860c4 Mon Sep 17 00:00:00 2001 From: Lulu Roth <ls80zyse@studserv.uni-leipzig.de> Date: Fri, 11 Jan 2019 01:09:40 +0100 Subject: [PATCH] Refactor text processing * Create R package containing function definitions * Create directory processing containing scripts * fix some typos * general cleanup --- {R => processing/bin}/Master.R | 27 +++---------------- processing/bin/ProcessNER.R | 21 +++++++++++++++ packages.list => processing/packages.list | 0 processing/wikiproc/.Rbuildignore | 2 ++ processing/wikiproc/DESCRIPTION | 12 +++++++++ processing/wikiproc/NAMESPACE | 2 ++ {R => processing/wikiproc/R}/CleanHtml.R | 0 {R => processing/wikiproc/R}/GetBirthdate.R | 0 {R => processing/wikiproc/R}/GetBirthplace.R | 0 {R => processing/wikiproc/R}/GetData.R | 19 ++++++++----- {R => processing/wikiproc/R}/GetNoOfSpouses.R | 0 .../wikiproc/R/createAnnotations.R | 22 +-------------- processing/wikiproc/man/cleanHtml.Rd | 21 +++++++++++++++ processing/wikiproc/man/getBirthdate.Rd | 25 +++++++++++++++++ processing/wikiproc/man/getBirthplace.Rd | 23 ++++++++++++++++ processing/wikiproc/man/getData.Rd | 19 +++++++++++++ processing/wikiproc/man/getIntroduction.Rd | 17 ++++++++++++ processing/wikiproc/tests/testthat.R | 5 ++++ .../wikiproc/tests/testthat/test-cleanhtml.R | 5 ++++ tests/testthat/testCleanHtml.R | 1 - 20 files changed, 169 insertions(+), 52 deletions(-) rename {R => processing/bin}/Master.R (67%) create mode 100644 processing/bin/ProcessNER.R rename packages.list => processing/packages.list (100%) create mode 100644 processing/wikiproc/.Rbuildignore create mode 100644 processing/wikiproc/DESCRIPTION create mode 100644 processing/wikiproc/NAMESPACE rename {R => processing/wikiproc/R}/CleanHtml.R (100%) rename {R => processing/wikiproc/R}/GetBirthdate.R (100%) rename {R => processing/wikiproc/R}/GetBirthplace.R (100%) rename {R => processing/wikiproc/R}/GetData.R (87%) rename {R => processing/wikiproc/R}/GetNoOfSpouses.R (100%) 
rename R/ProcessNER.R => processing/wikiproc/R/createAnnotations.R (77%) create mode 100644 processing/wikiproc/man/cleanHtml.Rd create mode 100644 processing/wikiproc/man/getBirthdate.Rd create mode 100644 processing/wikiproc/man/getBirthplace.Rd create mode 100644 processing/wikiproc/man/getData.Rd create mode 100644 processing/wikiproc/man/getIntroduction.Rd create mode 100644 processing/wikiproc/tests/testthat.R create mode 100644 processing/wikiproc/tests/testthat/test-cleanhtml.R delete mode 100644 tests/testthat/testCleanHtml.R diff --git a/R/Master.R b/processing/bin/Master.R similarity index 67% rename from R/Master.R rename to processing/bin/Master.R index fe4ae64..53de2c0 100755 --- a/R/Master.R +++ b/processing/bin/Master.R @@ -1,31 +1,16 @@ #!/usr/bin/env Rscript - ### This script consolidates everything -## Librarys - library(pbapply) - -#library(SomeLibrary) - -## Load Scripts - -cat("Sourcing R scripts... ") - -source("r/GetData.R") -source("r/GetNoOfSpouses.R") -source("r/CleanHtml.R") -source("r/ProcessNER.R") -#source("r/getSomethingElse.R") - -cat("Done.\n") +library(rvest) +library(wikiproc) ## Fetch data cat("Starting data import...\n") -articles <- getData(use.cache = TRUE) +articles <- getData(use.cache = FALSE, write.cache = TRUE, data.dir = "../../data/") ## Data processing @@ -41,16 +26,12 @@ results <- pbapply(articles, 1, function(article) { ## Data preprocessing/annotating - annotation <- createAnnotations(cleaned.text, article[2], article[3]) + # annotation <- createAnnotations(cleaned.text, article[2], article[3]) ## Extract information from Text no.spouses <- getNoOfSpouses(article[4]) - # someFact <- getFactFromTextFunctioN(annotated.text) - - # someOtherFact <- getOtherFactFromText(data$Text) - ## Create Results data.frame(Name = article[1], diff --git a/processing/bin/ProcessNER.R b/processing/bin/ProcessNER.R new file mode 100644 index 0000000..775ffb1 --- /dev/null +++ b/processing/bin/ProcessNER.R @@ -0,0 +1,21 @@ 
+#!/usr/bin/env Rscript + +### Provides functionality to use NER, POS and Dependency Grammars + +## Author: David + +cat("Initializing spacy backend...\n") + +# It's important to do this prior to loading any python related stuff + +reticulate::use_condaenv("spcy", required = TRUE) + +# Load librarys + +library(cleanNLP) + +# Init nlp models + +cnlp_init_spacy(entity_flag = TRUE) + +cat("Done.\n") diff --git a/packages.list b/processing/packages.list similarity index 100% rename from packages.list rename to processing/packages.list diff --git a/processing/wikiproc/.Rbuildignore b/processing/wikiproc/.Rbuildignore new file mode 100644 index 0000000..3b7c4e3 --- /dev/null +++ b/processing/wikiproc/.Rbuildignore @@ -0,0 +1,2 @@ +^wikiproc\.Rproj$ +^\.Rproj\.user$ diff --git a/processing/wikiproc/DESCRIPTION b/processing/wikiproc/DESCRIPTION new file mode 100644 index 0000000..52400db --- /dev/null +++ b/processing/wikiproc/DESCRIPTION @@ -0,0 +1,12 @@ +Package: wikiproc +Title: What the Package Does (one line, title case) +Version: 0.0.0.9000 +Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre")) +Description: What the package does (one paragraph). +Depends: R (>= 3.5.2) +License: What license is it under? 
+Encoding: UTF-8 +LazyData: true +RoxygenNote: 6.1.1 +Suggests: + testthat diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE new file mode 100644 index 0000000..6ae9268 --- /dev/null +++ b/processing/wikiproc/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: do not edit by hand + diff --git a/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R similarity index 100% rename from R/CleanHtml.R rename to processing/wikiproc/R/CleanHtml.R diff --git a/R/GetBirthdate.R b/processing/wikiproc/R/GetBirthdate.R similarity index 100% rename from R/GetBirthdate.R rename to processing/wikiproc/R/GetBirthdate.R diff --git a/R/GetBirthplace.R b/processing/wikiproc/R/GetBirthplace.R similarity index 100% rename from R/GetBirthplace.R rename to processing/wikiproc/R/GetBirthplace.R diff --git a/R/GetData.R b/processing/wikiproc/R/GetData.R similarity index 87% rename from R/GetData.R rename to processing/wikiproc/R/GetData.R index d48a0c6..c529fbe 100644 --- a/R/GetData.R +++ b/processing/wikiproc/R/GetData.R @@ -4,6 +4,7 @@ library(WikipediR) # For querying wikipedia library(rvest) # For getting the list of physicits +library(xml2) ## Though we could get the pages within the category 'physicists' with something like this ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers @@ -15,12 +16,16 @@ library(rvest) # For getting the list of physicits #' #' @param use.cache Use cached data if it exists over downloading new data #' @param write.cache Write downloaded results into cache for use on future calls +#' @param data.dir Directory the data should be read from and/or written to #' @return data.frame containing the title, id, revisionID and html-formatted full text -getData <- function(use.cache = TRUE, write.cache = FALSE) { +getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") { + + dest.articlesRDS <- file.path(data.dir, "articles.RDS") + dest.articlesCSV <- file.path(data.dir, "articles.csv")
### First we check if the data already exists and try to load it if it does - if(file.exists("data/articles.RDS") & use.cache ) { + if(file.exists(dest.articlesRDS) && use.cache ) { res <- tryCatch({ - data <- readRDS("data/articles.RDS") + data <- readRDS(dest.articlesRDS) cat("Found chached data to use, import finished.\n") data }, error = function (e) { @@ -118,12 +123,12 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) { # Write result if desired if(write.cache) { - if (!dir.exists("data")) { - dir.create("data") + if (!dir.exists(data.dir)) { + dir.create(data.dir) } cat("Writing data to files... ") - write.table(articles, "data/articles.csv") - saveRDS(articles, "data/articles.RDS") + write.table(articles, dest.articlesCSV) + saveRDS(articles, dest.articlesRDS) cat("Done.\n") } diff --git a/R/GetNoOfSpouses.R b/processing/wikiproc/R/GetNoOfSpouses.R similarity index 100% rename from R/GetNoOfSpouses.R rename to processing/wikiproc/R/GetNoOfSpouses.R diff --git a/R/ProcessNER.R b/processing/wikiproc/R/createAnnotations.R similarity index 77% rename from R/ProcessNER.R rename to processing/wikiproc/R/createAnnotations.R index 661d4a9..b9ca6eb 100644 --- a/R/ProcessNER.R +++ b/processing/wikiproc/R/createAnnotations.R @@ -1,32 +1,12 @@ -#!/usr/bin/env Rscript - -### Provides functionality to use NER, POS and Dependency Grammers - -## Author: David - -cat("Initializing spacy backend...\n") - -# It's important to do this prior to loading any python related stuff - -reticulate::use_condaenv("spcy", required = TRUE) - -# Load librarys - library(cleanNLP) -# Init nlp models - -cnlp_init_spacy(entity_flag = TRUE) - -cat("Done.\n") - createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) { # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces # To prevent this we stip 'em again filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, 
".RDS"), fixed = TRUE) - + # Check if there is a cached version of the annotations for this article in this specific revision if(use.cache & file.exists(filename)) { diff --git a/processing/wikiproc/man/cleanHtml.Rd b/processing/wikiproc/man/cleanHtml.Rd new file mode 100644 index 0000000..56994f4 --- /dev/null +++ b/processing/wikiproc/man/cleanHtml.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CleanHtml.R +\name{cleanHtml} +\alias{cleanHtml} +\title{Clean a html formatted wikipedia page. +Nodes of interest from the DOM are extracted and then cleaned from all html +tags and annotations.} +\usage{ +cleanHtml(html) +} +\arguments{ +\item{html}{Url linking to a wikipedia webpage or a html formatted document.} +} +\value{ +Plaintext document containing only the maintext of the give wikipedia page. +} +\description{ +Clean a html formatted wikipedia page. +Nodes of interest from the DOM are extracted and then cleaned from all html +tags and annotations. 
+} diff --git a/processing/wikiproc/man/getBirthdate.Rd b/processing/wikiproc/man/getBirthdate.Rd new file mode 100644 index 0000000..a614ade --- /dev/null +++ b/processing/wikiproc/man/getBirthdate.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetBirthdate.R +\name{getBirthdate} +\alias{getBirthdate} +\title{Extract birthdate from infobox +Will try to get infobox as table and extract birthdate +from 'Born'-entry +If there is no infobox, first paragraph of the article +will be checked for birthdate} +\usage{ +getBirthdate(article) +} +\arguments{ +\item{article}{Article in HTML-format} +} +\value{ +String birthdate as string|NULL +} +\description{ +Extract birthdate from infobox +Will try to get infobox as table and extract birthdate +from 'Born'-entry +If there is no infobox, first paragraph of the article +will be checked for birthdate +} diff --git a/processing/wikiproc/man/getBirthplace.Rd b/processing/wikiproc/man/getBirthplace.Rd new file mode 100644 index 0000000..8dd69d7 --- /dev/null +++ b/processing/wikiproc/man/getBirthplace.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetBirthplace.R +\name{getBirthplace} +\alias{getBirthplace} +\title{This script extracts Birthplace from physicist texts +Try to get the infobox and extract the birthplace +If there is no infobox, 0 will be returned as +birthplace is hard to extract from text} +\usage{ +getBirthplace(article) +} +\arguments{ +\item{article}{Article in HTML-format} +} +\value{ +String with birthplace of the physicist|0 +} +\description{ +This script extracts Birthplace from physicist texts +Try to get the infobox and extract the birthplace +If there is no infobox, 0 will be returned as +birthplace is hard to extract from text +} diff --git a/processing/wikiproc/man/getData.Rd b/processing/wikiproc/man/getData.Rd new file mode 100644 index 0000000..13e362d --- /dev/null +++ 
b/processing/wikiproc/man/getData.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetData.R +\name{getData} +\alias{getData} +\title{Retrieve wikipedia articles about physicists} +\usage{ +getData(use.cache = TRUE, write.cache = FALSE) +} +\arguments{ +\item{use.cache}{Use cached data if it exists over downloading new data} + +\item{write.cache}{Write downloaded results into cache for use on future calls} +} +\value{ +data.frame containing the title, id, revisionID and html-formatted full text +} +\description{ +Retrieve wikipedia articles about physicists +} diff --git a/processing/wikiproc/man/getIntroduction.Rd b/processing/wikiproc/man/getIntroduction.Rd new file mode 100644 index 0000000..3dfe196 --- /dev/null +++ b/processing/wikiproc/man/getIntroduction.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetBirthdate.R +\name{getIntroduction} +\alias{getIntroduction} +\title{Get Introduction Text from Wikipedia page that contains birthdate} +\usage{ +getIntroduction(article) +} +\arguments{ +\item{article}{article in HTML-format} +} +\value{ +string introduction text from wikipedia article +} +\description{ +Get Introduction Text from Wikipedia page that contains birthdate +} diff --git a/processing/wikiproc/tests/testthat.R b/processing/wikiproc/tests/testthat.R new file mode 100644 index 0000000..3425482 --- /dev/null +++ b/processing/wikiproc/tests/testthat.R @@ -0,0 +1,5 @@ +library(testthat) +library(wikiproc) + +test_check("wikiproc") + diff --git a/processing/wikiproc/tests/testthat/test-cleanhtml.R b/processing/wikiproc/tests/testthat/test-cleanhtml.R new file mode 100644 index 0000000..9390f44 --- /dev/null +++ b/processing/wikiproc/tests/testthat/test-cleanhtml.R @@ -0,0 +1,5 @@ +context("test-cleanhtml") + +test_that("multiplication works", { + expect_equal(2 * 2, 4) +}) diff --git a/tests/testthat/testCleanHtml.R b/tests/testthat/testCleanHtml.R 
deleted file mode 100644 index 1342ffa..0000000 --- a/tests/testthat/testCleanHtml.R +++ /dev/null @@ -1 +0,0 @@ -# Test cleanHtml function. \ No newline at end of file -- GitLab