diff --git a/R/Master.R b/processing/bin/Master.R similarity index 67% rename from R/Master.R rename to processing/bin/Master.R index fe4ae64cde28991d0950616d74e602242b76e43a..53de2c05a104aeaec473b1ea031bb3c63b05106d 100755 --- a/R/Master.R +++ b/processing/bin/Master.R @@ -1,31 +1,16 @@ #!/usr/bin/env Rscript - ### This script consolidates everything -## Librarys - library(pbapply) - -#library(SomeLibrary) - -## Load Scripts - -cat("Sourcing R scripts... ") - -source("r/GetData.R") -source("r/GetNoOfSpouses.R") -source("r/CleanHtml.R") -source("r/ProcessNER.R") -#source("r/getSomethingElse.R") - -cat("Done.\n") +library(rvest) +library(wikiproc) ## Fetch data cat("Starting data import...\n") -articles <- getData(use.cache = TRUE) +articles <- getData(use.cache = FALSE, write.cache = TRUE, data.dir = "../../data/") ## Data processing @@ -41,16 +26,12 @@ results <- pbapply(articles, 1, function(article) { ## Data preprocessing/annotating - annotation <- createAnnotations(cleaned.text, article[2], article[3]) + # annotation <- createAnnotations(cleaned.text, article[2], article[3]) ## Extract information from Text no.spouses <- getNoOfSpouses(article[4]) - # someFact <- getFactFromTextFunctioN(annotated.text) - - # someOtherFact <- getOtherFactFromText(data$Text) - ## Create Results data.frame(Name = article[1], diff --git a/processing/bin/ProcessNER.R b/processing/bin/ProcessNER.R new file mode 100644 index 0000000000000000000000000000000000000000..775ffb17ffb198b7707bfd51b8e23880cbf2cddd --- /dev/null +++ b/processing/bin/ProcessNER.R @@ -0,0 +1,21 @@ +#!/usr/bin/env Rscript + +### Provides functionality to use NER, POS and Dependency Grammars + +## Author: David + +cat("Initializing spacy backend...\n") + +# It's important to do this prior to loading any python related stuff + +reticulate::use_condaenv("spcy", required = TRUE) + +# Load librarys + +library(cleanNLP) + +# Init nlp models + +cnlp_init_spacy(entity_flag = TRUE) + +cat("Done.\n") diff --git 
a/packages.list b/processing/packages.list similarity index 100% rename from packages.list rename to processing/packages.list diff --git a/processing/wikiproc/.Rbuildignore b/processing/wikiproc/.Rbuildignore new file mode 100644 index 0000000000000000000000000000000000000000..3b7c4e3607abd076fe538046ebf052153ae9ecc3 --- /dev/null +++ b/processing/wikiproc/.Rbuildignore @@ -0,0 +1,2 @@ +^wikiproc\.Rproj$ +^\.Rproj\.user$ diff --git a/processing/wikiproc/DESCRIPTION b/processing/wikiproc/DESCRIPTION new file mode 100644 index 0000000000000000000000000000000000000000..52400db32fc699aaba8588a0e46df8fd0af28c5b --- /dev/null +++ b/processing/wikiproc/DESCRIPTION @@ -0,0 +1,12 @@ +Package: wikiproc +Title: What the Package Does (one line, title case) +Version: 0.0.0.9000 +Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre")) +Description: What the package does (one paragraph). +Depends: R (>= 3.5.2) +License: What license is it under? +Encoding: UTF-8 +LazyData: true +RoxygenNote: 6.1.1 +Suggests: + testthat diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE new file mode 100644 index 0000000000000000000000000000000000000000..6ae926839dd1829f1016a96f766d970ff184ad97 --- /dev/null +++ b/processing/wikiproc/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: do not edit by hand + diff --git a/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R similarity index 100% rename from R/CleanHtml.R rename to processing/wikiproc/R/CleanHtml.R diff --git a/R/GetBirthdate.R b/processing/wikiproc/R/GetBirthdate.R similarity index 100% rename from R/GetBirthdate.R rename to processing/wikiproc/R/GetBirthdate.R diff --git a/R/GetBirthplace.R b/processing/wikiproc/R/GetBirthplace.R similarity index 100% rename from R/GetBirthplace.R rename to processing/wikiproc/R/GetBirthplace.R diff --git a/R/GetData.R b/processing/wikiproc/R/GetData.R similarity index 87% rename from R/GetData.R rename to processing/wikiproc/R/GetData.R 
index d48a0c65b48a4c48237ab27419682ce1ba22e3b8..c529fbeb40e92a4dfcd6139edf1919e87b1e43f3 100644 --- a/R/GetData.R +++ b/processing/wikiproc/R/GetData.R @@ -4,6 +4,7 @@ library(WikipediR) # For querying wikipedia library(rvest) # For getting the list of physicits +library(xml2) ## Though we could get the pages within the category 'physicists' with something like this ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers @@ -15,12 +16,16 @@ library(rvest) # For getting the list of physicits #' #' @param use.cache Use cached data if it exists over downloading new data #' @param write.cache Write downloaded results into cache for use on future calls +#' @param data.dir Directory the data should be read from and/or written to #' @return data.frame containing the title, id, revisionID and html-formatted full text -getData <- function(use.cache = TRUE, write.cache = FALSE) { +getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") { + + dest.articlesRDS <- file.path(data.dir, "articles.RDS") + dest.articlesCSV <- file.path(data.dir, "articles.csv") ### First we check if the data already exists and try to load it if it does - if(file.exists("data/articles.RDS") & use.cache ) { + if(file.exists(dest.articlesRDS) && use.cache ) { res <- tryCatch({ - data <- readRDS("data/articles.RDS") + data <- readRDS(dest.articlesRDS) cat("Found chached data to use, import finished.\n") data }, error = function (e) { @@ -118,12 +123,12 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) { # Write result if desired if(write.cache) { - if (!dir.exists("data")) { - dir.create("data") + if (!dir.exists(data.dir)) { + dir.create(data.dir) } cat("Writing data to files...
") - write.table(articles, "data/articles.csv") - saveRDS(articles, "data/articles.RDS") + write.table(articles, dest.articlesCSV) + saveRDS(articles, dest.articlesRDS) cat("Done.\n") } diff --git a/R/GetNoOfSpouses.R b/processing/wikiproc/R/GetNoOfSpouses.R similarity index 100% rename from R/GetNoOfSpouses.R rename to processing/wikiproc/R/GetNoOfSpouses.R diff --git a/R/ProcessNER.R b/processing/wikiproc/R/createAnnotations.R similarity index 77% rename from R/ProcessNER.R rename to processing/wikiproc/R/createAnnotations.R index 661d4a91f3ecd05b419415746f83fb3fd7f9f6ba..b9ca6ebea7029055fc484efb723beb4605a607c9 100644 --- a/R/ProcessNER.R +++ b/processing/wikiproc/R/createAnnotations.R @@ -1,32 +1,12 @@ -#!/usr/bin/env Rscript - -### Provides functionality to use NER, POS and Dependency Grammers - -## Author: David - -cat("Initializing spacy backend...\n") - -# It's important to do this prior to loading any python related stuff - -reticulate::use_condaenv("spcy", required = TRUE) - -# Load librarys - library(cleanNLP) -# Init nlp models - -cnlp_init_spacy(entity_flag = TRUE) - -cat("Done.\n") - createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) { # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces # To prevent this we stip 'em again filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE) - + # Check if there is a cached version of the annotations for this article in this specific revision if(use.cache & file.exists(filename)) { diff --git a/processing/wikiproc/man/cleanHtml.Rd b/processing/wikiproc/man/cleanHtml.Rd new file mode 100644 index 0000000000000000000000000000000000000000..56994f44d9eadcd9f8cb1fee71bd54d91e518629 --- /dev/null +++ b/processing/wikiproc/man/cleanHtml.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CleanHtml.R +\name{cleanHtml} 
+\alias{cleanHtml} +\title{Clean a html formatted wikipedia page. +Nodes of interest from the DOM are extracted and then cleaned from all html +tags and annotations.} +\usage{ +cleanHtml(html) +} +\arguments{ +\item{html}{Url linking to a wikipedia webpage or a html formatted document.} +} +\value{ +Plaintext document containing only the maintext of the give wikipedia page. +} +\description{ +Clean a html formatted wikipedia page. +Nodes of interest from the DOM are extracted and then cleaned from all html +tags and annotations. +} diff --git a/processing/wikiproc/man/getBirthdate.Rd b/processing/wikiproc/man/getBirthdate.Rd new file mode 100644 index 0000000000000000000000000000000000000000..a614ade2049e244a96e56eda90730b3e1a352abf --- /dev/null +++ b/processing/wikiproc/man/getBirthdate.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetBirthdate.R +\name{getBirthdate} +\alias{getBirthdate} +\title{Extract birthdate from infobox +Will try to get infobox as table and extract birthdate +from 'Born'-entry +If there is no infobox, first paragraph of the article +will be checked for birthdate} +\usage{ +getBirthdate(article) +} +\arguments{ +\item{article}{Article in HTML-format} +} +\value{ +String birthdate as string|NULL +} +\description{ +Extract birthdate from infobox +Will try to get infobox as table and extract birthdate +from 'Born'-entry +If there is no infobox, first paragraph of the article +will be checked for birthdate +} diff --git a/processing/wikiproc/man/getBirthplace.Rd b/processing/wikiproc/man/getBirthplace.Rd new file mode 100644 index 0000000000000000000000000000000000000000..8dd69d7d49dcc12d33914ecdaed68baf1d0fa78c --- /dev/null +++ b/processing/wikiproc/man/getBirthplace.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetBirthplace.R +\name{getBirthplace} +\alias{getBirthplace} +\title{This script extracts Birthplace from physicist texts +Try 
to get the infobox and extract the birthplace +If there is no infobox, 0 will be returned as +birthplace is hard to extract from text} +\usage{ +getBirthplace(article) +} +\arguments{ +\item{article}{Article in HTML-format} +} +\value{ +String with birthplace of the physicist|0 +} +\description{ +This script extracts Birthplace from physicist texts +Try to get the infobox and extract the birthplace +If there is no infobox, 0 will be returned as +birthplace is hard to extract from text +} diff --git a/processing/wikiproc/man/getData.Rd b/processing/wikiproc/man/getData.Rd new file mode 100644 index 0000000000000000000000000000000000000000..13e362d15d94d684eb53fe4c6a8001bc4b89949e --- /dev/null +++ b/processing/wikiproc/man/getData.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetData.R +\name{getData} +\alias{getData} +\title{Retrieve wikipedia articles about physicists} +\usage{ +getData(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") +} +\arguments{ +\item{use.cache}{Use cached data if it exists over downloading new data} + +\item{write.cache}{Write downloaded results into cache for use on future calls} + +\item{data.dir}{Directory the data should be read from and/or written to} +} +\value{ +data.frame containing the title, id, revisionID and html-formatted full text +} +\description{ +Retrieve wikipedia articles about physicists +} diff --git a/processing/wikiproc/man/getIntroduction.Rd b/processing/wikiproc/man/getIntroduction.Rd new file mode 100644 index 0000000000000000000000000000000000000000..3dfe196b3bab7bf139554fa2d49042c0e3fec93b --- /dev/null +++ b/processing/wikiproc/man/getIntroduction.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetBirthdate.R +\name{getIntroduction} +\alias{getIntroduction} +\title{Get Introduction Text from Wikipedia page that contains birthdate} +\usage{ +getIntroduction(article) +} +\arguments{ +\item{article}{article in HTML-format} +} +\value{ +string introduction text from wikipedia article +}
+\description{ +Get Introduction Text from Wikipedia page that contains birthdate +} diff --git a/processing/wikiproc/tests/testthat.R b/processing/wikiproc/tests/testthat.R new file mode 100644 index 0000000000000000000000000000000000000000..34254824423baccfdba6b8dc895a8c92726cdb86 --- /dev/null +++ b/processing/wikiproc/tests/testthat.R @@ -0,0 +1,5 @@ +library(testthat) +library(wikiproc) + +test_check("wikiproc") + diff --git a/processing/wikiproc/tests/testthat/test-cleanhtml.R b/processing/wikiproc/tests/testthat/test-cleanhtml.R new file mode 100644 index 0000000000000000000000000000000000000000..9390f4496eaa59e814769ebb9d0ae57b841bba59 --- /dev/null +++ b/processing/wikiproc/tests/testthat/test-cleanhtml.R @@ -0,0 +1,5 @@ +context("test-cleanhtml") + +test_that("multiplication works", { + expect_equal(2 * 2, 4) +}) diff --git a/tests/testthat/testCleanHtml.R b/tests/testthat/testCleanHtml.R deleted file mode 100644 index 1342ffa156459d310b360f8ada5fcfdf225c20ec..0000000000000000000000000000000000000000 --- a/tests/testthat/testCleanHtml.R +++ /dev/null @@ -1 +0,0 @@ -# Test cleanHtml function. \ No newline at end of file