diff --git a/processing/bin/Master.R b/processing/bin/Master.R
index 53de2c05a104aeaec473b1ea031bb3c63b05106d..8e338d5d65a3ada707fbabdd63d3e6f8b8a620fe 100755
--- a/processing/bin/Master.R
+++ b/processing/bin/Master.R
@@ -5,12 +5,17 @@
 library(pbapply)
 library(rvest)
 library(wikiproc)
+library(rprojroot)
 
 ## Fetch data
 
 cat("Starting data import...\n")
 
-articles <- getData(use.cache = FALSE, write.cache = TRUE, data.dir = "../../data/")
+# Define paths
+project_root <- find_root(has_file("README.md"))
+data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
+
+articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)
 
 ## Data processing
 
@@ -19,25 +24,25 @@ cat("Processing data:\n")
 results <- pbapply(articles, 1, function(article) {
   # Within this function article is a vector representing a single row of our original data frame
   # This means article[1] represents the Title, article[2] the PageID etc.
-  
+
   ## Data cleaning
-  
-  cleaned.text <- cleanHtml(article[4])
-  
+
+  cleaned.text <- wikiproc:::cleanHtml(article[4])
+
   ## Data preprocessing/annotating
-  
+
   # annotation <- createAnnotations(cleaned.text, article[2], article[3])
-  
+
   ## Extract information from Text
-  
-  no.spouses <- getNoOfSpouses(article[4])
-  
+
+  no.spouses <- wikiproc:::getNoOfSpouses(article[4])
+
   ## Create Results
-  
+
   data.frame(Name = article[1],
              NoSpouses = no.spouses,
              stringsAsFactors = FALSE)
-  
+
 })
 
 results <- do.call(rbind, results)
@@ -46,7 +51,7 @@ cat("Data processing finished.\n")
 
 ## Results are now in results
 
-## Format for rasa 
+## Format for rasa
 
 cat("Writing rasa files to 'rasa/'...\n")
 
diff --git a/processing/packages.list b/processing/packages.list
index d61e2fb55d01e0b19aaa9b98f33b84328257a7d5..532daaf429be8cab38d06d4487a38bb4c4b551ae 100644
--- a/processing/packages.list
+++ b/processing/packages.list
@@ -7,4 +7,5 @@ data.table
 xml2
 WikipediR
 reticulate
-cleanNLP
\ No newline at end of file
+cleanNLP
+rprojroot
diff --git a/processing/wikiproc/DESCRIPTION b/processing/wikiproc/DESCRIPTION
index 52400db32fc699aaba8588a0e46df8fd0af28c5b..7ec193332d2045893a5b0062a6d51701459f8e19 100644
--- a/processing/wikiproc/DESCRIPTION
+++ b/processing/wikiproc/DESCRIPTION
@@ -1,12 +1,27 @@
 Package: wikiproc
 Title: What the Package Does (one line, title case)
 Version: 0.0.0.9000
-Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre"))
+Authors@R: c(
+    person("David", "Fuhry"),
+    person("Lukas", "Gehrke"),
+    person("Lucas", "Schons")
+)
 Description: What the package does (one paragraph).
 Depends: R (>= 3.5.2)
-License: What license is it under?
+License: GPL-2
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 6.1.1
+Imports:
+    pbapply,
+    rvest,
+    stringi,
+    textclean,
+    stringr,
+    data.table,
+    xml2,
+    WikipediR,
+    reticulate,
+    cleanNLP
 Suggests: 
     testthat
diff --git a/processing/wikiproc/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R
index 3f0d11cef3b277b8a8758af131606f5c88e69bad..182e9c839e512b15475b51821304eafec72cf959 100644
--- a/processing/wikiproc/R/CleanHtml.R
+++ b/processing/wikiproc/R/CleanHtml.R
@@ -6,15 +6,14 @@
 library(rvest)
 library(stringi)
 library(textclean)
-
-#' Clean a html formatted wikipedia page. 
+#' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
-#' 
+#'
 #' @param html Url linking to a wikipedia webpage or a html formatted document.
 #' @return Plaintext document containing only the maintext of the give wikipedia page. 
 cleanHtml <- function(html) {
-  
+
   # 1. read data from url or html-formatted text
   # 2 .extract nodes containing main information (ignore infoboxes, list of literature, ...)
   # 3. collapse vektors into a single one
@@ -33,4 +32,4 @@ cleanHtml <- function(html) {
     gsub(" *([.!?:,'’])", "\\1", .) %>%
     gsub("\n *\n+", "\n", .) %>%
     sub(" ", "", .)
-}
\ No newline at end of file
+}
diff --git a/processing/wikiproc/R/GetData.R b/processing/wikiproc/R/GetData.R
index c529fbeb40e92a4dfcd6139edf1919e87b1e43f3..ef8713e866678a1313db70927bc63216683f2d79 100644
--- a/processing/wikiproc/R/GetData.R
+++ b/processing/wikiproc/R/GetData.R
@@ -13,15 +13,15 @@
 library(xml2)
 ## which gives us something short of a thousand articles
 #' Retrieve wikipedia articles about physicists
-#' 
+#'
 #' @param use.cache Use cached data if it exists over downloading new data
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
-  
-  dest.articlesRDS <- paste(data.dir, "articles.RDS", "")
-  dest.articlesCSV <- paste(data.dir, "articles.csv", "")
+getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+
+  dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
+  dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
   ### First we check if the data already exists and try to load it if it does
   if(file.exists(dest.articlesRDS) && use.cache ) {
     res <- tryCatch({
@@ -33,54 +33,54 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
     })
     return(res)
   }
-  
+
   ### Get the list of names
-  
+
   # Download page
-  
+
   cat("Downloading list from wikipedia... ")
-  
+
   page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
-  
+
   cat("Done.\n")
-  
+
   # Extract links as the names given here are not the article names in about 20 cases
-  
+
   cat("Processing data:\n")
-  
+
   physicists <- page %>%
     html_nodes(".mw-parser-output li a") %>%
     html_attr("href")
-  
+
   # Clean the list
-  
+
   physicists <- physicists[nchar(physicists) > 5]
-  
+
   length(physicists) <- length(physicists) - 3
-  
+
   physicists <- gsub("_", " ", physicists)
-  
+
   physicists <- gsub("/wiki/", "", physicists)
-  
+
   physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
-  
+
   # This is probably only needed on windows (and perhaps os x) as R on windows messes quite a bit with the encoding
   # On linux `physicists <- URLdecode(physicists)` should do the trick
-  
+
   physicists <- sapply(physicists, function(x) {
     tmp <- URLdecode(x)
    Encoding(tmp) <- "UTF-8"
    tmp
  })
-  
+
   names(physicists) <- NULL
-  
+
   cat("Done.\nDownloading articles now. This might take a while.\n")
-  
+
   ### Get articles
-  
+
   # Call the wikipedia api for each entry in our list
-  
+
   articles <- pblapply(physicists, function(x) {
     res <- tryCatch({
       article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
@@ -90,17 +90,17 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
         pname <- read_html(article$parse$text$`*`) %>%
           html_nodes(".redirectText a") %>%
          html_attr("href")
-        
+
         panme <- gsub("_", " ", pname)
-        
+
         pname <- gsub("/wiki/", "", pname)
-        
+
         pname <- gsub("\\s*\\([^\\)]+\\)", "", pname)
-        
+
         tmp <- URLdecode(pname)
         Encoding(tmp) <- "UTF-8"
        pname <- tmp
-        
+
         article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
       }
       data.frame(Title = article$parse$title,
@@ -111,17 +111,17 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
     }, error = function(e) {
       cat("Error: Crawling failed for article", x, "with error message: ", conditionMessage(e),"\n")
     })
-    
+
   })
-  
+
   # Bind it all together
-  
+
   articles <- do.call(rbind, articles)
-  
+
   cat("Download finished.\n")
-  
+
   # Write result if desired
-  
+
   if(write.cache) {
     if (!dir.exists(data.dir)) {
       dir.create(data.dir)
@@ -131,8 +131,8 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
     saveRDS(articles, dest.articlesRDS)
     cat("Done.\n")
   }
-  
+
   cat("Data import finished.\n")
-  
+
   return(articles)
-}
\ No newline at end of file
+}