diff --git a/processing/bin/Master.R b/processing/bin/Master.R
index 8e338d5d65a3ada707fbabdd63d3e6f8b8a620fe..0f476101b3c5fffeff755cd0623afd3947df3141 100755
--- a/processing/bin/Master.R
+++ b/processing/bin/Master.R
@@ -15,7 +15,7 @@ cat("Starting data import...\n")
 project_root <- find_root(has_file("README.md"))
 data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
 
-articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)
+articles <- wikiproc::getData(use.cache = TRUE, data.dir = data_dir)
 
 ## Data processing
 
@@ -27,7 +27,7 @@ results <- pbapply(articles, 1, function(article) {
 
   ## Data cleaning
 
-  cleaned.text <- wikiproc:::cleanHtml(article[4])
+  cleaned.text <- wikiproc::cleanHtml(article[4])
 
   ## Data preprocessing/annotating
 
@@ -35,7 +35,7 @@ results <- pbapply(articles, 1, function(article) {
 
   ## Extract information from Text
 
-  no.spouses <- wikiproc:::getNoOfSpouses(article[4])
+  no.spouses <- wikiproc::getNoOfSpouses(article[4])
 
   ## Create Results
 
diff --git a/processing/wikiproc/DESCRIPTION b/processing/wikiproc/DESCRIPTION
index 987b1cd672d8c3ee616a5e0dbc1b74ca796756c5..3d5ab21b9c3fabe88bc0bbee3ca5bd55b8718574 100644
--- a/processing/wikiproc/DESCRIPTION
+++ b/processing/wikiproc/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: wikiproc
-Title: What the Package Does (one line, title case)
+Title: Collection of Data Processing Utility Functions
 Version: 0.0.0.9000
 Authors@R: c(
     person("David", "Fuhry", role = "aut"),
@@ -9,7 +9,7 @@ Author: David Fuhry [aut],
     Lukas Gehrke [aut],
     Lucas Schons [aut]
 Maintainer: David Fuhry <not.an@email.address.net>
-Description: What the package does (one paragraph).
+Description: Provides various functions needed to transform raw Wikipedia HTML into processable facts.
 Depends: R (>= 3.5.0)
 License: GPL-2
 Encoding: UTF-8
@@ -18,9 +18,9 @@ RoxygenNote: 6.1.1
 Imports:
     pbapply,
    rvest,
-    stringi,
    textclean,
    stringr,
+    stringi,
    data.table,
    xml2,
    WikipediR,
diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
index 6ae926839dd1829f1016a96f766d970ff184ad97..f5ebafbf596ac4f41f745b7c83baff9b7edd80d3 100644
--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -1,2 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
+export(cleanHtml)
+export(createAnnotations)
+export(getBirthdate)
+export(getBirthplace)
+export(getData)
+export(getNoOfSpouses)
diff --git a/processing/wikiproc/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R
index 182e9c839e512b15475b51821304eafec72cf959..b78b74f8e8a0b0badfc313e729e39659236f6463 100644
--- a/processing/wikiproc/R/CleanHtml.R
+++ b/processing/wikiproc/R/CleanHtml.R
@@ -5,11 +5,13 @@
 library(rvest)
 library(stringi)
 library(textclean)
+library(xml2)
 
 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
 #'
+#' @export
 #' @param html Url linking to a wikipedia webpage or a html formatted document.
 #' @return Plaintext document containing only the maintext of the give wikipedia page.
 cleanHtml <- function(html) {
@@ -22,10 +24,10 @@ cleanHtml <- function(html) {
   # - remove whitespace after newline
   # - remove whitespace before punctuation
   # - replace multiple newlines with single newline
-  result <- read_html(html) %>%
-    html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
-    stri_flatten(collapse = " ") %>%
-    replace_html() %>%
+  result <- xml2::read_html(html) %>%
+    rvest::html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
+    stringi::stri_flatten(collapse = " ") %>%
+    textclean::replace_html() %>%
     gsub("\\[\\d*\\]", "", .) %>%
     gsub(" +", " ", .) %>%
     gsub("\n ", "\n", .) %>%
diff --git a/processing/wikiproc/R/GetBirthdate.R b/processing/wikiproc/R/GetBirthdate.R
index 42dbb69d5a774e2ae37c9015264e4d64cad83e56..dd980b57e69162967f842d62abcea3d508598fae 100644
--- a/processing/wikiproc/R/GetBirthdate.R
+++ b/processing/wikiproc/R/GetBirthdate.R
@@ -13,15 +13,16 @@ library(xml2)
 #' If there is no infobox, first paragraph of the article
 #' will be checked for birthdate
 #'
+#' @export
 #' @param article Article in HTML-format
 #' @return String birthdate as string|NULL
 getBirthdate <- function(article) {
-  
+
   if(grepl("vcard", article)) {
-    
+
     # Check if there is an infobox
     infoBox <- getInfoBox(article)
-    
+
     # Get the Born field
     birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
     # Remove everything except the birthdate:
@@ -29,24 +30,24 @@
     birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate)
     # - Remove everything starting with newline
     birthdate <- gsub("\\n.*$", "", birthdate)
-    
+
     return(birthdate)
-    
-    
+
+
   } else if(!getIntroduction(article) == "") {
-    
+
     # Check first paragraph
     introduction <- getIntroduction(article)
-    
+
     # Get birthdate inside of parentheses
-    birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]]
+    birthdate <- stringr::str_extract_all(introduction, "\\([^()]+\\)")[[1]]
     # Remove parentheses
     birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
-    
+
     return(birthdate)
-    
+
   } else {
-    
+
     # Return Null if there is no birthdate
     return(NULL)
   }
@@ -55,27 +56,27 @@
 
 ### Converts info box to table
 getInfoBox <- function(article) {
   # Read page as html
-  page <- read_html(article)
-  
+  page <- xml2::read_html(article)
+
   # Extracting text from the html will erase all <br> tags,
   # This will replace them with line breaks
-  
-  xml_find_all(page, ".//br") %>%
-    xml_add_sibling("p", "\n")
-  
-  xml_find_all(page, ".//br") %>%
-    xml_remove()
-  
+
+  xml2::xml_find_all(page, ".//br") %>%
+    xml2::xml_add_sibling("p", "\n")
+
+  xml2::xml_find_all(page, ".//br") %>%
+    xml2::xml_remove()
+
   # Get the info box
   # Will throw an error if there isnt any, so that should be checked beforehand
-  
+
   table <- page %>%
-    html_nodes("table.vcard") %>%
-    html_table(fill = TRUE) %>%
+    rvest::html_nodes("table.vcard") %>%
+    rvest::html_table(fill = TRUE) %>%
     .[[1]]
-  
+
   colnames(table) <- c("Desc", "Content")
-  
+
   return(table)
 }
@@ -85,28 +86,28 @@
 #' @return string introduction text from wikipedia article
 getIntroduction <- function(article) {
   # Read page as html
-  page <- read_html(article)
-  
+  page <- xml2::read_html(article)
+
   # Extracting text from the html will erase all <br> tags,
   # This will replace them with line breaks
-  
-  xml_find_all(page, ".//br") %>%
-    xml_add_sibling("p", "\n")
-  
-  xml_find_all(page, ".//br") %>%
-    xml_remove
-  
+
+  xml2::xml_find_all(page, ".//br") %>%
+    xml2::xml_add_sibling("p", "\n")
+
+  xml2::xml_find_all(page, ".//br") %>%
+    xml2::xml_remove
+
   # Get all paragraphs
   paragraphs <- page %>%
-    html_nodes("p") %>%
-    html_text()
-  
+    rvest::html_nodes("p") %>%
+    rvest::html_text()
+
   # There will be some leading paragraphs containing only "\n"
   # Remove those leading paragraphs
   remove <- c("\n")
-  cleaned <- setdiff(paragraphs, remove)
+  cleaned <- base::setdiff(paragraphs, remove)
   introduction <- cleaned[1]
-  
+
   # Return first paragraph
   return(introduction)
 }
diff --git a/processing/wikiproc/R/GetBirthplace.R b/processing/wikiproc/R/GetBirthplace.R
index 8726598874e32feeffa2c015ec5bb43cfec90adc..652217f87e0a9e0348f2fd15f6a3a1d92b6244e5 100644
--- a/processing/wikiproc/R/GetBirthplace.R
+++ b/processing/wikiproc/R/GetBirthplace.R
@@ -10,28 +10,29 @@
 library(data.table)
 
 #' This script extracts Birthplace from physicist texts
 #' Try to get the infobox and extract the birthplace
-#' If there is no infobox, 0 will be returned as 
+#' If there is no infobox, 0 will be returned as
 #' birthplace is hard to extract from text
-#' 
+#'
+#' @export
 #' @param article Article in HTML-format
 #' @return String with birthplace of the physicist|0
 getBirthplace <- function(article) {
-  
+
   # If there is no infobox we return 0
   if(!grepl("vcard", article)) {
     return(0)
   }
-  
+
   # Use infobox to get Birthplace
   infoBox <- getInfoBox(article)
-  
+
   # Get 'Born' field
   birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
-  
+
   # Remove everything in front of the "\n"
   # Rest is birthplace
   birthplace <- gsub(".*\\\n", "", birthplace)
-  
+
   # return birthplace
   return(birthplace)
 }
@@ -39,26 +40,26 @@
 ### Converts info box to table
 getInfoBox <- function(article) {
   # Read page as html
-  page <- read_html(article)
-  
+  page <- xml2::read_html(article)
+
   # Extracting text from the html will erase all <br> tags,
   # this will replace them with line breaks
-  
-  xml_find_all(page, ".//br") %>%
-    xml_add_sibling("p", "\n")
-  
-  xml_find_all(page, ".//br") %>%
-    xml_remove()
-  
+
+  xml2::xml_find_all(page, ".//br") %>%
+    xml2::xml_add_sibling("p", "\n")
+
+  xml2::xml_find_all(page, ".//br") %>%
+    xml2::xml_remove()
+
   # Get the info box
   # Will throw an error if there isnt any, so that should be checked beforehand
-  
+
   table <- page %>%
-    html_nodes("table.vcard") %>%
-    html_table(fill = TRUE) %>%
+    rvest::html_nodes("table.vcard") %>%
+    rvest::html_table(fill = TRUE) %>%
     .[[1]]
-  
+
   colnames(table) <- c("Desc", "Content")
-  
+
   return(table)
 }
diff --git a/processing/wikiproc/R/GetData.R b/processing/wikiproc/R/GetData.R
index ef8713e866678a1313db70927bc63216683f2d79..2300cb676ad71130d2b789bd97cf58803a513453 100644
--- a/processing/wikiproc/R/GetData.R
+++ b/processing/wikiproc/R/GetData.R
@@ -14,6 +14,7 @@
 library(xml2)
 
 #' Retrieve wikipedia articles about physicists
 #'
+#' @export
 #' @param use.cache Use cached data if it exists over downloading new data
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
@@ -40,7 +41,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 
   cat("Downloading list from wikipedia... ")
") - page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists") + page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists") cat("Done.\n") @@ -49,8 +50,8 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") { cat("Processing data:\n") physicists <- page %>% - html_nodes(".mw-parser-output li a") %>% - html_attr("href") + rvest::html_nodes(".mw-parser-output li a") %>% + rvest::html_attr("href") # Clean the list @@ -81,15 +82,15 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") { # Call the wikipedia api for each entry in our list - articles <- pblapply(physicists, function(x) { + articles <- pbapply::pblapply(physicists, function(x) { res <- tryCatch({ - article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE) + article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE) # Check if the article is a redirect page if (grepl(".redirectText", article$parse$text$`*`)) { # Get the real article name - pname <- read_html(article$parse$text$`*`) %>% - html_nodes(".redirectText a") %>% - html_attr("href") + pname <- xml2::read_html(article$parse$text$`*`) %>% + rvest::html_nodes(".redirectText a") %>% + rvest::html_attr("href") panme <- gsub("_", " ", pname) @@ -101,7 +102,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") { Encoding(tmp) <- "UTF-8" pname <- tmp - article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE) + article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE) } data.frame(Title = article$parse$title, PageID = article$parse$pageid, diff --git a/processing/wikiproc/R/GetNoOfSpouses.R b/processing/wikiproc/R/GetNoOfSpouses.R index 5190edab023ba3063d2afe1bc4f67f85f7cf4e36..d5882ceff7127d73eb011724dd43791b2466d61c 100755 --- a/processing/wikiproc/R/GetNoOfSpouses.R +++ b/processing/wikiproc/R/GetNoOfSpouses.R @@ -9,25 +9,26 @@ library(rvest) library(data.table) +library(xml2) ### Get number of spouses - +#' @export getNoOfSpouses <- function(article) { - + # If there is no infobox we assume there were no spouses if(!grepl("vcard", article)) { return(0) } - + infoBox <- getInfoBox(article) - + # Get the spouse field spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content # Remove everything in parentheses spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses) # Split the strings by newlines to get one spouse per line - spouses <- strsplit(spouses, "\n") - spouses <- unlist(spouses) + spouses <- base::strsplit(spouses, "\n") + spouses <- base::unlist(spouses) if(length(spouses) > 0) { return(length(spouses)) } @@ -37,26 +38,26 @@ getNoOfSpouses <- function(article) { ### Converts info box to table getInfoBox <- function(article) { # Read page as html - page <- read_html(article) - + page <- xml2::read_html(article) + # Extracting text from the html will erase all <br> tags, # this will replace them with line breaks - - xml_find_all(page, ".//br") %>% - xml_add_sibling("p", "\n") - - xml_find_all(page, ".//br") %>% - xml_remove() - + + xml2::xml_find_all(page, ".//br") %>% + xml2::xml_add_sibling("p", "\n") + + xml2::xml_find_all(page, ".//br") %>% + xml2::xml_remove() + # Get the info box # Will throw an error if there isnt any, so that should be checked beforehand - + table <- page %>% - html_nodes("table.vcard") %>% - html_table(fill = TRUE) %>% + rvest::html_nodes("table.vcard") %>% + rvest::html_table(fill = TRUE) %>% .[[1]] - + colnames(table) <- 
c("Desc", "Content") - + return(table) } diff --git a/processing/wikiproc/R/createAnnotations.R b/processing/wikiproc/R/createAnnotations.R index b9ca6ebea7029055fc484efb723beb4605a607c9..6deb6a538e7d39472bc5d25a00f93a227ebeac12 100644 --- a/processing/wikiproc/R/createAnnotations.R +++ b/processing/wikiproc/R/createAnnotations.R @@ -1,14 +1,15 @@ library(cleanNLP) +#' @export createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) { - + # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces # To prevent this we stip 'em again - + filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE) - + # Check if there is a cached version of the annotations for this article in this specific revision - + if(use.cache & file.exists(filename)) { res <- tryCatch({ data <- readRDS(filename) @@ -18,11 +19,11 @@ createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE }) return(res) } - - annotation <- cnlp_annotate(text, as_strings = TRUE) - + + annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE) + # Write cache if desired - + if(write.cache) { if (!dir.exists("data")) { dir.create("data") @@ -32,10 +33,10 @@ createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE } saveRDS(annotation, filename) } - + # Return data # On a side note: Should we do this? The tidyverse style guide discourages explicit returns. # But then again, it suggests snake case for variables... - + return(annotation) -} \ No newline at end of file +} diff --git a/processing/wikiproc/man/cleanHtml.Rd b/processing/wikiproc/man/cleanHtml.Rd index 56994f44d9eadcd9f8cb1fee71bd54d91e518629..7247852e83e59f5f6c7ba54df2f90692b7f98a9c 100644 --- a/processing/wikiproc/man/cleanHtml.Rd +++ b/processing/wikiproc/man/cleanHtml.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/CleanHtml.R \name{cleanHtml} \alias{cleanHtml} -\title{Clean a html formatted wikipedia page. +\title{Clean a html formatted wikipedia page. Nodes of interest from the DOM are extracted and then cleaned from all html tags and annotations.} \usage{ @@ -15,7 +15,7 @@ cleanHtml(html) Plaintext document containing only the maintext of the give wikipedia page. } \description{ -Clean a html formatted wikipedia page. +Clean a html formatted wikipedia page. Nodes of interest from the DOM are extracted and then cleaned from all html tags and annotations. 
 }
diff --git a/processing/wikiproc/man/getBirthplace.Rd b/processing/wikiproc/man/getBirthplace.Rd
index 8dd69d7d49dcc12d33914ecdaed68baf1d0fa78c..b1b083083922001d92b76f97cfb38ec7ca0049bf 100644
--- a/processing/wikiproc/man/getBirthplace.Rd
+++ b/processing/wikiproc/man/getBirthplace.Rd
@@ -4,7 +4,7 @@
 \alias{getBirthplace}
 \title{This script extracts Birthplace from physicist texts
 Try to get the infobox and extract the birthplace
-If there is no infobox, 0 will be returned as 
+If there is no infobox, 0 will be returned as
 birthplace is hard to extract from text}
 \usage{
 getBirthplace(article)
@@ -18,6 +18,6 @@ String with birthplace of the physicist|0
 \description{
 This script extracts Birthplace from physicist texts
 Try to get the infobox and extract the birthplace
-If there is no infobox, 0 will be returned as 
+If there is no infobox, 0 will be returned as
 birthplace is hard to extract from text
 }
diff --git a/processing/wikiproc/man/getData.Rd b/processing/wikiproc/man/getData.Rd
index 13e362d15d94d684eb53fe4c6a8001bc4b89949e..ec865807ac52c5cc079ccda9998e01632f97e969 100644
--- a/processing/wikiproc/man/getData.Rd
+++ b/processing/wikiproc/man/getData.Rd
@@ -4,12 +4,14 @@
 \alias{getData}
 \title{Retrieve wikipedia articles about physicists}
 \usage{
-getData(use.cache = TRUE, write.cache = FALSE)
+getData(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
 }
 \arguments{
 \item{use.cache}{Use cached data if it exists over downloading new data}
 
 \item{write.cache}{Write downloaded results into cache for use on future calls}
+
+\item{data.dir}{Directory the data should be read from and/or written to}
 }
 \value{
 data.frame containing the title, id, revisionID and html-formatted full text