Commit 002a88f1 authored by Lucas Schons's avatar Lucas Schons

Replace implicit function calls with explicit ones

parent c69e2f7e
@@ -15,7 +15,7 @@ cat("Starting data import...\n")
project_root <- find_root(has_file("README.md"))
data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
-articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)
+articles <- wikiproc::getData(use.cache = TRUE, data.dir = data_dir)
## Data processing
@@ -27,7 +27,7 @@ results <- pbapply(articles, 1, function(article) {
## Data cleaning
-cleaned.text <- wikiproc:::cleanHtml(article[4])
+cleaned.text <- wikiproc::cleanHtml(article[4])
## Data preprocessing/annotating
@@ -35,7 +35,7 @@ results <- pbapply(articles, 1, function(article) {
## Extract information from Text
-no.spouses <- wikiproc:::getNoOfSpouses(article[4])
+no.spouses <- wikiproc::getNoOfSpouses(article[4])
## Create Results
......
Package: wikiproc
-Title: What the Package Does (one line, title case)
+Title: Collection of Data Processing Utility Functions
Version: 0.0.0.9000
Authors@R: c(
person("David", "Fuhry", role = "aut"),
@@ -9,7 +9,7 @@ Author: David Fuhry [aut],
Lukas Gehrke [aut],
Lucas Schons [aut]
Maintainer: David Fuhry <not.an@email.address.net>
-Description: What the package does (one paragraph).
+Description: This package contains various functions that are needed to transform raw Wikipedia HTML into processable facts.
Depends: R (>= 3.5.0)
License: GPL-2
Encoding: UTF-8
@@ -18,9 +18,9 @@ RoxygenNote: 6.1.1
Imports:
pbapply,
rvest,
-stringi,
textclean,
stringr,
+stringi,
data.table,
xml2,
WikipediR,
......
# Generated by roxygen2: do not edit by hand
+export(cleanHtml)
+export(createAnnotations)
+export(getBirthdate)
+export(getBirthplace)
+export(getData)
+export(getNoOfSpouses)
@@ -5,11 +5,13 @@
library(rvest)
library(stringi)
library(textclean)
+library(xml2)
#' Clean a html formatted wikipedia page.
#' Nodes of interest from the DOM are extracted and then cleaned from all html
#' tags and annotations.
#'
+#' @export
#' @param html Url linking to a wikipedia webpage or a html formatted document.
#' @return Plaintext document containing only the main text of the given wikipedia page.
cleanHtml <- function(html) {
@@ -22,10 +24,10 @@ cleanHtml <- function(html) {
# - remove whitespace after newline
# - remove whitespace before punctuation
# - replace multiple newlines with single newline
-result <- read_html(html) %>%
-html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
-stri_flatten(collapse = " ") %>%
-replace_html() %>%
+result <- xml2::read_html(html) %>%
+rvest::html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
+stringi::stri_flatten(collapse = " ") %>%
+textclean::replace_html() %>%
gsub("\\[\\d*\\]", "", .) %>%
gsub(" +", " ", .) %>%
gsub("\n ", "\n", .) %>%
......
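With cleanHtml now exported, callers can drop the `:::`. A minimal usage sketch (the article URL is only an illustrative assumption, not taken from the project):
library(wikiproc)
text <- cleanHtml("https://en.wikipedia.org/wiki/Albert_Einstein")
cat(substr(text, 1, 200))  # first few sentences of plain article text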
@@ -13,15 +13,16 @@ library(xml2)
#' If there is no infobox, the first paragraph of the article
#' will be checked for the birthdate
#'
+#' @export
#' @param article Article in HTML-format
#' @return Birthdate as a string, or NULL
getBirthdate <- function(article) {
if(grepl("vcard", article)) {
# Check if there is an infobox
infoBox <- getInfoBox(article)
# Get the Born field
birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
# Remove everything except the birthdate:
@@ -29,24 +30,24 @@ getBirthdate <- function(article) {
birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate)
# - Remove everything starting with newline
birthdate <- gsub("\\n.*$", "", birthdate)
return(birthdate)
} else if(!getIntroduction(article) == "") {
# Check first paragraph
introduction <- getIntroduction(article)
# Get birthdate inside of parentheses
-birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]]
+birthdate <- stringr::str_extract_all(introduction, "\\([^()]+\\)")[[1]]
# Remove parentheses
birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
return(birthdate)
} else {
# Return Null if there is no birthdate
return(NULL)
}
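For the infobox branch, the two gsub steps can be traced on a fabricated Born value (the string below is made up for illustration):
born <- "(1879-03-14)14 March 1879\nUlm, Kingdom of Wuerttemberg"
born <- gsub("\\s*\\([^\\)]+\\)", "", born)  # strip the hidden "(1879-03-14)" span
born <- gsub("\\n.*$", "", born)             # cut everything from the newline onward
born                                         # "14 March 1879"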
@@ -55,27 +56,27 @@ getBirthdate <- function(article) {
### Converts info box to table
getInfoBox <- function(article) {
# Read page as html
-page <- read_html(article)
+page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
# This will replace them with line breaks
-xml_find_all(page, ".//br") %>%
-xml_add_sibling("p", "\n")
-xml_find_all(page, ".//br") %>%
-xml_remove()
+xml2::xml_find_all(page, ".//br") %>%
+xml2::xml_add_sibling("p", "\n")
+xml2::xml_find_all(page, ".//br") %>%
+xml2::xml_remove()
# Get the info box
# Will throw an error if there isn't any, so that should be checked beforehand
table <- page %>%
html_nodes("table.vcard") %>%
html_table(fill = TRUE) %>%
rvest::html_nodes("table.vcard") %>%
rvest::html_table(fill = TRUE) %>%
.[[1]]
colnames(table) <- c("Desc", "Content")
return(table)
}
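The <br>-to-newline workaround above can be seen in isolation on a toy snippet (fabricated HTML, not project code):
library(xml2)
doc <- read_html("<table><tr><td>Born</td><td>1900<br>Berlin</td></tr></table>")
xml_add_sibling(xml_find_all(doc, ".//br"), "p", "\n")  # put a newline next to every <br>
xml_remove(xml_find_all(doc, ".//br"))                  # then drop the <br> tags themselves
xml_text(xml_find_first(doc, ".//td[2]"))               # "1900\nBerlin" instead of "1900Berlin"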
@@ -85,28 +86,28 @@ getInfoBox <- function(article) {
#' @return string introduction text from wikipedia article
getIntroduction <- function(article) {
# Read page as html
-page <- read_html(article)
+page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
# This will replace them with line breaks
-xml_find_all(page, ".//br") %>%
-xml_add_sibling("p", "\n")
-xml_find_all(page, ".//br") %>%
-xml_remove
+xml2::xml_find_all(page, ".//br") %>%
+xml2::xml_add_sibling("p", "\n")
+xml2::xml_find_all(page, ".//br") %>%
+xml2::xml_remove()
# Get all paragraphs
paragraphs <- page %>%
html_nodes("p") %>%
html_text()
rvest::html_nodes("p") %>%
rvest::html_text()
# There will be some leading paragraphs containing only "\n"
# Remove those leading paragraphs
remove <- c("\n")
-cleaned <- setdiff(paragraphs, remove)
+cleaned <- base::setdiff(paragraphs, remove)
introduction <- cleaned[1]
# Return first paragraph
return(introduction)
}
@@ -10,28 +10,29 @@ library(data.table)
#' This script extracts Birthplace from physicist texts
#' Try to get the infobox and extract the birthplace
-#' If there is no infobox, 0 will be returned as 
+#' If there is no infobox, 0 will be returned as
#' birthplace is hard to extract from text
-#' 
+#'
+#' @export
#' @param article Article in HTML-format
#' @return String with birthplace of the physicist|0
getBirthplace <- function(article) {
# If there is no infobox we return 0
if(!grepl("vcard", article)) {
return(0)
}
# Use infobox to get Birthplace
infoBox <- getInfoBox(article)
# Get 'Born' field
birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
# Remove everything in front of the "\n"
# Rest is birthplace
birthplace <- gsub(".*\\\n", "", birthplace)
# return birthplace
return(birthplace)
}
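The birthplace regex is the mirror image of the birthdate case: everything up to and including the newline is dropped, leaving the place (Born value fabricated for illustration):
born <- "14 March 1879\nUlm, Kingdom of Wuerttemberg"
gsub(".*\\\n", "", born)  # "Ulm, Kingdom of Wuerttemberg"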
@@ -39,26 +40,26 @@ getBirthplace <- function(article) {
### Converts info box to table
getInfoBox <- function(article) {
# Read page as html
-page <- read_html(article)
+page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
# this will replace them with line breaks
-xml_find_all(page, ".//br") %>%
-xml_add_sibling("p", "\n")
-xml_find_all(page, ".//br") %>%
-xml_remove()
+xml2::xml_find_all(page, ".//br") %>%
+xml2::xml_add_sibling("p", "\n")
+xml2::xml_find_all(page, ".//br") %>%
+xml2::xml_remove()
# Get the info box
# Will throw an error if there isn't any, so that should be checked beforehand
table <- page %>%
html_nodes("table.vcard") %>%
html_table(fill = TRUE) %>%
rvest::html_nodes("table.vcard") %>%
rvest::tml_table(fill = TRUE) %>%
.[[1]]
colnames(table) <- c("Desc", "Content")
return(table)
}
@@ -14,6 +14,7 @@ library(xml2)
#' Retrieve wikipedia articles about physicists
#'
+#' @export
#' @param use.cache Use cached data if it exists over downloading new data
#' @param write.cache Write downloaded results into cache for use on future calls
#' @param data.dir Directory the data should be read from and/or written to
@@ -40,7 +41,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
cat("Downloading list from wikipedia... ")
-page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
+page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists")
cat("Done.\n")
@@ -49,8 +50,8 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
cat("Processing data:\n")
physicists <- page %>%
-html_nodes(".mw-parser-output li a") %>%
-html_attr("href")
+rvest::html_nodes(".mw-parser-output li a") %>%
+rvest::html_attr("href")
# Clean the list
@@ -81,15 +82,15 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
# Call the wikipedia api for each entry in our list
-articles <- pblapply(physicists, function(x) {
+articles <- pbapply::pblapply(physicists, function(x) {
res <- tryCatch({
-article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
+article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
# Check if the article is a redirect page
if (grepl(".redirectText", article$parse$text$`*`)) {
# Get the real article name
-pname <- read_html(article$parse$text$`*`) %>%
-html_nodes(".redirectText a") %>%
-html_attr("href")
+pname <- xml2::read_html(article$parse$text$`*`) %>%
+rvest::html_nodes(".redirectText a") %>%
+rvest::html_attr("href")
pname <- gsub("_", " ", pname)
@@ -101,7 +102,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
Encoding(tmp) <- "UTF-8"
pname <- tmp
-article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
+article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
}
data.frame(Title = article$parse$title,
PageID = article$parse$pageid,
......
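The redirect handling boils down to spotting the redirect marker in the returned HTML and following the link it contains. A standalone sketch of just that step (the HTML string is fabricated):
library(xml2)
html <- '<ul class="redirectText"><li><a href="/wiki/Marie_Curie">Marie Curie</a></li></ul>'
if (grepl(".redirectText", html)) {
  pname <- rvest::html_attr(rvest::html_nodes(read_html(html), ".redirectText a"), "href")
  pname <- gsub("_", " ", pname)  # "/wiki/Marie Curie"; note the "/wiki/" prefix is still attached here
}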
@@ -9,25 +9,26 @@
library(rvest)
library(data.table)
library(xml2)
### Get number of spouses
+#' @export
getNoOfSpouses <- function(article) {
# If there is no infobox we assume there were no spouses
if(!grepl("vcard", article)) {
return(0)
}
infoBox <- getInfoBox(article)
# Get the spouse field
spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
# Remove everything in parentheses
spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
# Split the strings by newlines to get one spouse per line
-spouses <- strsplit(spouses, "\n")
-spouses <- unlist(spouses)
+spouses <- base::strsplit(spouses, "\n")
+spouses <- base::unlist(spouses)
if(length(spouses) > 0) {
return(length(spouses))
}
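Traced on a fabricated Spouse field, the count falls out like this:
spouses <- "Mileva Maric (m. 1903; div. 1919)\nElsa Lowenthal (m. 1919; died 1936)"
spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)   # "Mileva Maric\nElsa Lowenthal"
spouses <- base::unlist(base::strsplit(spouses, "\n"))
length(spouses)                                     # 2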
@@ -37,26 +38,26 @@ getNoOfSpouses <- function(article) {
### Converts info box to table
getInfoBox <- function(article) {
# Read page as html
-page <- read_html(article)
+page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
# this will replace them with line breaks
-xml_find_all(page, ".//br") %>%
-xml_add_sibling("p", "\n")
-xml_find_all(page, ".//br") %>%
-xml_remove()
+xml2::xml_find_all(page, ".//br") %>%
+xml2::xml_add_sibling("p", "\n")
+xml2::xml_find_all(page, ".//br") %>%
+xml2::xml_remove()
# Get the info box
# Will throw an error if there isn't any, so that should be checked beforehand
table <- page %>%
html_nodes("table.vcard") %>%
html_table(fill = TRUE) %>%
rvest::html_nodes("table.vcard") %>%
rvest::html_table(fill = TRUE) %>%
.[[1]]
colnames(table) <- c("Desc", "Content")
return(table)
}
library(cleanNLP)
+#' @export
createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
# Generate filename; for some reason paste0 will pad the article id with leading whitespace
# To prevent this we strip it again
filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
# Check if there is a cached version of the annotations for this article in this specific revision
if(use.cache & file.exists(filename)) {
res <- tryCatch({
data <- readRDS(filename)
@@ -18,11 +19,11 @@ createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE
})
return(res)
}
-annotation <- cnlp_annotate(text, as_strings = TRUE)
+annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
# Write cache if desired
if(write.cache) {
if (!dir.exists("data")) {
dir.create("data")
@@ -32,10 +33,10 @@ createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE
}
saveRDS(annotation, filename)
}
# Return data
# On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
# But then again, it suggests snake case for variables...
return(annotation)
-}
\ No newline at end of file
+}
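The cache key is simply the article id plus its revision id. A sketch of the round trip under the same naming scheme (the ids and the stand-in annotation object are assumptions):
ann <- list(tokens = c("a", "b"))  # stand-in for a cnlp_annotate() result
filename <- gsub(" ", "", paste0("data/annotations/", 42, "-", 1234, ".RDS"), fixed = TRUE)
dir.create("data/annotations", recursive = TRUE, showWarnings = FALSE)
saveRDS(ann, filename)
identical(readRDS(filename), ann)  # TRUE: later calls skip the expensive annotation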
@@ -2,7 +2,7 @@
% Please edit documentation in R/CleanHtml.R
\name{cleanHtml}
\alias{cleanHtml}
-\title{Clean a html formatted wikipedia page. 
+\title{Clean a html formatted wikipedia page.
Nodes of interest from the DOM are extracted and then cleaned from all html
tags and annotations.}
\usage{
@@ -15,7 +15,7 @@ cleanHtml(html)
Plaintext document containing only the main text of the given wikipedia page.
}
\description{
-Clean a html formatted wikipedia page. 
+Clean a html formatted wikipedia page.
Nodes of interest from the DOM are extracted and then cleaned from all html
tags and annotations.
}
@@ -4,7 +4,7 @@
\alias{getBirthplace}
\title{This script extracts Birthplace from physicist texts
Try to get the infobox and extract the birthplace
-If there is no infobox, 0 will be returned as 
+If there is no infobox, 0 will be returned as
birthplace is hard to extract from text}
\usage{
getBirthplace(article)
@@ -18,6 +18,6 @@ String with birthplace of the physicist|0
\description{
This script extracts Birthplace from physicist texts
Try to get the infobox and extract the birthplace
-If there is no infobox, 0 will be returned as 
+If there is no infobox, 0 will be returned as
birthplace is hard to extract from text
}
@@ -4,12 +4,14 @@
\alias{getData}
\title{Retrieve wikipedia articles about physicists}
\usage{
-getData(use.cache = TRUE, write.cache = FALSE)
+getData(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
}
\arguments{
\item{use.cache}{Use cached data if it exists over downloading new data}
\item{write.cache}{Write downloaded results into cache for use on future calls}
+\item{data.dir}{Directory the data should be read from and/or written to}
}
\value{
data.frame containing the title, id, revisionID and html-formatted full text
......