Commit f3165856 authored by David Fuhry

Minor tweaks

* Fixed a few function calls
* Deleted an unneeded R script
* Deleted unneeded function declarations
* Some cleanup
parent 83529e59
3 merge requests: !34 Resolve "Add unit tests for clean_html.R", !28 WIP: Resolve "Create pattern matching function", !27 Resolve "Add unit tests for cleanHtml.R"
@@ -30,7 +30,7 @@ results <- pbapply(articles, 1, function(article) {
## Data cleaning
-  cleaned.text <- wikiproc::cleanHtml(article[4])
+  cleaned.text <- wikiproc::clean_html(article[4])
## Data preprocessing/annotating
......
@@ -7,9 +7,4 @@ export(get_birthplace)
export(get_data)
export(get_no_of_spouses)
export(init_nlp)
-import(rvest)
-importFrom(data.table,"%like%")
-importFrom(xml2,read_html)
-importFrom(xml2,xml_add_sibling)
-importFrom(xml2,xml_find_all)
-importFrom(xml2,xml_remove)
library(cleanNLP)
#' @export
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
  # Generate the file name; for some reason paste0 pads the article id with leading whitespace,
  # so we strip it out again
  filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
# Check if there is a cached version of the annotations for this article in this specific revision
  if (use.cache && file.exists(filename)) {
    res <- tryCatch({
      readRDS(filename)
    }, error = function(e) {
      cat("Cached data seems to be corrupted, redoing annotation.\n")
      NULL
    })
    # Only short-circuit if the cached annotation could actually be read
    if (!is.null(res)) {
      return(res)
    }
  }
annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
# Write cache if desired
if(write.cache) {
if (!dir.exists("data")) {
dir.create("data")
}
if (!dir.exists("data/annotations")) {
dir.create("data/annotations")
}
saveRDS(annotation, filename)
}
# Return data
# On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
# But then again, it suggests snake case for variables...
return(annotation)
}
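For reference, a minimal sketch of how create_annotations might be called from a processing script; the text and id values here are invented for illustration, and it assumes the NLP backend has already been initialised (e.g. via init_nlp):

text <- "Albert Einstein was a German-born theoretical physicist."
anno <- create_annotations(text, article.id = 42, article.rev.id = 887,
                           use.cache = TRUE, write.cache = TRUE)
# With the cleanNLP 2.x API used above, tokens can then be pulled out with:
tokens <- cleanNLP::cnlp_get_token(anno)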
@@ -21,7 +21,7 @@ get_birthdate <- function(article) {
if(grepl("vcard", article)) {
# Check if there is an infobox
-    infoBox <- getInfoBox(article)
+    infoBox <- get_infobox(article)
# Get the Born field
birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
@@ -53,33 +53,6 @@ get_birthdate <- function(article) {
}
}
-### Converts info box to table
-getInfoBox <- function(article) {
-  # Read page as html
-  page <- xml2::read_html(article)
-  # Extracting text from the html will erase all <br> tags,
-  # this will replace them with line breaks
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_add_sibling("p", "\n")
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_remove()
-  # Get the info box
-  # Will throw an error if there isn't any, so that should be checked beforehand
-  table <- page %>%
-    rvest::html_nodes("table.vcard") %>%
-    rvest::html_table(fill = TRUE) %>%
-    .[[1]]
-  colnames(table) <- c("Desc", "Content")
-  return(table)
-}
#' Get Introduction Text from Wikipedia page that contains birthdate
#'
#' @param article article in HTML-format
......
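As an aside, the %like% operator used for the "Born" lookup comes from data.table and is essentially a grepl() wrapper, so it also works on plain data frames; a small sketch with invented infobox rows:

library(data.table)
infoBox <- data.frame(Desc = c("Born", "Died"),
                      Content = c("14 March 1879, Ulm", "18 April 1955"),
                      stringsAsFactors = FALSE)
# Selects the Content of every row whose Desc matches "Born"
infoBox[infoBox$Desc %like% "Born", ]$Content
#> [1] "14 March 1879, Ulm"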
@@ -24,7 +24,7 @@ get_birthplace <- function(article) {
}
# Use infobox to get Birthplace
-  infoBox <- getInfoBox(article)
+  infoBox <- get_infobox(article)
# Get 'Born' field
birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
@@ -36,30 +36,3 @@ get_birthplace <- function(article) {
# return birthplace
return(birthplace)
}
-### Converts info box to table
-getInfoBox <- function(article) {
-  # Read page as html
-  page <- xml2::read_html(article)
-  # Extracting text from the html will erase all <br> tags,
-  # this will replace them with line breaks
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_add_sibling("p", "\n")
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_remove()
-  # Get the info box
-  # Will throw an error if there isn't any, so that should be checked beforehand
-  table <- page %>%
-    rvest::html_nodes("table.vcard") %>%
-    rvest::html_table(fill = TRUE) %>%
-    .[[1]]
-  colnames(table) <- c("Desc", "Content")
-  return(table)
-}
@@ -10,7 +10,6 @@
#' Retrieve wikipedia articles about physicists
#'
#' @export
#' @param use.cache Use cached data if it exists over downloading new data
#' @param write.cache Write downloaded results into cache for use on future calls
#' @param data.dir Directory the data should be read from and/or written to
@@ -38,7 +37,7 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
cat("Downloading list from wikipedia... ")
-  page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists")
+  page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
cat("Done.\n")
@@ -47,8 +46,8 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
cat("Processing data:\n")
physicists <- page %>%
-    rvest::html_nodes(".mw-parser-output li a") %>%
-    rvest::html_attr("href")
+    html_nodes(".mw-parser-output li a") %>%
+    html_attr("href")
# Clean the list
@@ -85,9 +84,9 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
# Check if the article is a redirect page
if (grepl(".redirectText", article$parse$text$`*`)) {
# Get the real article name
-      pname <- xml2::read_html(article$parse$text$`*`) %>%
-        rvest::html_nodes(".redirectText a") %>%
-        rvest::html_attr("href")
+      pname <- read_html(article$parse$text$`*`) %>%
+        html_nodes(".redirectText a") %>%
+        html_attr("href")
      pname <- gsub("_", " ", pname)
......
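For context, the article objects handled here look like responses from the MediaWiki action=parse API, where the rendered HTML sits under parse$text$`*`. A rough, self-contained sketch of fetching one article that way (using jsonlite is an assumption for illustration, not necessarily how get_data does it):

library(jsonlite)
url <- paste0("https://en.wikipedia.org/w/api.php",
              "?action=parse&page=Albert%20Einstein&format=json")
article <- fromJSON(url, simplifyVector = FALSE)
# The rendered page HTML as a single string, as used in the redirect check above
html <- article$parse$text$`*`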
@@ -5,7 +5,5 @@
### if some functions are used frequently you may just import them
### with something like @importFrom reshape2 melt cast
-#' @import rvest
-#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
-#' @importFrom data.table %like%
NULL
\ No newline at end of file
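Note that the NAMESPACE deletions earlier in this commit and these roxygen tag deletions are two halves of the same change: NAMESPACE is generated from the #' @import / #' @importFrom tags, so after editing them the file has to be regenerated, typically with something like:

# Re-run roxygen2 so NAMESPACE matches the updated tags
devtools::document()  # or roxygen2::roxygenise()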
@@ -18,16 +18,16 @@
#' }
get_infobox <- function(article) {
# Read page as html
-  page <- read_html(article)
+  page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
# this will replace them with line breaks
-  xml_find_all(page, ".//br") %>%
-    xml_add_sibling("p", "\n")
+  xml2::xml_find_all(page, ".//br") %>%
+    xml2::xml_add_sibling("p", "\n")
-  xml_find_all(page, ".//br") %>%
-    xml_remove()
+  xml2::xml_find_all(page, ".//br") %>%
+    xml2::xml_remove()
# Get the info box
  # Will throw an error if there isn't any, so that should be checked beforehand
......
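To see what the <br> handling in get_infobox actually does, here is a small self-contained sketch on an invented vcard table; without the replacement, html_table() would silently join "14 March 1879" and "Ulm, Germany" with no separator:

library(magrittr)
html <- paste0('<table class="vcard">',
               '<tr><th>Born</th><td>14 March 1879<br>Ulm, Germany</td></tr>',
               '<tr><th>Died</th><td>18 April 1955</td></tr>',
               '</table>')
page <- xml2::read_html(html)
# Insert a literal newline after each <br>, then drop the <br> nodes themselves
xml2::xml_find_all(page, ".//br") %>% xml2::xml_add_sibling("p", "\n")
xml2::xml_find_all(page, ".//br") %>% xml2::xml_remove()
infobox <- page %>%
  rvest::html_nodes("table.vcard") %>%
  rvest::html_table(fill = TRUE) %>%
  .[[1]]
colnames(infobox) <- c("Desc", "Content")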