Skip to content
Snippets Groups Projects
Commit f3165856 authored by David Fuhry's avatar David Fuhry
Browse files

Minor tweeks

* Fixed a few function calls
* Deleted unneeded r script
* Deleted unneeded function declarations
* Some cleanup
parent 83529e59
No related branches found
No related tags found
3 merge requests!34Resolve "Add unit tests for clean_html.R",!28WIP: Resolve "Create pattern matching function",!27Resolve "Add unit tests for cleanHtml.R"
...@@ -30,7 +30,7 @@ results <- pbapply(articles, 1, function(article) { ...@@ -30,7 +30,7 @@ results <- pbapply(articles, 1, function(article) {
## Data cleaning ## Data cleaning
cleaned.text <- wikiproc::cleanHtml(article[4]) cleaned.text <- wikiproc::clean_html(article[4])
## Data preprocessing/annotating ## Data preprocessing/annotating
......
...@@ -7,9 +7,4 @@ export(get_birthplace) ...@@ -7,9 +7,4 @@ export(get_birthplace)
export(get_data) export(get_data)
export(get_no_of_spouses) export(get_no_of_spouses)
export(init_nlp) export(init_nlp)
import(rvest)
importFrom(data.table,"%like%") importFrom(data.table,"%like%")
importFrom(xml2,read_html)
importFrom(xml2,xml_add_sibling)
importFrom(xml2,xml_find_all)
importFrom(xml2,xml_remove)
library(cleanNLP)
#' @export
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
# Generate filename, for some reason there paste0 will pad the article id with leading whitespaces
# To prevent this we stip 'em again
filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
# Check if there is a cached version of the annotations for this article in this specific revision
if(use.cache & file.exists(filename)) {
res <- tryCatch({
data <- readRDS(filename)
data
}, error = function (e) {
cat("Cached data seems to be corrupted, redoing annotation.\n")
})
return(res)
}
annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
# Write cache if desired
if(write.cache) {
if (!dir.exists("data")) {
dir.create("data")
}
if (!dir.exists("data/annotations")) {
dir.create("data/annotations")
}
saveRDS(annotation, filename)
}
# Return data
# On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
# But then again, it suggests snake case for variables...
return(annotation)
}
...@@ -21,7 +21,7 @@ get_birthdate <- function(article) { ...@@ -21,7 +21,7 @@ get_birthdate <- function(article) {
if(grepl("vcard", article)) { if(grepl("vcard", article)) {
# Check if there is an infobox # Check if there is an infobox
infoBox <- getInfoBox(article) infoBox <- get_infobox(article)
# Get the Born field # Get the Born field
birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
...@@ -53,33 +53,6 @@ get_birthdate <- function(article) { ...@@ -53,33 +53,6 @@ get_birthdate <- function(article) {
} }
} }
### Converts info box to table
getInfoBox <- function(article) {
# Read page as html
page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
# This will replace them with line breaks
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_add_sibling("p", "\n")
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_remove()
# Get the info box
# Will throw an error if there isnt any, so that should be checked beforehand
table <- page %>%
rvest::html_nodes("table.vcard") %>%
rvest::html_table(fill = TRUE) %>%
.[[1]]
colnames(table) <- c("Desc", "Content")
return(table)
}
#' Get Introduction Text from Wikipedia page that contains birthdate #' Get Introduction Text from Wikipedia page that contains birthdate
#' #'
#' @param article article in HTML-format #' @param article article in HTML-format
......
...@@ -24,7 +24,7 @@ get_birthplace <- function(article) { ...@@ -24,7 +24,7 @@ get_birthplace <- function(article) {
} }
# Use infobox to get Birthplace # Use infobox to get Birthplace
infoBox <- getInfoBox(article) infoBox <- get_infobox(article)
# Get 'Born' field # Get 'Born' field
birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
...@@ -36,30 +36,3 @@ get_birthplace <- function(article) { ...@@ -36,30 +36,3 @@ get_birthplace <- function(article) {
# return birthplace # return birthplace
return(birthplace) return(birthplace)
} }
### Converts info box to table
getInfoBox <- function(article) {
# Read page as html
page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
# this will replace them with line breaks
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_add_sibling("p", "\n")
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_remove()
# Get the info box
# Will throw an error if there isnt any, so that should be checked beforehand
table <- page %>%
rvest::html_nodes("table.vcard") %>%
rvest::tml_table(fill = TRUE) %>%
.[[1]]
colnames(table) <- c("Desc", "Content")
return(table)
}
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
#' Retrieve wikipedia articles about physicists #' Retrieve wikipedia articles about physicists
#' #'
#' @export
#' @param use.cache Use cached data if it exists over downloading new data #' @param use.cache Use cached data if it exists over downloading new data
#' @param write.cache Write downloaded results into cache for use on future calls #' @param write.cache Write downloaded results into cache for use on future calls
#' @param data.dir Directory the data should be read from and/or written to #' @param data.dir Directory the data should be read from and/or written to
...@@ -38,7 +37,7 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") { ...@@ -38,7 +37,7 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
cat("Downloading list from wikipedia... ") cat("Downloading list from wikipedia... ")
page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists") page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
cat("Done.\n") cat("Done.\n")
...@@ -47,8 +46,8 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") { ...@@ -47,8 +46,8 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
cat("Processing data:\n") cat("Processing data:\n")
physicists <- page %>% physicists <- page %>%
rvest::html_nodes(".mw-parser-output li a") %>% html_nodes(".mw-parser-output li a") %>%
rvest::html_attr("href") html_attr("href")
# Clean the list # Clean the list
...@@ -85,9 +84,9 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") { ...@@ -85,9 +84,9 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
# Check if the article is a redirect page # Check if the article is a redirect page
if (grepl(".redirectText", article$parse$text$`*`)) { if (grepl(".redirectText", article$parse$text$`*`)) {
# Get the real article name # Get the real article name
pname <- xml2::read_html(article$parse$text$`*`) %>% pname <- read_html(article$parse$text$`*`) %>%
rvest::html_nodes(".redirectText a") %>% html_nodes(".redirectText a") %>%
rvest::html_attr("href") html_attr("href")
panme <- gsub("_", " ", pname) panme <- gsub("_", " ", pname)
......
...@@ -5,7 +5,5 @@ ...@@ -5,7 +5,5 @@
### if some functions are used frequently you may just import them ### if some functions are used frequently you may just import them
### with something like @importFrom reshape2 melt cast ### with something like @importFrom reshape2 melt cast
#' @import rvest
#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
#' @importFrom data.table %like% #' @importFrom data.table %like%
NULL NULL
\ No newline at end of file
...@@ -18,16 +18,16 @@ ...@@ -18,16 +18,16 @@
#' } #' }
get_infobox <- function(article) { get_infobox <- function(article) {
# Read page as html # Read page as html
page <- read_html(article) page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags, # Extracting text from the html will erase all <br> tags,
# this will replace them with line breaks # this will replace them with line breaks
xml_find_all(page, ".//br") %>% xml2::xml_find_all(page, ".//br") %>%
xml_add_sibling("p", "\n") xml2::xml_add_sibling("p", "\n")
xml_find_all(page, ".//br") %>% xml2::xml_find_all(page, ".//br") %>%
xml_remove() xml2::xml_remove()
# Get the info box # Get the info box
# Will throw an error if there isnt any, so that should be checked beforehand # Will throw an error if there isnt any, so that should be checked beforehand
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment