Commit c886643b authored by Lucas Schons

Enhance R module

* resolve paths from project root
* modify DESCRIPTION
parent 815047b4
3 merge requests: !34 Resolve "Add unit tests for clean_html.R", !28 WIP: Resolve "Create pattern matching function", !27 Resolve "Add unit tests for cleanHtml.R"
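Note: the central change in this commit replaces relative "../../data/" paths with paths resolved from the project root via rprojroot. A minimal, standalone sketch of that approach (assuming, as in this repository, that the root is marked by a README.md):

library(rprojroot)

# Locate the repository root by looking for a marker file (here: README.md).
project_root <- find_root(has_file("README.md"))

# Build the data directory path relative to that root; file.path(project_root, "data")
# would be an equivalent, slightly more idiomatic alternative to paste().
data_dir <- paste(project_root, "data", sep = .Platform$file.sep)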
@@ -5,12 +5,17 @@
 library(pbapply)
 library(rvest)
 library(wikiproc)
+library(rprojroot)
 ## Fetch data
 cat("Starting data import...\n")
-articles <- getData(use.cache = FALSE, write.cache = TRUE, data.dir = "../../data/")
+# Define paths
+project_root <- find_root(has_file("README.md"))
+data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
+articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)
 ## Data processing
@@ -19,25 +24,25 @@ cat("Processing data:\n")
 results <- pbapply(articles, 1, function(article) {
   # Within this function article is a vector representing a single row of our original data frame
   # This means article[1] represents the Title, article[2] the PageID etc.
   ## Data cleaning
-  cleaned.text <- cleanHtml(article[4])
+  cleaned.text <- wikiproc:::cleanHtml(article[4])
   ## Data preprocessing/annotating
   # annotation <- createAnnotations(cleaned.text, article[2], article[3])
   ## Extract information from Text
-  no.spouses <- getNoOfSpouses(article[4])
+  no.spouses <- wikiproc:::getNoOfSpouses(article[4])
   ## Create Results
   data.frame(Name = article[1],
              NoSpouses = no.spouses,
              stringsAsFactors = FALSE)
 })
 results <- do.call(rbind, results)
@@ -46,7 +51,7 @@ cat("Data processing finished.\n")
 ## Results are now in results
 ## Format for rasa
 cat("Writing rasa files to 'rasa/'...\n")
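For reference, the per-article processing above follows a row-wise apply-then-rbind pattern. A self-contained toy example of that pattern (the input data frame below is made up; the real script uses the downloaded articles):

library(pbapply)

# Toy input standing in for the articles data frame.
articles <- data.frame(Title = c("Marie Curie", "Niels Bohr"),
                       Text  = c("some html", "more html"),
                       stringsAsFactors = FALSE)

# Each call sees one row as a character vector and returns a one-row data.frame.
results <- pbapply(articles, 1, function(article) {
  data.frame(Name = article[1],
             TextLength = nchar(article[2]),
             stringsAsFactors = FALSE)
})

# Combine the list of one-row data.frames into a single result data frame.
results <- do.call(rbind, results)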
@@ -7,4 +7,5 @@ data.table
 xml2
 WikipediR
 reticulate
-cleanNLP
\ No newline at end of file
+cleanNLP
+rprojroot
 Package: wikiproc
 Title: What the Package Does (one line, title case)
 Version: 0.0.0.9000
-Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre"))
+Authors@R: c(
+    person("David", "Fuhry"),
+    person("Lukas", "Gehrke"),
+    person("Lucas", "Schons")
+    )
 Description: What the package does (one paragraph).
 Depends: R (>= 3.5.2)
-License: What license is it under?
+License: GPL-2
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 6.1.1
+Imports:
+    pbapply,
+    rvest,
+    stringi,
+    textclean,
+    stringr,
+    data.table,
+    xml2,
+    WikipediR,
+    reticulate,
+    cleanNLP,
 Suggests:
     testthat
@@ -6,15 +6,14 @@ library(rvest)
 library(stringi)
 library(textclean)
 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
 #'
 #' @param html Url linking to a wikipedia webpage or a html formatted document.
 #' @return Plaintext document containing only the main text of the given wikipedia page.
 cleanHtml <- function(html) {
   # 1. read data from url or html-formatted text
   # 2. extract nodes containing main information (ignore infoboxes, list of literature, ...)
   # 3. collapse vectors into a single one
@@ -33,4 +32,4 @@ cleanHtml <- function(html) {
     gsub(" *([.!?:,'’])", "\\1", .) %>%
     gsub("\n *\n+", "\n", .) %>%
     sub(" ", "", .)
-}
\ No newline at end of file
+}
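A short usage sketch for cleanHtml, based on its documentation above (the article URL is only an example; any Wikipedia page URL or html-formatted string should work):

# cleanHtml accepts either a URL or an html-formatted document and returns
# the article's plain text with tags and annotations stripped.
text <- wikiproc:::cleanHtml("https://en.wikipedia.org/wiki/Niels_Bohr")
cat(substr(text, 1, 300))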
@@ -13,15 +13,15 @@ library(xml2)
 ## which gives us something short of a thousand articles
 #' Retrieve wikipedia articles about physicists
 #'
 #' @param use.cache Use cached data if it exists over downloading new data
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
-  dest.articlesRDS <- paste(data.dir, "articles.RDS", "")
-  dest.articlesCSV <- paste(data.dir, "articles.csv", "")
+getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+  dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
+  dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
   ### First we check if the data already exists and try to load it if it does
   if(file.exists(dest.articlesRDS) && use.cache ) {
     res <- tryCatch({
@@ -33,54 +33,54 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
     })
     return(res)
   }
   ### Get the list of names
   # Download page
   cat("Downloading list from wikipedia... ")
   page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
   cat("Done.\n")
   # Extract links as the names given here are not the article names in about 20 cases
   cat("Processing data:\n")
   physicists <- page %>%
     html_nodes(".mw-parser-output li a") %>%
     html_attr("href")
   # Clean the list
   physicists <- physicists[nchar(physicists) > 5]
   length(physicists) <- length(physicists) - 3
   physicists <- gsub("_", " ", physicists)
   physicists <- gsub("/wiki/", "", physicists)
   physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
   # This is probably only needed on windows (and perhaps os x) as R on windows messes quite a bit with the encoding
   # On linux `physicists <- URLdecode(physicists)` should do the trick
   physicists <- sapply(physicists, function(x) {
     tmp <- URLdecode(x)
     Encoding(tmp) <- "UTF-8"
     tmp
   })
   names(physicists) <- NULL
   cat("Done.\nDownloading articles now. This might take a while.\n")
   ### Get articles
   # Call the wikipedia api for each entry in our list
   articles <- pblapply(physicists, function(x) {
     res <- tryCatch({
       article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
@@ -90,17 +90,17 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
       pname <- read_html(article$parse$text$`*`) %>%
         html_nodes(".redirectText a") %>%
         html_attr("href")
       pname <- gsub("_", " ", pname)
       pname <- gsub("/wiki/", "", pname)
       pname <- gsub("\\s*\\([^\\)]+\\)", "", pname)
       tmp <- URLdecode(pname)
       Encoding(tmp) <- "UTF-8"
       pname <- tmp
       article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
     }
     data.frame(Title = article$parse$title,
@@ -111,17 +111,17 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
     }, error = function(e) {
       cat("Error: Crawling failed for article", x, "with error message: ", conditionMessage(e),"\n")
     })
   })
   # Bind it all together
   articles <- do.call(rbind, articles)
   cat("Download finished.\n")
   # Write result if desired
   if(write.cache) {
     if (!dir.exists(data.dir)) {
       dir.create(data.dir)
@@ -131,8 +131,8 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
     saveRDS(articles, dest.articlesRDS)
     cat("Done.\n")
   }
   cat("Data import finished.\n")
   return(articles)
-}
\ No newline at end of file
+}
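Putting the pieces together, getData can now be called with a cache directory resolved from the project root. A usage sketch combining the changes above (paths are illustrative; write.cache is shown explicitly for clarity):

library(rprojroot)

data_dir <- paste(find_root(has_file("README.md")), "data",
                  sep = .Platform$file.sep)

# Reads articles.RDS from the cache if present; otherwise downloads the
# articles and, because write.cache = TRUE, stores them for future runs.
articles <- wikiproc:::getData(use.cache = TRUE, write.cache = TRUE,
                               data.dir = data_dir)
str(articles)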