Commit 937b3fe7 authored by David Fuhry

Refactoring, using caches

Implements #10
Fixes #11
Refactored to provide function getData()
parent 91947105
1 merge request: !6 Use wikipedia api
@@ -9,68 +9,91 @@ library(rvest) # For getting the list of physicists
## Instead we crawl the names on the article 'List of Physicists' and query those names
## which gives us something short of a thousand articles
### Get the list of names
# Download page
page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
# Extract links as the names given here are not the article names in about 20 cases
physicists <- page %>%
html_nodes(".mw-parser-output li a") %>%
html_attr("href")
# Clean the list
physicists <- physicists[nchar(physicists) > 5]
length(physicists) <- length(physicists) - 3
physicists <- gsub("_", " ", physicists)
physicists <- gsub("/wiki/", "", physicists)
physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
# This is probably only needed on Windows (and perhaps OS X), as R on Windows messes quite a bit with the encoding
# On Linux, `physicists <- URLdecode(physicists)` should do the trick
physicists <- sapply(physicists, function(x) {
tmp <- URLdecode(x)
Encoding(tmp) <- "UTF-8"
tmp
})
names(physicists) <- NULL
### Get articles
# Call the wikipedia api for each entry in our list
articles <- lapply(physicists, function(x) {
res <- tryCatch({
article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
data.frame(Title = article$parse$title,
PageID = article$parse$pageid,
RevID = article$parse$revid,
Text = article$parse$text$`*`,
stringsAsFactors = FALSE)
}, error = function(e) {
cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e),"\n")
#' Retrieve wikipedia articles about physicists
#'
#' @param use.cache Use cached data, if it exists, instead of downloading it again
#' @param write.cache Write the downloaded results to the cache for use on future calls
#' @return A data.frame containing the title, page ID, revision ID and HTML-formatted full text of each article
getData <- function(use.cache = TRUE, write.cache = FALSE) {
### First we check if the data already exists and try to load it if it does
if(file.exists("data/articles.RDS") & use.cache ) {
res <- tryCatch({
data <- readRDS("data/articles.RDS")
data
}, error = function (e) {
cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes")
})
return(res)
}
### Get the list of names
# Download page
page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
# Extract links as the names given here are not the article names in about 20 cases
physicists <- page %>%
html_nodes(".mw-parser-output li a") %>%
html_attr("href")
# Clean the list
physicists <- physicists[nchar(physicists) > 5]
length(physicists) <- length(physicists) - 3
physicists <- gsub("_", " ", physicists)
physicists <- gsub("/wiki/", "", physicists)
physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
# This is probably only needed on Windows (and perhaps OS X), as R on Windows messes quite a bit with the encoding
# On Linux, `physicists <- URLdecode(physicists)` should do the trick
physicists <- sapply(physicists, function(x) {
tmp <- URLdecode(x)
Encoding(tmp) <- "UTF-8"
tmp
})
})
# Bind it all together
articles <- do.call(rbind, articles)
# Write result
names(physicists) <- NULL
### Get articles
# Call the wikipedia api for each entry in our list
articles <- lapply(physicists, function(x) {
res <- tryCatch({
article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
data.frame(Title = article$parse$title,
PageID = article$parse$pageid,
RevID = article$parse$revid,
Text = article$parse$text$`*`,
stringsAsFactors = FALSE)
}, error = function(e) {
cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e),"\n")
})
})
# Bind it all together
articles <- do.call(rbind, articles)
# Write result if desired
if(write.cache) {
write.table(articles, "data/articles.csv")
saveRDS(articles, "data/articles.RDS")
}
return(articles)
}
write.table(articles, "../data/articles.csv")
saveRDS(articles, "../data/articles.RDS")
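
A minimal usage sketch of the new getData() function introduced by this commit. This is an illustration, not part of the diff: it assumes the script has been sourced, that the WikipediR package (which supplies page_content()) is loaded alongside rvest, and that a data/ directory exists for the cache files.

library(rvest)      # assumed: used internally to scrape the list of physicists
library(WikipediR)  # assumed: provides page_content() for the API calls

# First call: crawls something short of a thousand articles and, with write.cache = TRUE,
# stores them in data/articles.RDS and data/articles.csv
articles <- getData(use.cache = TRUE, write.cache = TRUE)

# Subsequent calls with the default use.cache = TRUE read the cached data.frame instead of re-downloading
articles <- getData()
str(articles)  # expected columns: Title, PageID, RevID, Text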