Commit 937b3fe7 authored by David Fuhry

Refactoring, using caches

Implements #10
Fixes #11
Refactored to provide function getData()
parent 91947105
Merge request !6: Use wikipedia api
@@ -9,68 +9,91 @@ library(rvest) # For getting the list of physicists
 ## Instead we crawl the names on the article 'List of Physicists' and query those names
 ## which gives us something short of a thousand articles
-### Get the list of names
-
-# Download page
-
-page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
-
-# Extract links as the names given here are not the article names in about 20 cases
-
-physicists <- page %>%
-  html_nodes(".mw-parser-output li a") %>%
-  html_attr("href")
-
-# Clean the list
-
-physicists <- physicists[nchar(physicists) > 5]
-length(physicists) <- length(physicists) - 3
-physicists <- gsub("_", " ", physicists)
-physicists <- gsub("/wiki/", "", physicists)
-physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
-
-# This is probably only needed on windows (and perhaps os x) as R on windows messes quite a bit with the encoding
-# On linux `physicists <- URLdecode(physicists)` should do the trick
-
-physicists <- sapply(physicists, function(x) {
-  tmp <- URLdecode(x)
-  Encoding(tmp) <- "UTF-8"
-  tmp
-})
-names(physicists) <- NULL
-
-### Get articles
-# Call the wikipedia api for each entry in our list
-articles <- lapply(physicists, function(x) {
-  res <- tryCatch({
-    article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
-    data.frame(Title = article$parse$title,
-               PageID = article$parse$pageid,
-               RevID = article$parse$revid,
-               Text = article$parse$text$`*`,
-               stringsAsFactors = FALSE)
-  }, error = function(e) {
-    cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e), "\n")
-  })
-})
-# Bind it all together
-articles <- do.call(rbind, articles)
-
-# Write result
-
-write.table(articles, "../data/articles.csv")
-saveRDS(articles, "../data/articles.RDS")
+#' Retrieve wikipedia articles about physicists
+#'
+#' @param use.cache Use cached data if it exists over downloading new data
+#' @param write.cache Write downloaded results into cache for use on future calls
+#' @return data.frame containing the title, id, revisionID and html-formatted full text
+getData <- function(use.cache = TRUE, write.cache = FALSE) {
+
+  ### First we check if the data already exists and try to load it if it does
+
+  if (file.exists("data/articles.RDS") & use.cache) {
+    res <- tryCatch({
+      data <- readRDS("data/articles.RDS")
+      data
+    }, error = function(e) {
+      cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes")
+    })
+    return(res)
+  }
+
+  ### Get the list of names
+
+  # Download page
+
+  page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
+
+  # Extract links as the names given here are not the article names in about 20 cases
+
+  physicists <- page %>%
+    html_nodes(".mw-parser-output li a") %>%
+    html_attr("href")
+
+  # Clean the list
+
+  physicists <- physicists[nchar(physicists) > 5]
+  length(physicists) <- length(physicists) - 3
+  physicists <- gsub("_", " ", physicists)
+  physicists <- gsub("/wiki/", "", physicists)
+  physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
+
+  # This is probably only needed on windows (and perhaps os x) as R on windows messes quite a bit with the encoding
+  # On linux `physicists <- URLdecode(physicists)` should do the trick
+
+  physicists <- sapply(physicists, function(x) {
+    tmp <- URLdecode(x)
+    Encoding(tmp) <- "UTF-8"
+    tmp
+  })
+  names(physicists) <- NULL
+
+  ### Get articles
+  # Call the wikipedia api for each entry in our list
+  articles <- lapply(physicists, function(x) {
+    res <- tryCatch({
+      article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
+      data.frame(Title = article$parse$title,
+                 PageID = article$parse$pageid,
+                 RevID = article$parse$revid,
+                 Text = article$parse$text$`*`,
+                 stringsAsFactors = FALSE)
+    }, error = function(e) {
+      cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e), "\n")
+    })
+  })
+
+  # Bind it all together
+  articles <- do.call(rbind, articles)
+
+  # Write result if desired
+  if (write.cache) {
+    write.table(articles, "data/articles.csv")
+    saveRDS(articles, "data/articles.RDS")
+  }
+
+  return(articles)
+}
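
For reference, a minimal usage sketch of the refactored entry point. This is a hypothetical session, assuming the script has been sourced, rvest and the WikipediR package (which provides page_content()) are loaded, and a data/ directory exists for the cache files, per the paths in the diff:

# First call: bypass any existing cache, download from Wikipedia and fill data/articles.RDS
articles <- getData(use.cache = FALSE, write.cache = TRUE)

# Later calls with the defaults are served from data/articles.RDS without network access
articles <- getData()

str(articles)  # one row per physicist: Title, PageID, RevID, html-formatted Text

Splitting the two flags keeps the read path (use.cache) and the write path (write.cache) independent, so a caller can refresh the cache in a single call while everyday callers stay on the fast cached path.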