diff --git a/r/GetData.R b/r/GetData.R
index a8d39bb0d84eff49b05d8f6f7e9deb51aeaf970a..a86c4c9b0e4bc183725ddf16a0b8eb446bbb19f7 100644
--- a/r/GetData.R
+++ b/r/GetData.R
@@ -9,63 +9,86 @@ library(rvest) # For getting the list of physicits
 ## Instead we crawl the names on the article 'List of Physicists' and query those names
 ## which gives us something short of a thousand articles
-### Get the list of names
-
-# Download page
-
-page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
-
-# Extract links as the names given here are not the article names in about 20 cases
-
-physicists <- page %>%
-  html_nodes(".mw-parser-output li a") %>%
-  html_attr("href")
-
-# Clean the list
-
-physicists <- physicists[nchar(physicists) > 5]
-
-length(physicists) <- length(physicists) - 3
-
-physicists <- gsub("_", " ", physicists)
-
-physicists <- gsub("/wiki/", "", physicists)
-
-physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
-
-# This is probably only needed on windows (and perhaps os x) as R on windows messes quite a bit with the encoding
-# On linux `physicists <- URLdecode(physicists)` should do the trick
-
-physicists <- sapply(physicists, function(x) {
-  tmp <- URLdecode(x)
-  Encoding(tmp) <- "UTF-8"
-  tmp
-})
-
-names(physicists) <- NULL
-
-
-
-### Get articles
-
-# Call the wikipedia api for each entry in our list
-
-articles <- lapply(physicists, function(x) {
-  res <- tryCatch({
-    article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
-    data.frame(Title = article$parse$title,
-               PageID = article$parse$pageid,
-               RevID = article$parse$revid,
-               Text = article$parse$text$`*`,
-               stringsAsFactors = FALSE)
-  }, error = function(e) {
-    cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e),"\n")
+#' Retrieve Wikipedia articles about physicists
+#'
+#' @param use.cache Use cached data, if it exists, instead of downloading it again
+#' @param write.cache Write the downloaded results into the cache for future calls
+#' @return A data.frame containing the title, page ID, revision ID and HTML-formatted full text of each article
+getData <- function(use.cache = TRUE, write.cache = FALSE) {
+  ### First check whether cached data exists and try to load it if it does
+  if (file.exists("data/articles.RDS") && use.cache) {
+    res <- tryCatch({
+      readRDS("data/articles.RDS")
+    }, error = function(e) {
+      cat("Cached data was found but could not be loaded.",
+          "Downloading from Wikipedia, this might take a few minutes.\n")
+      NULL  # fall through to a fresh download below
+    })
+    if (!is.null(res)) return(res)
+  }
+
+  ### Get the list of names
+
+  # Download page
+
+  page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
+
+  # Extract links as the names given here are not the article names in about 20 cases
+
+  physicists <- page %>%
+    html_nodes(".mw-parser-output li a") %>%
+    html_attr("href")
+
+  # Clean the list
+
+  physicists <- physicists[nchar(physicists) > 5]
+
+  length(physicists) <- length(physicists) - 3
+
+  physicists <- gsub("_", " ", physicists)
+
+  physicists <- gsub("/wiki/", "", physicists)
+
+  physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
+
+  # This is probably only needed on Windows (and perhaps macOS) as R on Windows messes quite a bit with the encoding
+  # On Linux `physicists <- URLdecode(physicists)` should do the trick
+
+  physicists <- sapply(physicists, function(x) {
+    tmp <- URLdecode(x)
+    Encoding(tmp) <- "UTF-8"
+    tmp
   })
-})
-
-# Bind it all together
-
-articles <- do.call(rbind, articles)
-
-# Write result
+  names(physicists) <- NULL
+
+  ### Get articles
+
+  # Call the Wikipedia API for each entry in our list
+
+  articles <- lapply(physicists, function(x) {
+    tryCatch({
+      article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
+      data.frame(Title = article$parse$title,
+                 PageID = article$parse$pageid,
+                 RevID = article$parse$revid,
+                 Text = article$parse$text$`*`,
+                 stringsAsFactors = FALSE)
+    }, error = function(e) {
+      cat("Error: Crawling failed for article", x, "with error message:", conditionMessage(e), "\n")
+      NULL  # failed articles are dropped by the rbind below
+    })
+  })
+
+  # Bind it all together
+
+  articles <- do.call(rbind, articles)
+
+  # Write result if desired
+
+  if (write.cache) {
+    if (!dir.exists("data")) dir.create("data")  # make sure the cache directory exists
+    write.csv(articles, "data/articles.csv", row.names = FALSE)
+    saveRDS(articles, "data/articles.RDS")
+  }
+
+  return(articles)
+}
-write.table(articles, "../data/articles.csv")
-saveRDS(articles, "../data/articles.RDS")
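
For reference, a minimal sketch of how the new function might be called. Note that the paths changed from "../data/" to "data/", so this assumes the working directory is the repository root rather than r/; it also assumes the packages loaded at the top of GetData.R include WikipediR, which provides page_content():

    library(WikipediR)  # page_content()
    library(rvest)      # read_html(), html_nodes(), html_attr(), %>%

    source("r/GetData.R")

    # The first call downloads everything and fills the cache;
    # later calls just read data/articles.RDS.
    articles <- getData(use.cache = TRUE, write.cache = TRUE)
    str(articles)  # Title, PageID, RevID, Text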