## Though we could get the pages within the category 'physicists' with something like this
## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
## this gives us only about 50 pages.
## Instead we crawl the names linked from the article 'List of physicists' and query those names,
## which gives us a little short of a thousand articles.
#' Retrieve Wikipedia articles about physicists
#' @param use.cache Use cached data, if it exists, instead of downloading new data
#' @param write.cache Write the downloaded results into the cache for use on future calls
#' @param data.dir Directory the data should be read from and/or written to
#' @return data.frame containing the title, page ID, revision ID and html-formatted full text of each article
#' @importFrom magrittr %>%
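#' @examples
#' \dontrun{
#' # Download (or load cached) articles and write them to the cache for later runs
#' articles <- get_data(use.cache = TRUE, write.cache = TRUE)
#' }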
#' @export
get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
  dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
  dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
  ### First we check if the data already exists and try to load it if it does
  if (file.exists(dest.articlesRDS) && use.cache) {
    res <- tryCatch({
      data <- readRDS(dest.articlesRDS)
      cat("Found cached data to use, import finished.\n")
      data
    }, error = function(e) {
      cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes.\n")
      NULL
    })
    if (!is.null(res)) return(res)
  }
  page <- rvest::read_html("https://en.wikipedia.org/wiki/List_of_physicists")
  # Extract the article links, as the names given on the list page are not the article names in about 20 cases
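  # NOTE: the link extraction below is a reconstructed sketch; the CSS selector
  # "div.div-col a" is an assumption about the current layout of the list page
  # and may need adjusting if Wikipedia changes its markup.
  physicists <- page %>%
    rvest::html_nodes("div.div-col a") %>%
    rvest::html_attr("href")
  # Keep only proper article links (no anchors or namespaced pages) and strip the "/wiki/" prefix
  physicists <- physicists[!is.na(physicists) & grepl("^/wiki/", physicists) & !grepl(":", physicists)]
  physicists <- sub("^/wiki/", "", physicists)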
  # Decoding is probably only needed on Windows (and perhaps OS X), as R on Windows messes with the encoding quite a bit
  # On Linux `physicists <- URLdecode(physicists)` should do the trick
  physicists <- sapply(physicists, function(x) {
    tmp <- URLdecode(x)
    Encoding(tmp) <- "UTF-8"
    tmp
  })
  cat("Done.\nDownloading articles now. This might take a while.\n")
  articles <- pbapply::pblapply(physicists, function(x) {
    tryCatch({
      article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
      # Check if the article is a redirect page
      if (grepl("redirectText", article$parse$text$`*`, fixed = TRUE)) {
        # Get the real article name from the redirect link
        pname <- rvest::read_html(article$parse$text$`*`) %>%
          rvest::html_nodes(".redirectText a") %>%
          rvest::html_attr("href")
        tmp <- URLdecode(pname)
        Encoding(tmp) <- "UTF-8"
        # Strip the "/wiki/" prefix so the API receives a bare page title
        pname <- sub("^/wiki/", "", tmp)
        article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
      }
      data.frame(Title = article$parse$title,
                 PageID = article$parse$pageid,
                 RevID = article$parse$revid,
                 Text = article$parse$text$`*`,
                 stringsAsFactors = FALSE)
    }, error = function(e) {
      cat("Error: Crawling failed for article", x, "with error message:", conditionMessage(e), "\n")
      NULL
    })
  })
  # Drop failed downloads and combine the per-article results into one data.frame
  articles <- do.call(rbind, articles)
  if (write.cache) {
    if (!dir.exists(data.dir)) dir.create(data.dir)
    write.table(articles, dest.articlesCSV)
    saveRDS(articles, dest.articlesRDS)
  }
  articles
}