#!/usr/bin/env Rscript
# Author: David

## Though we could get the pages within the category 'physicists' with something like this
## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
## this gives us only about 50 pages.
## Instead we crawl the names on the article 'List of physicists' and query those names,
## which gives us something short of a thousand articles.

#' Retrieve wikipedia articles about physicists
#'
#' @param use.cache Use cached data, if it exists, instead of downloading new data
#' @param write.cache Write downloaded results into the cache for use on future calls
#' @param data.dir Directory the data should be read from and/or written to
#' @return data.frame containing the title, page ID, revision ID and html-formatted full text
#' @importFrom magrittr %>%
#' @export
get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
  dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
  dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)

  ### First we check if the data already exists and try to load it if it does
  if (file.exists(dest.articlesRDS) && use.cache) {
    res <- tryCatch({
      data <- readRDS(dest.articlesRDS)
      cat("Found cached data to use, import finished.\n")
      data
    }, error = function(e) {
      cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes.\n")
      NULL
    })
    # Only return if the cache could actually be read; otherwise fall through and download
    if (!is.null(res)) {
      return(res)
    }
  }

  ### Get the list of names
  # Download the list page
  cat("Downloading list from wikipedia... ")
  page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists")
  cat("Done.\n")

  # Extract the links, as the names shown on the page are not the article names in about 20 cases
  cat("Processing data:\n")
  physicists <- page %>%
    rvest::html_nodes(".mw-parser-output li a") %>%
    rvest::html_attr("href")

  # Clean the list: drop short non-article hrefs, the trailing navigation links,
  # the "/wiki/" prefix, underscores and disambiguation suffixes in parentheses
  physicists <- physicists[nchar(physicists) > 5]
  length(physicists) <- length(physicists) - 3
  physicists <- gsub("_", " ", physicists)
  physicists <- gsub("/wiki/", "", physicists)
  physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)

  # This is probably only needed on Windows (and perhaps OS X) as R on Windows messes quite a bit with the encoding.
  # On Linux `physicists <- URLdecode(physicists)` should do the trick.
  physicists <- sapply(physicists, function(x) {
    tmp <- URLdecode(x)
    Encoding(tmp) <- "UTF-8"
    tmp
  })
  names(physicists) <- NULL
  cat("Done.\nDownloading articles now. This might take a while.\n")

  ### Get articles
  # Call the wikipedia API for each entry in our list
  articles <- pbapply::pblapply(physicists, function(x) {
    tryCatch({
      article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)

      # Check if the article is a redirect page
      if (grepl("redirectText", article$parse$text$`*`, fixed = TRUE)) {
        # Get the real article name from the redirect target
        pname <- xml2::read_html(article$parse$text$`*`) %>%
          rvest::html_nodes(".redirectText a") %>%
          rvest::html_attr("href")
        pname <- gsub("_", " ", pname)
        pname <- gsub("/wiki/", "", pname)
        pname <- gsub("\\s*\\([^\\)]+\\)", "", pname)
        tmp <- URLdecode(pname)
        Encoding(tmp) <- "UTF-8"
        pname <- tmp

        article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
      }

      data.frame(Title = article$parse$title,
                 PageID = article$parse$pageid,
                 RevID = article$parse$revid,
                 Text = article$parse$text$`*`,
                 stringsAsFactors = FALSE)
    }, error = function(e) {
      cat("Error: Crawling failed for article", x, "with error message:", conditionMessage(e), "\n")
      NULL
    })
  })

  # Bind it all together (failed articles returned NULL and are dropped by rbind)
  articles <- do.call(rbind, articles)
  cat("Download finished.\n")

  # Write result if desired
  if (write.cache) {
    if (!dir.exists(data.dir)) {
      dir.create(data.dir)
    }
    cat("Writing data to files... ")
    utils::write.csv(articles, dest.articlesCSV, row.names = FALSE)
    saveRDS(articles, dest.articlesRDS)
    cat("Done.\n")
  }

  cat("Data import finished.\n")

  return(articles)
}
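
## Usage sketch (kept commented out so sourcing this file does not trigger a download):
## a first call with write.cache = TRUE fetches the articles from wikipedia and caches
## them under data.dir; later calls with use.cache = TRUE read the cached RDS instead.
## Column names below are the ones built in the data.frame above.
# articles <- get_data(use.cache = TRUE, write.cache = TRUE, data.dir = "data")
# nrow(articles)                     # a bit under a thousand rows, one per physicist
# head(articles$Title)               # article titles
# substr(articles$Text[1], 1, 200)   # start of the html-formatted full text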