## Though we could get the pages within the category 'physicists' with something like this
## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
## this gives us only about 50 pages.
## Instead we crawl the names linked from the article 'List of physicists' and query those names,
## which gives us a little short of a thousand articles.
#' Retrieve Wikipedia articles about physicists
#' @param use.cache Use cached data, if it exists, instead of downloading new data
#' @param write.cache Write the downloaded results into the cache for use on future calls
#' @param data.dir Directory the data should be read from and/or written to
#' @return data.frame containing the title, page ID, revision ID and html-formatted full text of each article
#' @importFrom magrittr %>%
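#' @examples
#' \dontrun{
#' # Download (or load cached) articles and write them to the cache for later runs
#' articles <- get_data(use.cache = TRUE, write.cache = TRUE)
#' }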
#' @export
get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
  dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
  dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
  ### First we check if the data already exists and try to load it if it does
  if (file.exists(dest.articlesRDS) && use.cache) {
    res <- tryCatch({
      data <- readRDS(dest.articlesRDS)
      cat("Found cached data to use, import finished.\n")
      data
    }, error = function(e) {
      cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes.\n")
      NULL
    })
    if (!is.null(res)) return(res)
  }
  page <- rvest::read_html("https://en.wikipedia.org/wiki/List_of_physicists")
  # Extract the article links, as the names given on the list page are not the article names in about 20 cases
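  # NOTE: the link extraction below is a reconstructed sketch; the CSS selector
  # "div.div-col a" is an assumption about the current layout of the list page
  # and may need adjusting if Wikipedia changes its markup.
  physicists <- page %>%
    rvest::html_nodes("div.div-col a") %>%
    rvest::html_attr("href")
  # Keep only proper article links (no anchors or namespaced pages) and strip the "/wiki/" prefix
  physicists <- physicists[!is.na(physicists) & grepl("^/wiki/", physicists) & !grepl(":", physicists)]
  physicists <- sub("^/wiki/", "", physicists)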
  # Decoding is probably only needed on Windows (and perhaps OS X), as R on Windows messes with the encoding quite a bit
  # On Linux `physicists <- URLdecode(physicists)` should do the trick
  physicists <- sapply(physicists, function(x) {
    tmp <- URLdecode(x)
    Encoding(tmp) <- "UTF-8"
    tmp
  })
  cat("Done.\nDownloading articles now. This might take a while.\n")
  articles <- pbapply::pblapply(physicists, function(x) {
    tryCatch({
      article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
      # Check if the article is a redirect page
      if (grepl("redirectText", article$parse$text$`*`, fixed = TRUE)) {
        # Get the real article name from the redirect link
        pname <- rvest::read_html(article$parse$text$`*`) %>%
          rvest::html_nodes(".redirectText a") %>%
          rvest::html_attr("href")
        tmp <- URLdecode(pname)
        Encoding(tmp) <- "UTF-8"
        # Strip the "/wiki/" prefix so the API receives a bare page title
        pname <- sub("^/wiki/", "", tmp)
        article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
      }
      data.frame(Title = article$parse$title,
                 PageID = article$parse$pageid,
                 RevID = article$parse$revid,
                 Text = article$parse$text$`*`,
                 stringsAsFactors = FALSE)
    }, error = function(e) {
      cat("Error: Crawling failed for article", x, "with error message:", conditionMessage(e), "\n")
      NULL
    })
  })
  # Drop failed downloads and combine the per-article results into one data.frame
  articles <- do.call(rbind, articles)
  if (write.cache) {
    if (!dir.exists(data.dir)) dir.create(data.dir)
    write.table(articles, dest.articlesCSV)
    saveRDS(articles, dest.articlesRDS)
  }
  articles
}