Skip to content
Snippets Groups Projects
get_data.R 4.15 KiB
Newer Older
David Fuhry's avatar
David Fuhry committed
#!/usr/bin/env Rscript

David Fuhry's avatar
David Fuhry committed
# Author: David

David Fuhry's avatar
David Fuhry committed
## Though we could get the pages within the category 'physicists' with something like this
## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
## this gives us only about 50 pages.
## Instead we crawl the names on the article 'List of Physicists' and query those names
## which gives us something short of a thousand articles

#' Retrieve wikipedia articles about physicists
Lucas Schons's avatar
Lucas Schons committed
#'
David Fuhry's avatar
David Fuhry committed
#' @param use.cache Use cached data if it exists over downloading new data
#' @param write.cache Write downloaded results into cache for use on future calls
Lucas Schons's avatar
Lucas Schons committed
#' @param data.dir Directory the data should be read from and/or written to
David Fuhry's avatar
David Fuhry committed
#' @return data.frame containing the title, id, revisionID and html-formatted full text
David Fuhry's avatar
David Fuhry committed
#' @export
get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
Lucas Schons's avatar
Lucas Schons committed

  dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
  dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
David Fuhry's avatar
David Fuhry committed
  ### First we check if the data already exists and try to load it if it does
Lucas Schons's avatar
Lucas Schons committed
  if(file.exists(dest.articlesRDS) && use.cache ) {
David Fuhry's avatar
David Fuhry committed
    res <- tryCatch({
Lucas Schons's avatar
Lucas Schons committed
      data <- readRDS(dest.articlesRDS)
David Fuhry's avatar
David Fuhry committed
      cat("Found chached data to use, import finished.\n")
      data
    }, error = function (e) {
      cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes.\n")
    })
    return(res)
  }
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  ### Get the list of names
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Download page
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Downloading list from wikipedia... ")
Lucas Schons's avatar
Lucas Schons committed

  page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Done.\n")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Extract links as the names given here are not the article names in about 20 cases
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Processing data:\n")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- page %>%
    rvest::html_nodes(".mw-parser-output li a") %>%
    rvest::html_attr("href")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Clean the list
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- physicists[nchar(physicists) > 5]
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  length(physicists) <- length(physicists) - 3
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- gsub("_", " ", physicists)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- gsub("/wiki/", "", physicists)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # This is probably only needed on windows (and perhaps os x) as R on windows messes quite a bit with the encoding
  # On linux `physicists <- URLdecode(physicists)` should do the trick
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- sapply(physicists, function(x) {
    tmp <- URLdecode(x)
    Encoding(tmp) <- "UTF-8"
    tmp
  })
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  names(physicists) <- NULL
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Done.\nDownloading articles now. This might take a while.\n")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  ### Get articles
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Call the wikipedia api for each entry in our list
Lucas Schons's avatar
Lucas Schons committed

  articles <- pbapply::pblapply(physicists, function(x) {
David Fuhry's avatar
David Fuhry committed
    res <- tryCatch({
      article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
David Fuhry's avatar
David Fuhry committed
      # Check if the article is a redirect page
      if (grepl(".redirectText", article$parse$text$`*`)) {
        # Get the real article name
        pname <- xml2::read_html(article$parse$text$`*`) %>%
          rvest::html_nodes(".redirectText a") %>%
          rvest::html_attr("href")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
        panme <- gsub("_", " ", pname)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
        pname <- gsub("/wiki/", "", pname)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
        pname <- gsub("\\s*\\([^\\)]+\\)", "", pname)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
        tmp <- URLdecode(pname)
        Encoding(tmp) <- "UTF-8"
        pname <- tmp
Lucas Schons's avatar
Lucas Schons committed

        article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
David Fuhry's avatar
David Fuhry committed
      }
David Fuhry's avatar
David Fuhry committed
      data.frame(Title = article$parse$title,
                 PageID = article$parse$pageid,
                 RevID = article$parse$revid,
                 Text = article$parse$text$`*`,
                 stringsAsFactors = FALSE)
    }, error = function(e) {
      cat("Error: Crawling failed for article", x, "with error message: ", conditionMessage(e),"\n")
    })
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  })
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Bind it all together
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  articles <- do.call(rbind, articles)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Download finished.\n")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Write result if desired
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  if(write.cache) {
Lucas Schons's avatar
Lucas Schons committed
    if (!dir.exists(data.dir)) {
      dir.create(data.dir)
David Fuhry's avatar
David Fuhry committed
    }
    cat("Writing data to files... ")
Lucas Schons's avatar
Lucas Schons committed
    write.table(articles, dest.articlesCSV)
    saveRDS(articles, dest.articlesRDS)
David Fuhry's avatar
David Fuhry committed
    cat("Done.\n")
  }
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Data import finished.\n")
Lucas Schons's avatar
Lucas Schons committed

Dev's avatar
Dev committed
  # Still need this return
David Fuhry's avatar
David Fuhry committed
  return(articles)
Lucas Schons's avatar
Lucas Schons committed
}