Skip to content
Snippets Groups Projects
get_data.R 4.09 KiB
Newer Older
David Fuhry's avatar
David Fuhry committed
#!/usr/bin/env Rscript

David Fuhry's avatar
David Fuhry committed
# Author: David

David Fuhry's avatar
David Fuhry committed
## Though we could get the pages within the category 'physicists' with something like this
## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
## this gives us only about 50 pages.
## Instead we crawl the names on the article 'List of Physicists' and query those names
## which gives us something short of a thousand articles

#' Retrieve wikipedia articles about physicists
Lucas Schons's avatar
Lucas Schons committed
#'
David Fuhry's avatar
David Fuhry committed
#' @param use.cache Use cached data if it exists over downloading new data
#' @param write.cache Write downloaded results into cache for use on future calls
Lucas Schons's avatar
Lucas Schons committed
#' @param data.dir Directory the data should be read from and/or written to
David Fuhry's avatar
David Fuhry committed
#' @return data.frame containing the title, id, revisionID and html-formatted full text
David Fuhry's avatar
David Fuhry committed
#' @export
get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
Lucas Schons's avatar
Lucas Schons committed

  dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
  dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
David Fuhry's avatar
David Fuhry committed
  ### First we check if the data already exists and try to load it if it does
Lucas Schons's avatar
Lucas Schons committed
  if(file.exists(dest.articlesRDS) && use.cache ) {
David Fuhry's avatar
David Fuhry committed
    res <- tryCatch({
Lucas Schons's avatar
Lucas Schons committed
      data <- readRDS(dest.articlesRDS)
David Fuhry's avatar
David Fuhry committed
      cat("Found chached data to use, import finished.\n")
      data
    }, error = function (e) {
      cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes.\n")
    })
    return(res)
  }
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  ### Get the list of names
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Download page
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Downloading list from wikipedia... ")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Done.\n")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Extract links as the names given here are not the article names in about 20 cases
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Processing data:\n")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- page %>%
David Fuhry's avatar
David Fuhry committed
    html_nodes(".mw-parser-output li a") %>%
    html_attr("href")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Clean the list
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- physicists[nchar(physicists) > 5]
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  length(physicists) <- length(physicists) - 3
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- gsub("_", " ", physicists)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- gsub("/wiki/", "", physicists)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # This is probably only needed on windows (and perhaps os x) as R on windows messes quite a bit with the encoding
  # On linux `physicists <- URLdecode(physicists)` should do the trick
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  physicists <- sapply(physicists, function(x) {
    tmp <- URLdecode(x)
    Encoding(tmp) <- "UTF-8"
    tmp
  })
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  names(physicists) <- NULL
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Done.\nDownloading articles now. This might take a while.\n")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  ### Get articles
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Call the wikipedia api for each entry in our list
Lucas Schons's avatar
Lucas Schons committed

  articles <- pbapply::pblapply(physicists, function(x) {
David Fuhry's avatar
David Fuhry committed
    res <- tryCatch({
      article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
David Fuhry's avatar
David Fuhry committed
      # Check if the article is a redirect page
      if (grepl(".redirectText", article$parse$text$`*`)) {
        # Get the real article name
David Fuhry's avatar
David Fuhry committed
        pname <- read_html(article$parse$text$`*`) %>%
          html_nodes(".redirectText a") %>%
          html_attr("href")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
        panme <- gsub("_", " ", pname)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
        pname <- gsub("/wiki/", "", pname)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
        pname <- gsub("\\s*\\([^\\)]+\\)", "", pname)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
        tmp <- URLdecode(pname)
        Encoding(tmp) <- "UTF-8"
        pname <- tmp
Lucas Schons's avatar
Lucas Schons committed

        article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
David Fuhry's avatar
David Fuhry committed
      }
David Fuhry's avatar
David Fuhry committed
      data.frame(Title = article$parse$title,
                 PageID = article$parse$pageid,
                 RevID = article$parse$revid,
                 Text = article$parse$text$`*`,
                 stringsAsFactors = FALSE)
    }, error = function(e) {
      cat("Error: Crawling failed for article", x, "with error message: ", conditionMessage(e),"\n")
    })
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  })
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Bind it all together
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  articles <- do.call(rbind, articles)
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Download finished.\n")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  # Write result if desired
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  if(write.cache) {
Lucas Schons's avatar
Lucas Schons committed
    if (!dir.exists(data.dir)) {
      dir.create(data.dir)
David Fuhry's avatar
David Fuhry committed
    }
    cat("Writing data to files... ")
Lucas Schons's avatar
Lucas Schons committed
    write.table(articles, dest.articlesCSV)
    saveRDS(articles, dest.articlesRDS)
David Fuhry's avatar
David Fuhry committed
    cat("Done.\n")
  }
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  cat("Data import finished.\n")
Lucas Schons's avatar
Lucas Schons committed

David Fuhry's avatar
David Fuhry committed
  return(articles)
Lucas Schons's avatar
Lucas Schons committed
}