
Use wikipedia api

Merged David Fuhry requested to merge use-wikipedia-api into master
Files: 2 (+101, -60)
@@ -9,68 +9,109 @@ library(rvest) # For getting the list of physicists
## Instead we crawl the names from the article 'List of physicists' and query those names,
## which gives us just short of a thousand articles
### Get the list of names
# Download page
page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
# Extract links as the names given here are not the article names in about 20 cases
physicists <- page %>%
html_nodes(".mw-parser-output li a") %>%
html_attr("href")
# Clean the list
physicists <- physicists[nchar(physicists) > 5]
length(physicists) <- length(physicists) - 3
physicists <- gsub("_", " ", physicists)
physicists <- gsub("/wiki/", "", physicists)
physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
# This is probably only needed on Windows (and perhaps OS X), as R on Windows messes quite a bit with the encoding
# On Linux `physicists <- URLdecode(physicists)` should do the trick
physicists <- sapply(physicists, function(x) {
tmp <- URLdecode(x)
Encoding(tmp) <- "UTF-8"
tmp
})
names(physicists) <- NULL
### Get articles
# Call the wikipedia api for each entry in our list
articles <- lapply(physicists, function(x) {
res <- tryCatch({
article <- page_content("en", "wikipedia", page_name = x, as_wikitext = TRUE)
data.frame(Title = article$parse$title,
PageID = article$parse$pageid,
RevID = article$parse$revid,
Text = article$parse$wikitext$`*`,
stringsAsFactors = FALSE)
}, error = function(e) {
cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e),"\n")
#' Retrieve Wikipedia articles about physicists
#'
#' @param use.cache Use cached data, if it exists, instead of downloading it again
#' @param write.cache Write the downloaded results to the cache for use by future calls
#' @return A data.frame containing the title, page ID, revision ID and HTML-formatted full text of each article
getData <- function(use.cache = TRUE, write.cache = FALSE) {
### First we check if the data already exists and try to load it if it does
if (file.exists("data/articles.RDS") && use.cache) {
res <- tryCatch({
data <- readRDS("data/articles.RDS")
cat("Found cached data to use, import finished.\n")
data
}, error = function(e) {
cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes.\n")
NULL
})
# Only return the cache if it could actually be read, otherwise fall through to the download below
if (!is.null(res)) {
return(res)
}
}
### Get the list of names
cat("Starting data import\n")
# Download page
cat("Downloading list from wikipedia... ")
page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
cat("Done.\n")
# Extract links as the names given here are not the article names in about 20 cases
cat("Processing data... ")
physicists <- page %>%
html_nodes(".mw-parser-output li a") %>%
html_attr("href")
# Clean the list
physicists <- physicists[nchar(physicists) > 5]
length(physicists) <- length(physicists) - 3
physicists <- gsub("_", " ", physicists)
physicists <- gsub("/wiki/", "", physicists)
physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
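# e.g. "/wiki/Some_Physicist_(physicist)" becomes "Some Physicist" after the gsub steps above (illustrative example, not necessarily on the list)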
# This is probably only needed on Windows (and perhaps OS X), as R on Windows messes quite a bit with the encoding
# On Linux `physicists <- URLdecode(physicists)` should do the trick
physicists <- sapply(physicists, function(x) {
tmp <- URLdecode(x)
Encoding(tmp) <- "UTF-8"
tmp
})
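# The decode step turns percent-encoded characters into readable text,
# e.g. "Schr%C3%B6dinger" into "Schrödinger" (illustrative example)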
})
# Bind it all together
articles <- do.call(rbind, articles)
# Write result
names(physicists) <- NULL
cat("Done.\nDownloading articles now. This might take a while.\n")
### Get articles
# Call the wikipedia api for each entry in our list
articles <- lapply(physicists, function(x) {
res <- tryCatch({
article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
data.frame(Title = article$parse$title,
PageID = article$parse$pageid,
RevID = article$parse$revid,
Text = article$parse$text$`*`,
stringsAsFactors = FALSE)
}, error = function(e) {
cat("Error: Crawling failed for article", x, "with error message:", conditionMessage(e), "\n")
# Return NULL so the failed article is simply dropped by rbind below
NULL
})
})
# Bind it all together
articles <- do.call(rbind, articles)
cat("Download finished.")
# Write result if desired
if(write.cache) {
if (!dir.exists("data")) {
dir.create("data")
}
cat("Writing data to files... ")
write.table(articles, "data/articles.csv")
saveRDS(articles, "data/articles.RDS")
cat("Done.\n")
}
cat("Data import finished.\n")
return(articles)
}
write.table(articles, "../data/articles.csv")
saveRDS(articles, "../data/articles.RDS")