Commit 937b3fe7 authored by David Fuhry

Refactoring, using caches

Implements #10
Fixes #11
Refactored to provide function getData()
parent 91947105
Merge request !6: Use wikipedia api
@@ -9,68 +9,91 @@ library(rvest) # For getting the list of physicists
 ## Instead we crawl the names on the article 'List of Physicists' and query those names
 ## which gives us something short of a thousand articles
-### Get the list of names
-
-# Download page
-
-page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
-
-# Extract links as the names given here are not the article names in about 20 cases
-
-physicists <- page %>%
-  html_nodes(".mw-parser-output li a") %>%
-  html_attr("href")
-
-# Clean the list
-
-physicists <- physicists[nchar(physicists) > 5]
-length(physicists) <- length(physicists) - 3
-physicists <- gsub("_", " ", physicists)
-physicists <- gsub("/wiki/", "", physicists)
-physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
-
-# This is probably only needed on windows (and perhaps os x) as R on windows messes quite a bit with the encoding
-# On linux `physicists <- URLdecode(physicists)` should do the trick
-
-physicists <- sapply(physicists, function(x) {
-  tmp <- URLdecode(x)
-  Encoding(tmp) <- "UTF-8"
-  tmp
-})
-names(physicists) <- NULL
-
-### Get articles
-# Call the wikipedia api for each entry in our list
-articles <- lapply(physicists, function(x) {
-  res <- tryCatch({
-    article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
-    data.frame(Title = article$parse$title,
-               PageID = article$parse$pageid,
-               RevID = article$parse$revid,
-               Text = article$parse$text$`*`,
-               stringsAsFactors = FALSE)
-  }, error = function(e) {
-    cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e), "\n")
-  })
-})
-# Bind it all together
-articles <- do.call(rbind, articles)
-
-# Write result
-
-write.table(articles, "../data/articles.csv")
-saveRDS(articles, "../data/articles.RDS")
+#' Retrieve wikipedia articles about physicists
+#'
+#' @param use.cache Use cached data if it exists over downloading new data
+#' @param write.cache Write downloaded results into cache for use on future calls
+#' @return data.frame containing the title, id, revisionID and html-formatted full text
+getData <- function(use.cache = TRUE, write.cache = FALSE) {
+
+  ### First we check if the data already exists and try to load it if it does
+
+  if (file.exists("data/articles.RDS") & use.cache) {
+    res <- tryCatch({
+      data <- readRDS("data/articles.RDS")
+      data
+    }, error = function(e) {
+      cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes")
+    })
+    return(res)
+  }
+
+  ### Get the list of names
+
+  # Download page
+
+  page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
+
+  # Extract links as the names given here are not the article names in about 20 cases
+
+  physicists <- page %>%
+    html_nodes(".mw-parser-output li a") %>%
+    html_attr("href")
+
+  # Clean the list
+
+  physicists <- physicists[nchar(physicists) > 5]
+  length(physicists) <- length(physicists) - 3
+  physicists <- gsub("_", " ", physicists)
+  physicists <- gsub("/wiki/", "", physicists)
+  physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
+
+  # This is probably only needed on windows (and perhaps os x) as R on windows messes quite a bit with the encoding
+  # On linux `physicists <- URLdecode(physicists)` should do the trick
+
+  physicists <- sapply(physicists, function(x) {
+    tmp <- URLdecode(x)
+    Encoding(tmp) <- "UTF-8"
+    tmp
+  })
+  names(physicists) <- NULL
+
+  ### Get articles
+  # Call the wikipedia api for each entry in our list
+  articles <- lapply(physicists, function(x) {
+    res <- tryCatch({
+      article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
+      data.frame(Title = article$parse$title,
+                 PageID = article$parse$pageid,
+                 RevID = article$parse$revid,
+                 Text = article$parse$text$`*`,
+                 stringsAsFactors = FALSE)
+    }, error = function(e) {
+      cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e), "\n")
+    })
+  })
+
+  # Bind it all together
+  articles <- do.call(rbind, articles)
+
+  # Write result if desired
+  if (write.cache) {
+    write.table(articles, "data/articles.csv")
+    saveRDS(articles, "data/articles.RDS")
+  }
+
+  return(articles)
+}
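
For reference, a minimal usage sketch of the refactored entry point. This is a hypothetical session, assuming the script has been sourced, rvest and the WikipediR package (which provides page_content()) are loaded, and a data/ directory exists for the cache files, per the paths in the diff:

# First call: bypass any existing cache, download from Wikipedia and fill data/articles.RDS
articles <- getData(use.cache = FALSE, write.cache = TRUE)

# Later calls with the defaults are served from data/articles.RDS without network access
articles <- getData()

str(articles)  # one row per physicist: Title, PageID, RevID, html-formatted Text

Splitting the two flags keeps the read path (use.cache) and the write path (write.cache) independent, so a caller can refresh the cache in a single call while everyday callers stay on the fast cached path.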