
Use wikipedia api

Merged David Fuhry requested to merge use-wikipedia-api into master
All threads resolved!
Files changed: 2 (+21, -3)
@@ -19,21 +19,30 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
if(file.exists("data/articles.RDS") & use.cache ) {
res <- tryCatch({
data <- readRDS("data/articles.RDS")
cat("Found chached data to use, import finished.\n")
data
}, error = function (e) {
cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes")
cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes.\n")
})
return(res)
}
### Get the list of names
cat("Starting data import\n")
# Download page
cat("Downloading list from wikipedia... ")
page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
cat("Done.\n")
# Extract links as the names given here are not the article names in about 20 cases
cat("Processing data... ")
physicists <- page %>%
html_nodes(".mw-parser-output li a") %>%
html_attr("href")
@@ -61,7 +70,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
names(physicists) <- NULL
cat("Done.\nDownloading articles now. This might take a while.\n")
### Get articles
@@ -76,7 +85,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
Text = article$parse$text$`*`,
stringsAsFactors = FALSE)
}, error = function(e) {
cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e),"\n")
cat("Error: Crawling failed for article", x, "with error message: ", conditionMessage(e),"\n")
})
})
@@ -85,13 +94,22 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
articles <- do.call(rbind, articles)
cat("Download finished.")
# Write result if desired
if(write.cache) {
if (!dir.exists("data")) {
dir.create("data")
}
cat("Writing data to files... ")
write.table(articles, "data/articles.csv")
saveRDS(articles, "data/articles.RDS")
cat("Done.\n")
}
cat("Data import finished.\n")
return(articles)
}
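
For context (not part of this MR's diff): the download loop above appears to call the MediaWiki action=parse endpoint and keep the rendered HTML stored under parse$text$`*`. Below is a minimal sketch of that request, assuming jsonlite::fromJSON handles the JSON parsing; the helper name getArticle is hypothetical and not taken from the diff.

library(jsonlite)

# Hypothetical helper illustrating the parse-API call sketched above;
# jsonlite and the function name are assumptions, not part of this MR.
getArticle <- function(title) {
  url <- paste0("https://en.wikipedia.org/w/api.php",
                "?action=parse&format=json&redirects=true&page=",
                URLencode(title, reserved = TRUE))
  article <- fromJSON(url)
  data.frame(Title = article$parse$title,
             Text  = article$parse$text$`*`,
             stringsAsFactors = FALSE)
}

Usage would then be along the lines of articles <- getData(use.cache = TRUE, write.cache = TRUE), which prefers the cached data/articles.RDS file and only downloads from Wikipedia when no usable cache is found.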