diff --git a/r/GetData.R b/r/GetData.R index f45c3b98ffb673d9a313ffcfa2292f53e4186d6a..28afae83c861bf975eb9a600ddc1c733261fd7de 100644 --- a/r/GetData.R +++ b/r/GetData.R @@ -1,5 +1,7 @@ #!/usr/bin/env Rscript +# Author: David + library(WikipediR) # For querying wikipedia library(rvest) # For getting the list of physicits @@ -79,6 +81,25 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) { articles <- lapply(physicists, function(x) { res <- tryCatch({ article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE) + # Check if the article is a redirect page + if (grepl(".redirectText", article$parse$text$`*`)) { + # Get the real article name + pname <- read_html(article$parse$text$`*`) %>% + html_nodes(".redirectText a") %>% + html_attr("href") + + pname <- gsub("_", " ", pname) + + pname <- gsub("/wiki/", "", pname) + + pname <- gsub("\\s*\\([^\\)]+\\)", "", pname) + + tmp <- URLdecode(pname) + Encoding(tmp) <- "UTF-8" + pname <- tmp + + article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE) + } data.frame(Title = article$parse$title, PageID = article$parse$pageid, RevID = article$parse$revid, @@ -94,7 +115,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) { articles <- do.call(rbind, articles) - cat("Download finished.") + cat("Download finished.\n") # Write result if desired @@ -111,7 +132,4 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) { cat("Data import finished.\n") return(articles) -} - - - +} \ No newline at end of file