
Use wikipedia api

Merged David Fuhry requested to merge use-wikipedia-api into master
All threads resolved!
Files changed: 2 (+21, -3)
@@ -19,21 +19,30 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
if(file.exists("data/articles.RDS") & use.cache ) {
res <- tryCatch({
data <- readRDS("data/articles.RDS")
cat("Found chached data to use, import finished.\n")
data
}, error = function (e) {
cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes")
cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes.\n")
})
return(res)
}
### Get the list of names
cat("Starting data import\n")
# Download page
cat("Downloading list from wikipedia... ")
page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
cat("Done.\n")
# Extract links as the names given here are not the article names in about 20 cases
cat("Processing data... ")
physicists <- page %>%
html_nodes(".mw-parser-output li a") %>%
html_attr("href")
@@ -61,7 +70,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
names(physicists) <- NULL
cat("Done.\nDownloading articles now. This might take a while.\n")
### Get articles
@@ -76,7 +85,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
Text = article$parse$text$`*`,
stringsAsFactors = FALSE)
}, error = function(e) {
cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e),"\n")
cat("Error: Crawling failed for article", x, "with error message: ", conditionMessage(e),"\n")
})
})
@@ -85,13 +94,22 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
articles <- do.call(rbind, articles)
cat("Download finished.")
# Write result if desired
if(write.cache) {
if (!dir.exists("data")) {
dir.create("data")
}
cat("Writing data to files... ")
write.table(articles, "data/articles.csv")
saveRDS(articles, "data/articles.RDS")
cat("Done.\n")
}
cat("Data import finished.\n")
return(articles)
}
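
For context (not part of this MR's diff): the download loop above appears to call the MediaWiki action=parse endpoint and keep the rendered HTML stored under parse$text$`*`. Below is a minimal sketch of that request, assuming jsonlite::fromJSON handles the JSON parsing; the helper name getArticle is hypothetical and not taken from the diff.

library(jsonlite)

# Hypothetical helper illustrating the parse-API call sketched above;
# jsonlite and the function name are assumptions, not part of this MR.
getArticle <- function(title) {
  url <- paste0("https://en.wikipedia.org/w/api.php",
                "?action=parse&format=json&redirects=true&page=",
                URLencode(title, reserved = TRUE))
  article <- fromJSON(url)
  data.frame(Title = article$parse$title,
             Text  = article$parse$text$`*`,
             stringsAsFactors = FALSE)
}

Usage would then be along the lines of articles <- getData(use.cache = TRUE, write.cache = TRUE), which prefers the cached data/articles.RDS file and only downloads from Wikipedia when no usable cache is found.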