
Use wikipedia api

Merged David Fuhry requested to merge use-wikipedia-api into master
Files: 2 (+101, -60)
@@ -9,68 +9,109 @@ library(rvest) # For getting the list of physicists
## Instead we crawl the names from the article 'List of physicists' and query those names,
## which gives us just short of a thousand articles
### Get the list of names
# Download page
page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
# Extract links as the names given here are not the article names in about 20 cases
physicists <- page %>%
html_nodes(".mw-parser-output li a") %>%
html_attr("href")
# Clean the list
physicists <- physicists[nchar(physicists) > 5]
length(physicists) <- length(physicists) - 3
physicists <- gsub("_", " ", physicists)
physicists <- gsub("/wiki/", "", physicists)
physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
# This is probably only needed on Windows (and perhaps OS X), as R on Windows messes quite a bit with the encoding
# On Linux `physicists <- URLdecode(physicists)` should do the trick
physicists <- sapply(physicists, function(x) {
tmp <- URLdecode(x)
Encoding(tmp) <- "UTF-8"
tmp
})
names(physicists) <- NULL
### Get articles
# Call the wikipedia api for each entry in our list
articles <- lapply(physicists, function(x) {
res <- tryCatch({
article <- page_content("en", "wikipedia", page_name = x, as_wikitext = TRUE)
data.frame(Title = article$parse$title,
PageID = article$parse$pageid,
RevID = article$parse$revid,
Text = article$parse$wikitext$`*`,
stringsAsFactors = FALSE)
}, error = function(e) {
cat("Error: Crawling failed for article ", x, "with error message: ", conditionMessage(e),"\n")
#' Retrieve Wikipedia articles about physicists
#'
#' @param use.cache Use cached data, if it exists, instead of downloading it again
#' @param write.cache Write the downloaded results to the cache for use by future calls
#' @return A data.frame containing the title, page ID, revision ID and HTML-formatted full text of each article
getData <- function(use.cache = TRUE, write.cache = FALSE) {
### First we check if the data already exists and try to load it if it does
if (file.exists("data/articles.RDS") && use.cache) {
res <- tryCatch({
data <- readRDS("data/articles.RDS")
cat("Found cached data to use, import finished.\n")
data
}, error = function(e) {
cat("Cached data was found but could not be loaded. Downloading from wikipedia, this might take a few minutes.\n")
NULL
})
# Only return the cache if it could actually be read, otherwise fall through to the download below
if (!is.null(res)) {
return(res)
}
}
### Get the list of names
cat("Starting data import\n")
# Download page
cat("Downloading list from wikipedia... ")
page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
cat("Done.\n")
# Extract links as the names given here are not the article names in about 20 cases
cat("Processing data... ")
physicists <- page %>%
html_nodes(".mw-parser-output li a") %>%
html_attr("href")
# Clean the list
physicists <- physicists[nchar(physicists) > 5]
length(physicists) <- length(physicists) - 3
physicists <- gsub("_", " ", physicists)
physicists <- gsub("/wiki/", "", physicists)
physicists <- gsub("\\s*\\([^\\)]+\\)", "", physicists)
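# e.g. "/wiki/Some_Physicist_(physicist)" becomes "Some Physicist" after the gsub steps above (illustrative example, not necessarily on the list)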
# This is probably only needed on Windows (and perhaps OS X), as R on Windows messes quite a bit with the encoding
# On Linux `physicists <- URLdecode(physicists)` should do the trick
physicists <- sapply(physicists, function(x) {
tmp <- URLdecode(x)
Encoding(tmp) <- "UTF-8"
tmp
})
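# The decode step turns percent-encoded characters into readable text,
# e.g. "Schr%C3%B6dinger" into "Schrödinger" (illustrative example)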
})
# Bind it all together
articles <- do.call(rbind, articles)
# Write result
names(physicists) <- NULL
cat("Done.\nDownloading articles now. This might take a while.\n")
### Get articles
# Call the wikipedia api for each entry in our list
articles <- lapply(physicists, function(x) {
res <- tryCatch({
article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
data.frame(Title = article$parse$title,
PageID = article$parse$pageid,
RevID = article$parse$revid,
Text = article$parse$text$`*`,
stringsAsFactors = FALSE)
}, error = function(e) {
cat("Error: Crawling failed for article", x, "with error message:", conditionMessage(e), "\n")
# Return NULL so the failed article is simply dropped by rbind below
NULL
})
})
# Bind it all together
articles <- do.call(rbind, articles)
cat("Download finished.")
# Write result if desired
if(write.cache) {
if (!dir.exists("data")) {
dir.create("data")
}
cat("Writing data to files... ")
write.table(articles, "data/articles.csv")
saveRDS(articles, "data/articles.RDS")
cat("Done.\n")
}
cat("Data import finished.\n")
return(articles)
}
write.table(articles, "../data/articles.csv")
saveRDS(articles, "../data/articles.RDS")