Skip to content
Snippets Groups Projects
Commit e9d6421a authored by David Fuhry's avatar David Fuhry
Browse files

Fix #16

parent d66c924f
No related branches found
No related tags found
1 merge request: !9 "Fix #16"
#!/usr/bin/env Rscript
# Author: David
library(WikipediR) # For querying wikipedia
library(rvest) # For getting the list of physicists
......@@ -79,6 +81,25 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
articles <- lapply(physicists, function(x) {
res <- tryCatch({
article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
# Check if the article is a redirect page
if (grepl(".redirectText", article$parse$text$`*`)) {
# Get the real article name
pname <- read_html(article$parse$text$`*`) %>%
html_nodes(".redirectText a") %>%
html_attr("href")
panme <- gsub("_", " ", pname)
pname <- gsub("/wiki/", "", pname)
pname <- gsub("\\s*\\([^\\)]+\\)", "", pname)
tmp <- URLdecode(pname)
Encoding(tmp) <- "UTF-8"
pname <- tmp
article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
}
data.frame(Title = article$parse$title,
PageID = article$parse$pageid,
RevID = article$parse$revid,
......@@ -94,7 +115,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
articles <- do.call(rbind, articles)
cat("Download finished.")
cat("Download finished.\n")
# Write result if desired
......@@ -111,7 +132,4 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
cat("Data import finished.\n")
return(articles)
}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment