Newer
Older
#!/usr/bin/env Rscript
# Author: Lukas
library(rvest)
library(stringr)
library(data.table)
#' Extract birthdate from infobox
#' Will try to get infobox as table and extract birthdate
#' from 'Born'-entry
#' If there is no infobox, first paragraph of the article
#' will be checked for birthdate
#'
#' @return String birthdate as string|NULL
getBirthdate <- function(article) {
# Check if there is an infobox
infoBox <- getInfoBox(article)
# Get the Born field
birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
# Remove everything except the birthdate:
# - Remove everything in round brackets
birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate)
# - Remove everything starting with newline
birthdate <- gsub("\\n.*$", "", birthdate)
} else if(!getIntroduction(article) == "") {
# Check first paragraph
introduction <- getIntroduction(article)
# Get birthdate inside of parentheses
birthdate <- stringr::str_extract_all(introduction, "\\([^()]+\\)")[[1]]
# Remove parentheses
birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
# Return Null if there is no birthdate
return(NULL)
}
}
### Converts info box to table
getInfoBox <- function(article) {
# Read page as html
page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_add_sibling("p", "\n")
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_remove()
# Get the info box
# Will throw an error if there isnt any, so that should be checked beforehand
rvest::html_nodes("table.vcard") %>%
rvest::html_table(fill = TRUE) %>%
colnames(table) <- c("Desc", "Content")
#' Get Introduction Text from Wikipedia page that contains birthdate
#'
#' @param article article in HTML-format
#' @return string introduction text from wikipedia article
getIntroduction <- function(article) {
# Read page as html
page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
# This will replace them with line breaks
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_add_sibling("p", "\n")
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_remove
# Get all paragraphs
paragraphs <- page %>%
rvest::html_nodes("p") %>%
rvest::html_text()
# There will be some leading paragraphs containing only "\n"
# Remove those leading paragraphs
cleaned <- data.table::setdiff(paragraphs, remove)
# Return first paragraph
return(introduction)
}