Newer
Older
#!/usr/bin/env Rscript
# Author: Lukas
library(rvest)
library(stringr)
library(data.table)
#' Extract birthdate from infobox
#' Will try to get infobox as table and extract birthdate
#' from 'Born'-entry
#' If there is no infobox, first paragraph of the article
#' will be checked for birthdate
#'
#' @return String birthdate as string|NULL
# Get the Born field
birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
# Remove everything except the birthdate:
# - Remove everything in round brackets
birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate)
# - Remove everything starting with newline
birthdate <- gsub("\\n.*$", "", birthdate)
} else if(!getIntroduction(article) == "") {
# Check first paragraph
introduction <- getIntroduction(article)
# Get birthdate inside of parentheses
birthdate <- stringr::str_extract_all(introduction, "\\([^()]+\\)")[[1]]
# Remove parentheses
birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
# Return Null if there is no birthdate
return(NULL)
}
#' Get Introduction Text from Wikipedia page that contains birthdate
#'
#' @param article article in HTML-format
#' @return string introduction text from wikipedia article
getIntroduction <- function(article) {
# Read page as html
page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
# This will replace them with line breaks
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_add_sibling("p", "\n")
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_remove
# Get all paragraphs
paragraphs <- page %>%
rvest::html_nodes("p") %>%
rvest::html_text()
# There will be some leading paragraphs containing only "\n"
# Remove those leading paragraphs
cleaned <- data.table::setdiff(paragraphs, remove)
# Return first paragraph
return(introduction)
}