#!/usr/bin/env Rscript
# Author: Lukas

library(rvest)
library(stringr)
library(data.table)
library(xml2)

#' Extract the birthdate from a Wikipedia article
#'
#' If the article markup contains "vcard" (taken here as the sign of an
#' infobox), the infobox is fetched with `get_infobox()` and the content of
#' its "Born" entry is used. Otherwise the first paragraph of the article is
#' searched for text in parentheses.
#'
#' @param article Article in HTML format (a single string).
#' @return Character vector with the extracted birthdate text, or `NULL` when
#'   the article has neither an infobox nor a usable first paragraph. The
#'   vector may be empty when no "Born" entry / no parenthesised text exists,
#'   and may hold several elements when several parenthesised groups are found.
#' @export
get_birthdate <- function(article) {
  # Infobox branch: the "Born" row is the preferred source
  if (grepl("vcard", article, fixed = TRUE)) {
    # get_infobox() is defined outside this file; this code reads its `Desc`
    # and `Content` columns
    info_box <- get_infobox(article)

    # Content of the row(s) whose description matches "Born"
    birthdate <- info_box[info_box$Desc %like% "Born", ]$Content

    # Strip everything except the birthdate:
    # - drop anything in round brackets (with leading whitespace)
    birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate)
    # - drop everything from the first newline onwards
    birthdate <- gsub("\\n.*$", "", birthdate)

    return(birthdate)
  }

  # Fallback branch: parse the article once and inspect its first paragraph
  introduction <- getIntroduction(article)

  # getIntroduction() yields NA when no paragraph is left; comparing NA with
  # "" inside `if` would raise an error, so it is treated as "no birthdate"
  if (length(introduction) != 1L || is.na(introduction) ||
      introduction == "") {
    return(NULL)
  }

  # Parenthesised groups of the first paragraph
  birthdate <- stringr::str_extract_all(introduction, "\\([^()]+\\)")[[1]]

  # Remove the surrounding parentheses
  substring(birthdate, 2, nchar(birthdate) - 1)
}

#' Get the introduction text of a Wikipedia page
#'
#' Returns the first paragraph of the page that is not just a line break;
#' `get_birthdate()` searches this text for the birthdate.
#'
#' @param article Article in HTML format.
#' @return String with the first non-empty paragraph, or `NA` when the page
#'   has no such paragraph.
getIntroduction <- function(article) {
  # Read page as HTML
  page <- xml2::read_html(article)

  # Extracting text from the HTML drops all <br> tags, so each one gets a
  # sibling <p> holding a line break before being removed
  line_breaks <- xml2::xml_find_all(page, ".//br")
  xml2::xml_add_sibling(line_breaks, "p", "\n")
  xml2::xml_remove(line_breaks)

  # Text of all paragraphs, in document order
  paragraphs <- rvest::html_text(rvest::html_nodes(page, "p"))

  # Some paragraphs contain only "\n" (including the ones inserted above);
  # skip them so the first real paragraph comes first
  cleaned <- paragraphs[paragraphs != "\n"]

  # First remaining paragraph (NA when nothing is left)
  cleaned[1]
}