GetBirthdate.R

#!/usr/bin/env Rscript

# Author: Lukas

library(rvest)
library(stringr)
library(data.table)
library(xml2)

#' Extract birthdate from infobox
#' Will try to get infobox as table and extract birthdate
#' from 'Born'-entry
#' If there is no infobox, first paragraph of the article
#' will be checked for birthdate
#'
#' @param article Article in HTML-format
#' @return String birthdate as string|NULL
getBirthdate <- function(article) {
  
  if(grepl("vcard", article)) {
  
    # Check if there is an infobox
    infoBox <- getInfoBox(article)
    
    # Get the Born field
    birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
    # Remove everything except the birthdate:
    # - Remove everything in round brackets
    birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate)
    # - Remove everything starting with newline
    birthdate <- gsub("\\n.*$", "", birthdate)
    
    return(birthdate)
  
  
  } else if(!getIntroduction(article) == "") {
    
    # Check first paragraph
    introduction <- getIntroduction(article)
    
    # Get birthdate inside of parentheses
    birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]]
    # Remove parentheses
    birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
    
    return(birthdate)
    
  } else {
    
    # Return Null if there is no birthdate
    return(NULL)
  }
}

### Converts info box to table
getInfoBox <- function(article) {
  # Read page as html
  page <- read_html(article)
  
  # Extracting text from the html will erase all <br> tags,
  # This will replace them with line breaks
  
  xml_find_all(page, ".//br") %>%
    xml_add_sibling("p", "\n")
  
  xml_find_all(page, ".//br") %>%
    xml_remove()
  
  # Get the info box
  # Will throw an error if there isnt any, so that should be checked beforehand
  
  table <- page %>%
    html_nodes("table.vcard") %>%
    html_table(fill = TRUE) %>%
    .[[1]]
  
  colnames(table) <- c("Desc", "Content")
  
  return(table)
}

#' Get Introduction Text from Wikipedia page that contains birthdate
#'
#' @param article article in HTML-format
#' @return string introduction text from wikipedia article
getIntroduction <- function(article) {
  # Read page as html
  page <- read_html(article)
  
  # Extracting text from the html will erase all <br> tags,
  # This will replace them with line breaks
  
  xml_find_all(page, ".//br") %>%
    xml_add_sibling("p", "\n")
  
  xml_find_all(page, ".//br") %>%
    xml_remove
  
  # Get all paragraphs
  paragraphs <- page %>%
    html_nodes("p") %>%
    html_text()
  
  # There will be some leading paragraphs containing only "\n"
  # Remove those leading paragraphs
  remove <- c("\n")
  cleaned <- setdiff(paragraphs, remove)
  introduction <- cleaned[1]
  
  # Return first paragraph
  return(introduction)
}