GetBirthdate.R

#!/usr/bin/env Rscript

# Author: Lukas

library(rvest)
library(stringr)
library(data.table)
library(xml2)

#' Extract birthdate from infobox
#' Will try to get infobox as table and extract birthdate
#' from 'Born'-entry
#' If there is no infobox, first paragraph of the article
#' will be checked for birthdate
#'
#' @export
#' @param article Article in HTML-format
#' @return String birthdate as string|NULL
getBirthdate <- function(article) {

  if(grepl("vcard", article)) {

    # Check if there is an infobox
    infoBox <- getInfoBox(article)

    # Get the Born field
    birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
    # Remove everything except the birthdate:
    # - Remove everything in round brackets
    birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate)
    # - Remove everything starting with newline
    birthdate <- gsub("\\n.*$", "", birthdate)

    return(birthdate)


  } else if(!getIntroduction(article) == "") {

    # Check first paragraph
    introduction <- getIntroduction(article)

    # Get birthdate inside of parentheses
    birthdate <- stringr::str_extract_all(introduction, "\\([^()]+\\)")[[1]]
    # Remove parentheses
    birthdate <- substring(birthdate, 2, nchar(birthdate)-1)

    return(birthdate)

  } else {

    # Return Null if there is no birthdate
    return(NULL)
  }
}

### Converts info box to table
getInfoBox <- function(article) {
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html will erase all <br> tags,
  # This will replace them with line breaks

  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_add_sibling("p", "\n")

  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_remove()

  # Get the info box
  # Will throw an error if there isnt any, so that should be checked beforehand

  table <- page %>%
    rvest::html_nodes("table.vcard") %>%
    rvest::html_table(fill = TRUE) %>%
    .[[1]]

  colnames(table) <- c("Desc", "Content")

  return(table)
}

#' Get Introduction Text from Wikipedia page that contains birthdate
#'
#' @param article article in HTML-format
#' @return string introduction text from wikipedia article
getIntroduction <- function(article) {
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html will erase all <br> tags,
  # This will replace them with line breaks

  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_add_sibling("p", "\n")

  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_remove

  # Get all paragraphs
  paragraphs <- page %>%
    rvest::html_nodes("p") %>%
    rvest::html_text()

  # There will be some leading paragraphs containing only "\n"
  # Remove those leading paragraphs
  remove <- c("\n")
  cleaned <- data.table::setdiff(paragraphs, remove)
  introduction <- cleaned[1]

  # Return first paragraph
  return(introduction)
}