Skip to content
Snippets Groups Projects
get_birthdate.R 2.16 KiB
Newer Older
#!/usr/bin/env Rscript

# Author: Lukas

library(rvest)
library(stringr)
library(data.table)
library(xml2)
Lukas Gehrke's avatar
Lukas Gehrke committed
#' Extract birthdate from infobox
#' Will try to get infobox as table and extract birthdate
#' from 'Born'-entry
#' If there is no infobox, first paragraph of the article
#' will be checked for birthdate
#'
Lukas Gehrke's avatar
Lukas Gehrke committed
#' @param article Article in HTML-format
#' @return String birthdate as string|NULL
Lucas Schons's avatar
Lucas Schons committed
get_birthdate <- function(article) {
  if(grepl("vcard", article)) {
    # Check if there is an infobox
David Fuhry's avatar
David Fuhry committed
    infoBox <- get_infobox(article)
    # Get the Born field
    birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
    # Remove everything except the birthdate:
    # - Remove everything in round brackets
    birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate)
    # - Remove everything starting with newline
    birthdate <- gsub("\\n.*$", "", birthdate)
    return(birthdate)
  } else if(!getIntroduction(article) == "") {
    # Check first paragraph
    introduction <- getIntroduction(article)
    # Get birthdate inside of parentheses
    birthdate <- stringr::str_extract_all(introduction, "\\([^()]+\\)")[[1]]
    # Remove parentheses
    birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
    return(birthdate)
    # Return Null if there is no birthdate
    return(NULL)
  }
Lukas Gehrke's avatar
Lukas Gehrke committed
#' Get Introduction Text from Wikipedia page that contains birthdate
#'
#' @param article article in HTML-format
#' @return string introduction text from wikipedia article
getIntroduction <- function(article) {
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html will erase all <br> tags,
  # This will replace them with line breaks

  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_add_sibling("p", "\n")

  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_remove

  # Get all paragraphs
  paragraphs <- page %>%
    rvest::html_nodes("p") %>%
    rvest::html_text()

Lukas Gehrke's avatar
Lukas Gehrke committed
  # There will be some leading paragraphs containing only "\n"
  # Remove those leading paragraphs
  remove <- c("\n")
  cleaned <- data.table::setdiff(paragraphs, remove)
  introduction <- cleaned[1]
  # Return first paragraph
  return(introduction)
}