Skip to content
Snippets Groups Projects
get_birthdate.R 907 B
Newer Older
#!/usr/bin/env Rscript

# Author: Lukas

#' Extract birthdate
#'
#' The first sentences of the article are checked for a birthdate using a
#' pattern, and it is extracted via NER. Idea: the first date that appears
#' is the birthdate.
Lukas Gehrke's avatar
Lukas Gehrke committed
#'
Lukas Gehrke's avatar
Lukas Gehrke committed
#' @param article Article in HTML-format
#' @param annotations CNLP Annotations
#' @return String birthdate as string|NA
get_birthdate <- function(article, annotations) {

  # Get birthdate from first Date entity in article
  # - Get named entities
  # - Get dates
  # - Get first date
  # - Trim day of death which might follow with "-" or "−" 
  # - Trim parentheses that might occur
  # - Trim whitespaces
  entities <- cleanNLP::cnlp_get_entity(annotations)
  dates <- entities[entities$entity_type == "DATE",]
  birthdate <- dates$entity[1]
  birthdate <- gsub("[–|−].*", "", birthdate)
  birthdate <- gsub("[\\(|\\)]", "", birthdate)
  birthdate <- trimws(birthdate)