#!/usr/bin/env Rscript
# Author: Lukas

#' Extract birthdate
#'
#' Takes the first DATE entity reported by the NER annotations and treats it
#' as the birthdate (idea: the first date appearing in the article is the
#' birthdate). A trailing day of death introduced by an en dash or a minus
#' sign, any parentheses, and surrounding whitespace are removed.
#'
#' @export
#' @param article Article in HTML-format. Not used by the current
#'   implementation; kept so the signature stays stable for callers.
#' @param annotations CNLP annotations, passed to
#'   `cleanNLP::cnlp_get_entity()`.
#' @return Birthdate as a length-one character string, or `NA` when there is
#'   no DATE entity or nothing is left after trimming.
get_birthdate <- function(article, annotations) {
  entities <- cleanNLP::cnlp_get_entity(annotations)

  # which() drops rows whose entity_type is NA. Plain logical indexing would
  # turn those into all-NA rows, and such a row could shadow the first real
  # date.
  date_entities <- entities$entity[which(entities$entity_type == "DATE")]
  if (length(date_entities) == 0L) {
    return(NA_character_)
  }

  birthdate <- date_entities[1]

  # Cut everything from the first en dash or minus sign onwards (a day of
  # death may follow). The ASCII hyphen is not matched, so hyphenated dates
  # pass through unchanged. "|" is deliberately absent: inside a character
  # class it is a literal pipe, not an alternation operator.
  birthdate <- sub("[–−].*", "", birthdate)

  # Remove parentheses that may surround the date.
  birthdate <- gsub("[()]", "", birthdate)
  birthdate <- trimws(birthdate)

  # An empty remainder carries no date; report it as missing.
  if (is.na(birthdate) || !nzchar(birthdate)) {
    return(NA_character_)
  }

  # Bare last expression so the value is returned visibly.
  birthdate
}