#!/usr/bin/env Rscript
# Author: Lukas

#' Extract the birthplace of a physicist from an article
#'
#' If the article contains an infobox (detected by the marker "vcard"), the
#' birthplace is taken from the infobox's "Born" field. If there is no
#' infobox, the first NORP entity in the annotations is used as a fallback.
#'
#' @param article Article in HTML format.
#' @param annotations CNLP annotations.
#' @return A single string with the birthplace of the physicist, or
#'   `NA_character_` if none could be determined.
#' @export
get_birthplace <- function(article, annotations) {
  # any() keeps the condition scalar even if `article` is a character vector
  # with more than one element (e.g. one element per line).
  has_infobox <- any(grepl("vcard", article, fixed = TRUE))

  if (has_infobox) {
    # Assumption: in most cases the birthplace follows the last newline in
    # the "Born" field of the infobox.
    # NOTE(review): assumes get_infobox() returns a table with `Desc` and
    # `Content` columns and that `%like%` is a pattern-match operator
    # (presumably data.table's) -- confirm against the rest of the project.
    info_box <- get_infobox(article)
    is_born <- info_box$Desc %like% "Born"
    # which() drops NA matches so they cannot turn into NA rows.
    candidates <- info_box$Content[which(is_born)]
    # Remove everything in front of the newline (the pattern is `.*` followed
    # by an escaped literal newline); the rest is the birthplace.
    candidates <- gsub(".*\\\n", "", candidates)
  } else {
    # Fallback: first NORP entity in the text. Not ideal, because this is
    # mostly the nationality of the physicist rather than the birthplace.
    entities <- cnlp_get_entity(annotations)
    is_norp <- entities$entity_type == "NORP"
    candidates <- entities$entity[which(is_norp)]
  }

  # Nothing found: return NA explicitly. (Checking is.null() after trimws()
  # can never succeed, because trimws() yields character(0), not NULL.)
  if (length(candidates) == 0L) {
    return(NA_character_)
  }

  # Always return exactly one trimmed string, even with several matches.
  trimws(as.character(candidates[1L]))
}