Skip to content
Snippets Groups Projects
get_birthplace.R 1.43 KiB
Newer Older
#!/usr/bin/env Rscript

# Author: Lukas

#' This script extracts the birthplace from physicist texts.
#' Try to get the infobox and extract the birthplace from it.
#' If there is no infobox, search the text for the
#' first occurrence of a NORP entity.
#'
#' @export
#' @param article Article in HTML-format
#' @param annotations CNLP Annotations
#' @return String with birthplace of the physicist|NA
get_birthplace <- function(article, annotations) {
  
  # If the article contains an infobox ("vcard"), use it;
  # otherwise fall back to the named entities of the text
  if(grepl("vcard", article)) {
      
    # Use infobox to get Birthplace
    infoBox <- get_infobox(article)
    
    # Assumption: In most cases birthplace is after newline in Born
    # field of infobox
    # - Get 'Born' field
    # - Remove everything in front of the "\n"
    # - Rest is birthplace in most cases
    birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
    birthplace <- gsub(".*\\\n", "", birthplace)
      
  } else {
    
    # Try to extract birthplace from text as first NORP entity
    # Not beautiful, because it mostly is Nationality of the physicist
    # - Get named entities
    # - Get NORP entities
    # - Get first NORP
    entities <- cnlp_get_entity(annotations)
    norps <- entities[entities$entity_type == "NORP",]
    birthplace <- norps$entity[1]
    
  }
  
  # Trim whitespaces
  birthplace <- trimws(birthplace)
  
  # Return birthplace or NA  
  if(!is.null(birthplace)) {
    return(birthplace)
  } else {
    return(NA)
  }