Newer
Older
#!/usr/bin/env Rscript
# Author: Lukas
#' Extract the birthplace of a physicist from an article.
#' If the article has an infobox, take the birthplace from its 'Born' field.
#' If there is no infobox, fall back to the first occurrence of a
#' NORP entity in the annotations.
#'
#' @export
#' @param article Article in HTML-format
#' @param annotations CNLP Annotations
#' @return String with birthplace of the physicist|NA
get_birthplace <- function(article, annotations) {
# An article containing "vcard" is treated as having an infobox;
# otherwise we fall back to the named entities in the annotations
if(grepl("vcard", article)) {
# Use infobox to get Birthplace
infoBox <- get_infobox(article)
# Assumption: In most cases birthplace is after newline in Born
# field of infobox
# - Get 'Born' field
# - Remove everything in front of the "\n"
# - Rest is birthplace in most cases
birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
birthplace <- gsub(".*\\\n", "", birthplace)
} else {
# Try to extract the birthplace from the text as the first NORP entity
# Not ideal, because this is mostly the nationality of the physicist
# - Get named entities
# - Get NORP entities
# - Get first NORP
entities <- cnlp_get_entity(annotations)
norps <- entities[entities$entity_type == "NORP",]
birthplace <- norps$entity[1]
}
# Trim whitespaces
birthplace <- trimws(birthplace)
# Return birthplace or NA
if(!is.null(birthplace)) {
return(birthplace)
} else {
return(NA)
}