#' Extract the birthplace of a physicist
#'
#' Tries the article's infobox first and reads the birthplace from its
#' "Born" field. If the article has no infobox, falls back to the first
#' occurrence of a NORP entity in the annotations. As noted by the original
#' authors, that fallback mostly yields the nationality of the physicist
#' rather than the actual birthplace.
#'
#' @export
#' @param article Article in HTML-format
#' @param annotations CNLP Annotations; only evaluated when the article
#'   has no infobox
#' @return String with birthplace of the physicist, or `NA` if none was found
get_birthplace <- function(article, annotations) {
  # The presence of "vcard" in the HTML is used as the infobox marker
  if (grepl("vcard", article, fixed = TRUE)) {
    infobox <- get_infobox(article)

    # Assumption: in most cases the birthplace follows the last newline of
    # the "Born" field, so everything up to that newline is dropped
    is_born <- grepl("Born", infobox$Desc, fixed = TRUE)
    birthplace <- gsub(".*\\\n", "", infobox$Content[is_born])
  } else {
    # Fallback: first NORP entity of the text. which() drops NA comparisons
    # so entities with a missing type cannot shadow a real NORP match
    entities <- cnlp_get_entity(annotations)
    is_norp <- which(entities$entity_type == "NORP")
    birthplace <- entities$entity[is_norp]
  }

  # trimws() maps NULL / zero-length input to character(0), never NULL, so
  # "nothing found" has to be detected by length
  if (length(birthplace) == 0L) {
    return(NA_character_)
  }

  # Keep the documented contract of a single string: first match only
  birthplace <- trimws(as.character(birthplace[1]))

  if (is.na(birthplace) || !nzchar(birthplace)) {
    NA_character_
  } else {
    birthplace
  }
}