From 9daf70479d62b1701a3cef0ed353ed2b9e2db23a Mon Sep 17 00:00:00 2001 From: Lukas Gehrke <lukasgehrke@Lukass-MacBook-Pro.local> Date: Thu, 17 Jan 2019 16:41:09 +0100 Subject: [PATCH] Adds use of annotations to get_birthplace.R. --- processing/wikiproc/R/get_birthplace.R | 61 +++++++++++++++++--------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/processing/wikiproc/R/get_birthplace.R b/processing/wikiproc/R/get_birthplace.R index e0310bd..2b6336b 100644 --- a/processing/wikiproc/R/get_birthplace.R +++ b/processing/wikiproc/R/get_birthplace.R @@ -4,29 +4,50 @@ #' This script extracts Birthplace from physicist texts #' Try to get the infobox and extract the birthplace -#' If there is no infobox, 0 will be returned as -#' birthplace is hard to extract from text +#' If there is no infobox, try to search text for +#' first occurrence of an NORP entity #' #' @export #' @param article Article in HTML-format -#' @return String with birthplace of the physicist|0 -get_birthplace <- function(article) { - +#' @param annotations CNLP Annotations +#' @return String with birthplace of the physicist|NA +get_birthplace <- function(article, annotations) { + # If there is no infobox we return 0 - if(!grepl("vcard", article)) { - return(0) - } - - # Use infobox to get Birthplace - infoBox <- get_infobox(article) - - # Get 'Born' field - birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content + if(grepl("vcard", article)) { + + # Use infobox to get Birthplace + infoBox <- get_infobox(article) + + # Assumption: In most cases birthplace is after newline in Born + # field of infobox + # - Get 'Born' field + # - Remove everything in front of the "\n" + # - Rest is birthplace in most cases + birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content + birthplace <- gsub(".*\\\n", "", birthplace) - # Remove everything in front of the "\n" - # Rest is birthplace - birthplace <- gsub(".*\\\n", "", birthplace) - - # return birthplace - return(birthplace) + + } else { + + # 
Try to extract birthplace from text as first NORP entity + # Not beautiful, because it mostly is Nationality of the physicist + # - Get named entities + # - Get NORP entities + # - Get first NORP + entities <- cnlp_get_entity(annotations) + norps <- entities[entities$entity_type == "NORP",] + birthplace <- norps$entity[1] + + } + + # Trim whitespaces + birthplace <- trimws(birthplace) + + # Return birthplace or NA + if(!is.null(birthplace)) { + return(birthplace) + } else { + return(NA) + } } -- GitLab