From 9daf70479d62b1701a3cef0ed353ed2b9e2db23a Mon Sep 17 00:00:00 2001
From: Lukas Gehrke <lukasgehrke@Lukass-MacBook-Pro.local>
Date: Thu, 17 Jan 2019 16:41:09 +0100
Subject: [PATCH] Adds use of annotations to get_birthplace.R.

---
 processing/wikiproc/R/get_birthplace.R | 61 +++++++++++++++++---------
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/processing/wikiproc/R/get_birthplace.R b/processing/wikiproc/R/get_birthplace.R
index e0310bd..2b6336b 100644
--- a/processing/wikiproc/R/get_birthplace.R
+++ b/processing/wikiproc/R/get_birthplace.R
@@ -4,29 +4,50 @@
 
 #' This script extracts Birthplace from physicist texts
 #' Try to get the infobox and extract the birthplace
-#' If there is no infobox, 0 will be returned as
-#' birthplace is hard to extract from text
+#' If there is no infobox, try to search text for
+#' first occurrence of an NORP entity
 #'
 #' @export
 #' @param article Article in HTML-format
-#' @return String with birthplace of the physicist|0
-get_birthplace <- function(article) {
-
+#' @param annotations CNLP Annotations
+#' @return String with birthplace of the physicist|NA
+get_birthplace <- function(article, annotations) {
+  
   # If there is no infobox we return 0
-  if(!grepl("vcard", article)) {
-      return(0)
-  }
-
-  # Use infobox to get Birthplace
-  infoBox <- get_infobox(article)
-
-  # Get 'Born' field
-  birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
+  if(grepl("vcard", article)) {
+      
+    # Use infobox to get Birthplace
+    infoBox <- get_infobox(article)
+    
+    # Assumption: In most cases birthplace is after newline in Born
+    # field of infobox
+    # - Get 'Born' field
+    # - Remove everything in front of the "\n"
+    # - Rest is birthplace in most cases
+    birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
+    birthplace <- gsub(".*\\\n", "", birthplace)
 
-  # Remove everything in front of the "\n"
-  # Rest is birthplace
-  birthplace <- gsub(".*\\\n", "", birthplace)
-
-  # return birthplace
-  return(birthplace)
+      
+  } else {
+    
+    # Try to extract birthplace from text as first NORP entity
+    # Imprecise, because it is mostly the nationality of the physicist
+    # - Get named entities
+    # - Get NORP entities
+    # - Get first NORP
+    entities <- cnlp_get_entity(annotations)
+    norps <- entities[entities$entity_type == "NORP",]
+    birthplace <- norps$entity[1]
+    
+  }
+  
+  # Trim whitespaces
+  birthplace <- trimws(birthplace)
+  
+  # Return birthplace or NA  
+  if(!is.null(birthplace)) {
+    return(birthplace)
+  } else {
+    return(NA)
+  }
 }
-- 
GitLab