Skip to content
Snippets Groups Projects
Commit 9daf7047 authored by Lukas Gehrke
Browse files

Adds use of annotations to get_birthplace.R.

parent d31acde2
No related branches found
No related tags found
1 merge request: !44 Adds use of annotations to get_birthplace.R.
......@@ -4,29 +4,50 @@
#' This script extracts Birthplace from physicist texts
#' Try to get the infobox and extract the birthplace
#' If there is no infobox, try to search text for
#' first occurrence of a NORP entity
#'
#' Only the first candidate is returned, with surrounding whitespace
#' trimmed. If no candidate can be found (no "Born" field in the infobox,
#' no NORP entity in the annotations, or no annotations supplied), NA is
#' returned.
#'
#' @export
#' @param article Article in HTML-format
#' @param annotations CNLP Annotations. Only used when the article has no
#'   infobox. Defaults to NULL, in which case no text fallback is attempted.
#' @return String with birthplace of the physicist|NA
get_birthplace <- function(article, annotations = NULL) {
  if (grepl("vcard", article)) {
    # Use infobox to get Birthplace
    infoBox <- get_infobox(article)
    # Assumption: In most cases birthplace is after newline in Born
    # field of infobox
    # - Get 'Born' field
    # - Remove everything in front of the "\n"
    # - Rest is birthplace in most cases
    birthplace <- infoBox[infoBox$Desc %like% "Born", ]$Content
    birthplace <- gsub(".*\\\n", "", birthplace)
  } else if (!is.null(annotations)) {
    # Try to extract birthplace from text as first NORP entity
    # Not beautiful, because it mostly is Nationality of the physicist
    # - Get named entities
    # - Get NORP entities (%in% never yields NA, so rows with a missing
    #   entity_type are dropped instead of turning into all-NA rows)
    entities <- cnlp_get_entity(annotations)
    norps <- entities[entities$entity_type %in% "NORP", ]
    birthplace <- norps$entity
  } else {
    # Neither an infobox nor annotations: nothing to extract from
    birthplace <- NULL
  }

  # Normalise to character so factor columns and NULL behave the same way
  birthplace <- as.character(birthplace)

  # A missing "Born" row gives character(0) and an empty NORP set gives
  # no entity at all; neither is NULL, so test length and NA explicitly.
  if (length(birthplace) == 0 || is.na(birthplace[1])) {
    return(NA)
  }

  # Keep the first candidate and trim whitespaces
  trimws(birthplace[1])
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment