Skip to content
Snippets Groups Projects
Commit 10013fb8 authored by Lucas Schons
Browse files

Merge branch '46-enhancing-r-optimize-getbirthplace-r' into 'master'

Adds use of annotations to get_birthplace.R.

Closes #46

See merge request !44
parents 79e281db d402c31e
No related branches found
No related tags found
1 merge request: !44 Adds use of annotations to get_birthplace.R.
......@@ -4,29 +4,50 @@
#' This script extracts Birthplace from physicist texts
#' Try to get the infobox and extract the birthplace
#' If there is no infobox, NA will be returned as
#' birthplace is hard to extract from text
#' If there is no infobox, try to search text for
#' first occurrence of an NORP entity
#'
#' @export
#' @param article Article in HTML-format
#' @return String with birthplace of the physicist|0
get_birthplace <- function(article) {
#' @param annotations CNLP Annotations
#' @return String with birthplace of the physicist|NA
get_birthplace <- function(article, annotations) {
# If there is no infobox we return 0
if(!grepl("vcard", article)) {
return(NA)
}
# Use infobox to get Birthplace
infoBox <- get_infobox(article)
# Get 'Born' field
birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
if(grepl("vcard", article)) {
# Use infobox to get Birthplace
infoBox <- get_infobox(article)
# Assumption: In most cases birthplace is after newline in Born
# field of infobox
# - Get 'Born' field
# - Remove everything in front of the "\n"
# - Rest is birthplace in most cases
birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
birthplace <- gsub(".*\\\n", "", birthplace)
# Remove everything in front of the "\n"
# Rest is birthplace
birthplace <- gsub(".*\\\n", "", birthplace)
# return birthplace
return(birthplace)
} else {
# Try to extract birthplace from text as first NORP entity
# Not beautiful, because it mostly is Nationality of the physicist
# - Get named entities
# - Get NORP entities
# - Get first NORP
entities <- cnlp_get_entity(annotations)
norps <- entities[entities$entity_type == "NORP",]
birthplace <- norps$entity[1]
}
# Trim whitespaces
birthplace <- trimws(birthplace)
# Return birthplace or NA
if(!is.null(birthplace)) {
return(birthplace)
} else {
return(NA)
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment