### get_spouse.R ### This extracts the spouse of the physicist entity # Author: David ### Test case to remember: Louis Walter Alvarez ### TODO: Maybe match pronouns? #' Reads the spouse of an physicist from text if possible #' #' @param article Cleaned article #' @param annotation Annotation object #' #' @return Name of the spouse of the #' @export #' #' @examples #' \dontrun{ #' articles <- get_data() #' #' spouses <- get_spouse(articles$Text[54], annotation) #' #' spouses #' } get_spouse <- function(physicist, annotation){ # We need this later entities <- cleanNLP::cnlp_get_entity(annotation) entities <- entities[entities$entity_type == "PERSON", ]$entity # Build parameter objects words <- c("* marry * *", "* marry * *", "* * marry * *", "* marry * *", "* * marry * *", "* marry * * *", "* * marry * * *") tags <- c("PROPN * PROPN PROPN", "NOUN * PROPN PROPN", "PROPN PROPN * PROPN PROPN", "* * * *", "* * * * *", "* * * * *", "* * * * * *") physicist.word.positions <- list(c(1, 1), c(1, 1), c(1, 2), c(1, 1), c(1, 2), c(1, 1), c(1, 2)) spouse.word.positions <- list(c(3, 4), c(3, 4), c(4, 5), c(3, 4), c(4, 5), c(4, 6), c(5, 7)) # Apply all the matchings to our data # First off we try matching exact patterns, as this will yield the best precision # While names are proper nouns sometimes if there is only one name before a verb (and especially at the start of a sentence) # this does not allways get tagged correct, so we check for noun too # Two words before should be recognized as propn so we won't bother checking for NOUN # If we don't find anything this way, we try and match without specific pos tags results <- sapply(seq_along(words), function(x) { spouse <- match_spouse(words[x], tags[x], physicist, physicist.word.positions[[x]], spouse.word.positions[[x]], annotation, entities) }) if(!all(is.na(results))) { # We got a match return(na.omit(results)[1]) } return(NA) } #' Interna function to match and extract a spouse from text #' #' @param words Tokens to match for #' @param tags **Optional** POS tags to match for #' @param physicist Name of the physicist for whos spouse we're looking #' @param physicist.position Where to extract the physicist entity from #' @param spouse.position Where to extract the spouse entity from #' @param annotation Anootation object #' @param people List to match found spouse against #' #' @return Spouse if found NA if not #' @examples #' \dontrun{ #' # Bad example #' spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron", c(1, 1), c(3, 4), entities) #' #' spouses #' } match_spouse <- function(words, tags, physicist, physicist.position, spouse.position, annotation, people) { # If there is no tags parameter we need to create the vector ourselves # Will be fixed upstream in match_pattern so we won't need to if (missing(tags)) { tags <- paste(rep("*", stringr::str_count(words, stringr::fixed(" ")) + 1)) } # Match pattern spouse <- match_pattern(annotation, words, tags, ignore.case = TRUE, use.stems = TRUE, ignore.punct = TRUE) # TODO: Multiple result handling is broken right now, will fix soon if (!is.na(spouse) && length(spouse) > 1) { spouse <- spouse[1] } # We check if the physicist in the matched sentence is the one we're looking for if (!is.na(spouse) && adist(stringr::word(spouse, physicist.position[1], physicist.position[2]), physicist, partial = TRUE) <= 3) { spouse <- stringr::word(spouse, spouse.position[1], spouse.position[2]) # Checking if the result is actualy a person. # This is ugly. Possibly better way: # tolower(spouse) %in% tolower(people) if (!is.na(any(grepl(spouse, people, ignore.case = TRUE, fixed = TRUE))) && any(grepl(spouse, people, ignore.case = TRUE))) { return(tools::toTitleCase(spouse)) } } return(NA) }