get_spouse.R

### get_spouse.R
### This extracts the spouse of the physicist entity

# Author: David


### Test case to remember: Louis Walter Alvarez
### TODO: Maybe match pronouns?


#' Reads the spouse of an physicist from text if possible
#'
#' @param article Cleaned article
#' @param annotation Annotation object
#'
#' @return Name of the spouse of the
#' @export
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' spouses <- get_spouse(articles$Text[54], annotation)
#'
#' spouses
#' }
get_spouse <- function(physicist, annotation){

  # We need this later
  entities <- cleanNLP::cnlp_get_entity(annotation)
  entities <- entities[entities$entity_type == "PERSON", ]$entity
  
  # Build parameter objects
  words <- c("* marry * *",
             "* marry * *",
             "* * marry * *",
             "* marry * *",
             "* * marry * *",
             "* marry * * *",
             "* * marry * * *")
  tags <- c("PROPN * PROPN PROPN",
            "NOUN * PROPN PROPN",
            "PROPN PROPN * PROPN PROPN",
            "* * * *",
            "* * * * *",
            "* * * * *",
            "* * * * * *")
  physicist.word.positions <- list(c(1, 1),
                                c(1, 1),
                                c(1, 2),
                                c(1, 1),
                                c(1, 2),
                                c(1, 1),
                                c(1, 2))
  spouse.word.positions <- list(c(3, 4),
                             c(3, 4),
                             c(4, 5),
                             c(3, 4),
                             c(4, 5),
                             c(4, 6),
                             c(5, 7))

  # Apply all the matchings to our data
  # First off we try matching exact patterns, as this will yield the best precision
  # While names are proper nouns sometimes if there is only one name before a verb (and especially at the start of a sentence)
  # this does not allways get tagged correct, so we check for noun too
  # Two words before should be recognized as propn so we won't bother checking for NOUN
  # If we don't find anything this way, we try and match without specific pos tags

  results <- sapply(seq_along(words), function(x) {
    spouse <- match_spouse(words[x], tags[x], physicist, physicist.word.positions[[x]], 
                           spouse.word.positions[[x]], annotation, entities)
  })
  
  if(!all(is.na(results))) {
    # We got a match
    return(na.omit(results)[1])
  }
  

  return(NA)
}

#' Interna function to match and extract a spouse from text
#'
#' @param words Tokens to match for
#' @param tags **Optional** POS tags to match for
#' @param physicist Name of the physicist for whos spouse we're looking
#' @param physicist.position Where to extract the physicist entity from
#' @param spouse.position Where to extract the spouse entity from
#' @param annotation Anootation object
#' @param people List to match found spouse against
#'
#' @return Spouse if found NA if not
#' @examples
#' \dontrun{
#' # Bad example
#' spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron", c(1, 1), c(3, 4), entities)
#'
#' spouses
#' }
match_spouse <- function(words, tags, physicist, physicist.position, spouse.position, annotation, people) {
  # If there is no tags parameter we need to create the vector ourselves
  # Will be fixed upstream in match_pattern so we won't need to
  if (missing(tags)) {
    tags <- paste(rep("*", stringr::str_count(words, stringr::fixed(" ")) + 1))
  }
  
  # Match pattern
  spouse <- match_pattern(annotation, words, tags, ignore.case = TRUE, use.stems = TRUE, ignore.punct = TRUE)
  # TODO: Multiple result handling is broken right now, will fix soon
  if (!is.na(spouse) && length(spouse) > 1) {
    spouse <- spouse[1]
  }
  
  # We check if the physicist in the matched sentence is the one we're looking for
  if (!is.na(spouse) && 
      adist(stringr::word(spouse, physicist.position[1], physicist.position[2]), physicist, partial = TRUE) <= 3) {
    spouse <- stringr::word(spouse, spouse.position[1], spouse.position[2])
    
    # Checking if the result is actualy a person.
    # This is ugly. Possibly better way:
    # tolower(spouse) %in% tolower(people)
    if (!is.na(any(grepl(spouse, people, ignore.case = TRUE, fixed = TRUE))) && 
        any(grepl(spouse, people, ignore.case = TRUE))) {
      return(tools::toTitleCase(spouse))
    }
  }
  return(NA)
  
}