From dd64253d00a224eed3e6926df99694e7a11ad319 Mon Sep 17 00:00:00 2001
From: David Fuhry <>
Date: Sat, 26 Jan 2019 16:28:27 +0100
Subject: [PATCH] Resolve "Intend: Make spouses intend get spouse instead of
 number of spouses"

 processing/script/master.R                   |  14 +-
 processing/wikiproc/NAMESPACE                |   2 +-
 processing/wikiproc/R/get_no_of_spouses.R    |  44 ------
 processing/wikiproc/R/get_spouse.R           | 133 +++++++++++++++++++
 processing/wikiproc/man/get_no_of_spouses.Rd |  26 ----
 processing/wikiproc/man/get_spouse.Rd        |  28 ++++
 processing/wikiproc/man/match_spouse.Rd      |  38 ++++++
 processing/wikiproc/tests/.DS_Store          | Bin 0 -> 6148 bytes
 rasa/                              |  18 +--
 rasa/domain.yml                              |   8 +-
 rasa/                                  |   9 +-
 rasa/                              |   8 +-
 12 files changed, 226 insertions(+), 102 deletions(-)
 delete mode 100755 processing/wikiproc/R/get_no_of_spouses.R
 create mode 100755 processing/wikiproc/R/get_spouse.R
 delete mode 100644 processing/wikiproc/man/get_no_of_spouses.Rd
 create mode 100644 processing/wikiproc/man/get_spouse.Rd
 create mode 100644 processing/wikiproc/man/match_spouse.Rd
 create mode 100644 processing/wikiproc/tests/.DS_Store

diff --git a/processing/script/master.R b/processing/script/master.R
index 202ebd2..a70ddf8 100755
--- a/processing/script/master.R
+++ b/processing/script/master.R
@@ -31,7 +31,7 @@ results <- pbapply(articles, 1, function(article) {
   ## Data cleaning
-  cleaned_text <- wikiproc::clean_html(article[4])
+  cleaned.text <- clean_html(article[4])
   ## Data preprocessing/annotating
@@ -39,24 +39,18 @@ results <- pbapply(articles, 1, function(article) {
   ## Extract information from Text
-  no.spouses <- get_no_of_spouses(article[4])
+  # get_spouse() compares its first argument with the name found in the matched
+  # sentence, so it needs the physicist's name (article[1], the value stored in
+  # the name column below) and not the article HTML (article[4]).
+  spouse_found <- get_spouse(article[1], annotation)
   awards <- get_awards(annotation)
-  # NOTE: Passes annotation though the changes needing this are not yet merged
-  # bplace <- get_birthplace(cleaned_text)
-  # bdate <- get_birthdate(cleaned_text)
   ## Create Results
+  # The column stays lower-case "name": the rasa actions in this patch look the
+  # physicist up via row['name'].
   data.frame(name = article[1],
+             spouse = spouse_found,
              birthplace = NA,
              birthdate = NA,
              day_of_death = NA,
              place_of_death = NA,
              is_alive = NA,
-             num_spouses = no.spouses,
              primary_education = NA,
              university = NA,
              area_of_research = NA,
diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
index 21f89c5..007f345 100644
--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -6,7 +6,7 @@ export(get_awards)
diff --git a/processing/wikiproc/R/get_no_of_spouses.R b/processing/wikiproc/R/get_no_of_spouses.R
deleted file mode 100755
index 5b688b2..0000000
--- a/processing/wikiproc/R/get_no_of_spouses.R
+++ /dev/null
@@ -1,44 +0,0 @@
-### GetNoOfSpouses.R
-### This extracts the number of spouses from the infobox
-### If no infobox or no information about spouses is found assumes there are none
-### Not for use in production, this does not actually get information from text
-# Author: David
-#' Reads the number of spouses from the infobox of an wikipedia article
-#' @param article Wikipedia article in html format
-#' @return Integer indicating the number of spouses
-#' @export
-#' @examples
-#' \dontrun{
-#' articles <- get_data()
-#' no.spouses <- get_no_of_spouses(articles$Text[54])
-#' no.spouses
-#' }
-get_no_of_spouses <- function(article) {
-  # If there is no infobox we assume there were no spouses
-  if(!grepl("vcard", article)) {
-    return(0)
-  }
-  infoBox <- get_infobox(article)
-  # Get the spouse field
-  spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
-  # Remove everything in parentheses
-  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
-  # Split the strings by newlines to get one spouse per line
-  spouses <- strsplit(spouses, "\n")
-  spouses <- unlist(spouses)
-  if(length(spouses) > 0) {
-    no.spouses <- length(spouses)
-  } else {
-    no.spouses <- 0
-  }
diff --git a/processing/wikiproc/R/get_spouse.R b/processing/wikiproc/R/get_spouse.R
new file mode 100755
index 0000000..ff6c1cf
--- /dev/null
+++ b/processing/wikiproc/R/get_spouse.R
@@ -0,0 +1,133 @@
+### get_spouse.R
+### This extracts the spouse of the physicist entity
+# Author: David
+### Test case to remember: Louis Walter Alvarez
+### TODO: Maybe match pronouns?
+#' Reads the spouse of a physicist from text if possible
+#'
+#' Tries a list of "X marry Y" token patterns, most specific first, and
+#' returns the first spouse that \code{match_spouse} accepts.
+#'
+#' @param physicist Name of the physicist whose spouse is looked up
+#' @param annotation Annotation object
+#' @return Name of the spouse of the physicist, NA if none was found
+#' @export
+#' @examples
+#' \dontrun{
+#' # annotation: annotation object created from the physicist's article
+#' spouse <- get_spouse("Louis Walter Alvarez", annotation)
+#' spouse
+#' }
+get_spouse <- function(physicist, annotation) {
+  # Person entities are needed later to verify that a match is a person
+  entities <- cleanNLP::cnlp_get_entity(annotation)
+  entities <- entities[entities$entity_type == "PERSON", ]$entity
+
+  # Build parameter objects. Element i of each object belongs to attempt i:
+  # token pattern, POS tag pattern and the word ranges c(first, last) of the
+  # physicist and of the spouse inside the matched text.
+  words <- c("* marry * *",
+             "* marry * *",
+             "* * marry * *",
+             "* marry * *",
+             "* * marry * *",
+             "* marry * * *",
+             "* * marry * * *")
+  tags <- c("PROPN * PROPN PROPN",
+            "NOUN * PROPN PROPN",
+            "PROPN PROPN * PROPN PROPN",
+            "* * * *",
+            "* * * * *",
+            "* * * * *",
+            "* * * * * *")
+  physicist.word.positions <- list(c(1, 1),
+                                   c(1, 1),
+                                   c(1, 2),
+                                   c(1, 1),
+                                   c(1, 2),
+                                   c(1, 1),
+                                   c(1, 2))
+  # The spouse starts right after "marry". For the two three-word patterns
+  # that is words 3-5 and 4-6; the ranges c(4, 6) and c(5, 7) used before
+  # pointed past the end of the matched text.
+  spouse.word.positions <- list(c(3, 4),
+                                c(3, 4),
+                                c(4, 5),
+                                c(3, 4),
+                                c(4, 5),
+                                c(3, 5),
+                                c(4, 6))
+
+  # Apply all the matchings to our data
+  # First off we try matching exact patterns, as this will yield the best precision
+  # While names are proper nouns, a single name before a verb (especially at the
+  # start of a sentence) does not always get tagged correctly, so we check for
+  # NOUN too
+  # Two words before should be recognized as PROPN so we won't bother checking for NOUN
+  # If we don't find anything this way, we try and match without specific POS tags
+  results <- vapply(seq_along(words), function(i) {
+    as.character(match_spouse(words[i], tags[i], physicist,
+                              physicist.word.positions[[i]],
+                              spouse.word.positions[[i]],
+                              annotation, entities))[1]
+  }, character(1))
+
+  # The first non-NA result comes from the most specific pattern that matched
+  results <- results[!is.na(results)]
+  if (length(results) > 0) {
+    return(results[1])
+  }
+  NA
+}
+#' Internal function to match and extract a spouse from text
+#'
+#' @param words Tokens to match for
+#' @param tags **Optional** POS tags to match for; defaults to one wildcard
+#'   per token in \code{words}
+#' @param physicist Name of the physicist whose spouse we're looking for
+#' @param physicist.position Word range c(first, last) to extract the physicist entity from
+#' @param spouse.position Word range c(first, last) to extract the spouse entity from
+#' @param annotation Annotation object
+#' @param people Character vector of person entities to match the found spouse against
+#' @return Spouse if found, NA if not
+#' @examples
+#' \dontrun{
+#' # Bad example
+#' spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron", c(1, 1), c(3, 4),
+#'                        annotation, entities)
+#' spouse
+#' }
+match_spouse <- function(words, tags, physicist, physicist.position, spouse.position, annotation, people) {
+  # If there is no tags parameter we need to create the pattern ourselves:
+  # one "*" per token, collapsed into a single string like `words`
+  # Will be fixed upstream in match_pattern so we won't need to
+  if (missing(tags)) {
+    tags <- paste(rep("*", stringr::str_count(words, stringr::fixed(" ")) + 1),
+                  collapse = " ")
+  }
+
+  # Match pattern
+  # NOTE(review): the name of the fourth argument was missing from the patch
+  # text; `ignore.case` is assumed here - confirm against match_pattern()
+  matched <- match_pattern(annotation, words, tags, ignore.case = TRUE,
+                           use.stems = TRUE, ignore.punct = TRUE)
+
+  # Nothing matched (NULL, empty or NA result)
+  if (length(matched) == 0 || is.na(matched[1])) {
+    return(NA)
+  }
+  # TODO: Multiple result handling is broken right now, only the first match is used
+  matched <- matched[1]
+
+  # We check if the physicist in the matched sentence is the one we're looking for
+  # isTRUE() turns an NA distance (range outside the matched text) into "no match"
+  matched.physicist <- stringr::word(matched, physicist.position[1], physicist.position[2])
+  distance <- adist(matched.physicist, physicist, partial = TRUE)
+  if (!isTRUE(distance[1] <= 3)) {
+    return(NA)
+  }
+
+  spouse <- stringr::word(matched, spouse.position[1], spouse.position[2])
+  if (is.na(spouse) || !nzchar(spouse)) {
+    return(NA)
+  }
+
+  # Checking if the result is actually a person: case-insensitive substring
+  # test against the person entities. fixed = TRUE keeps regex metacharacters
+  # in a name from being interpreted as a pattern.
+  if (any(grepl(tolower(spouse), tolower(people), fixed = TRUE))) {
+    return(tools::toTitleCase(spouse))
+  }
+  NA
+}
diff --git a/processing/wikiproc/man/get_no_of_spouses.Rd b/processing/wikiproc/man/get_no_of_spouses.Rd
deleted file mode 100644
index bd8a78e..0000000
--- a/processing/wikiproc/man/get_no_of_spouses.Rd
+++ /dev/null
@@ -1,26 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/get_no_of_spouses.R
-\title{Reads the number of spouses from the infobox of an wikipedia article}
-\item{article}{Wikipedia article in html format}
-Integer indicating the number of spouses
-Reads the number of spouses from the infobox of an wikipedia article
-articles <- get_data()
-no.spouses <- get_no_of_spouses(articles$Text[54])
diff --git a/processing/wikiproc/man/get_spouse.Rd b/processing/wikiproc/man/get_spouse.Rd
new file mode 100644
index 0000000..2d32df2
--- /dev/null
+++ b/processing/wikiproc/man/get_spouse.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_spouse.R
+\title{Reads the spouse of a physicist from text if possible}
+get_spouse(physicist, annotation)
+\item{annotation}{Annotation object}
+\item{physicist}{Name of the physicist whose spouse is looked up}
+Name of the spouse of the physicist, NA if none was found
+Reads the spouse of a physicist from text if possible
+articles <- get_data()
+spouses <- get_spouse(articles$Text[54], annotation)
diff --git a/processing/wikiproc/man/match_spouse.Rd b/processing/wikiproc/man/match_spouse.Rd
new file mode 100644
index 0000000..1c173bf
--- /dev/null
+++ b/processing/wikiproc/man/match_spouse.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_spouse.R
+\title{Internal function to match and extract a spouse from text}
+match_spouse(words, tags, physicist, physicist.position, spouse.position,
+  annotation, people)
+\item{words}{Tokens to match for}
+\item{tags}{**Optional** POS tags to match for}
+\item{physicist}{Name of the physicist whose spouse we're looking for}
+\item{physicist.position}{Where to extract the physicist entity from}
+\item{spouse.position}{Where to extract the spouse entity from}
+\item{annotation}{Annotation object}
+\item{people}{List to match found spouse against}
+Spouse if found, NA if not
+Internal function to match and extract a spouse from text
+# Bad example
+spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron", c(1, 1), c(3, 4), annotation, entities)
diff --git a/processing/wikiproc/tests/.DS_Store b/processing/wikiproc/tests/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..07c0a46c0bda99e479d4022d7b93670751f0e29a
GIT binary patch
literal 6148

literal 0

diff --git a/rasa/ b/rasa/
index af2b480..0ca0175 100644
--- a/rasa/
+++ b/rasa/
@@ -192,34 +192,34 @@ class ActionUtterIsAlive(Action):
 # num spouses
-class ActionSearchNumSpouses(Action):
+class ActionSearchSpouse(Action):
+    # Looks up the spouse of the physicist held in the 'physicist' slot in
+    # data.tsv and stores it in the 'spouse' slot ("not known" if missing).
     def name(self):
-        return 'action_search_num_spouses'
+        return 'action_search_spouse'
     def run(self, dispatcher, tracker, domain):
         import csv
         import re
         person = tracker.get_slot('physicist')
-        name_regex = re.compile(person, re.IGNORECASE)
-        actual_num_spouses = None
+        # Without a physicist there is nothing to look up
+        if not person:
+            return [SlotSet('spouse', "not known")]
+        # The slot value is user input, not a pattern: escape it so characters
+        # such as '.' or '(' in a name are matched literally
+        name_regex = re.compile(re.escape(person), re.IGNORECASE)
+        actual_spouse = None
         with open('data.tsv') as csvfile:
             spamreader = csv.DictReader(csvfile, delimiter='\t')
             for row in spamreader:
                 if name_regex.match(row['name']):
-                    actual_num_spouses = row['num_spouses']
-        return [SlotSet('num_spouses', actual_num_spouses 
-            if actual_num_spouses is not None and actual_num_spouses is not "" 
-            else "not known")]
+                    actual_spouse = row['spouse']
+        # Compare by value: `is not ""` tests object identity, not emptiness.
+        # "NA" is presumably how the R pipeline writes a missing spouse -
+        # confirm against the code that produces data.tsv.
+        known = actual_spouse not in (None, "", "NA")
+        return [SlotSet('spouse', actual_spouse if known else "not known")]
 class ActionUtterNumSpouses(Action):
+    # Tells the user the spouse stored in the 'spouse' slot for the physicist
+    # stored in the 'physicist' slot.
+    # NOTE(review): the class name still says "NumSpouses" although the action
+    # is now 'action_utter_spouse'; renaming it may affect code outside this
+    # patch, so it is left unchanged here.
     def name(self):
-        return 'action_utter_num_spouses'
+        return 'action_utter_spouse'
     def run(self, dispatcher, tracker, domain):
         person = tracker.get_slot('physicist')
-        num_spouses = tracker.get_slot('num_spouses')
-        dispatcher.utter_message("The Number of spouses of {} is {}.".format(person, num_spouses))
+        spouse = tracker.get_slot('spouse')
+        dispatcher.utter_message("The spouse of {} is {}.".format(person, spouse))
         return []
diff --git a/rasa/domain.yml b/rasa/domain.yml
index 2221181..dddc1af 100644
--- a/rasa/domain.yml
+++ b/rasa/domain.yml
@@ -6,7 +6,7 @@ intents:
   - day_of_death
   - place_of_death
   - is_alive
-  - num_spouses
+  - spouse
   - primary_education
   - university
   - area_of_research
@@ -26,8 +26,8 @@ actions:
   - action_utter_place_of_death
   - action_search_is_alive
   - action_utter_is_alive
-  - action_search_num_spouses
-  - action_utter_num_spouses
+  - action_search_spouse
+  - action_utter_spouse
   - action_search_primary_education
   - action_utter_primary_education
   - action_search_university
@@ -55,7 +55,7 @@ slots:
     type: unfeaturized
     type: unfeaturized
-  num_spouses:
+  spouse:
     type: unfeaturized
     type: unfeaturized
diff --git a/rasa/ b/rasa/
index b7cfff8..3b9869d 100644
--- a/rasa/
+++ b/rasa/
@@ -28,10 +28,11 @@
 - is [Albert Einstein](physicist) still alive
 - is [Galileo Galilei](physicist) dead
-## intent:num_spouses
-- how often was [Albert Einstein](physicist) married
-- was [Galileo Galilei](physicist) married
-- how many spouses did have [Albert Einstein](physicist)
+## intent:spouse
+- to whom was [Albert Einstein](physicist) married
+- who was [Galileo Galilei](physicist) spouse
+- who was [Albert Einstein](physicist) wife
+- who was [Galileo Galilei](physicist) husband
 ## intent:primary_education
 - where did [Galileo Galilei](physicist) go to school
diff --git a/rasa/ b/rasa/
index a6defb5..e624184 100644
--- a/rasa/
+++ b/rasa/
@@ -31,10 +31,10 @@
   - action_search_is_alive
   - action_utter_is_alive
-## num_spouses
-* num_spouses{"physicist": "albert einstein"}
-  - action_search_num_spouses
-  - action_utter_num_spouses
+## spouse
+* spouse{"physicist": "albert einstein"}
+  - action_search_spouse
+  - action_utter_spouse
 ## primary_education
 * primary_education{"physicist": "albert einstein"}