From dd64253d00a224eed3e6926df99694e7a11ad319 Mon Sep 17 00:00:00 2001
From: David Fuhry <df15nocu@studserv.uni-leipzig.de>
Date: Sat, 26 Jan 2019 16:28:27 +0100
Subject: [PATCH] Resolve "Intend: Make spouses intend get spouse instead of
 number of spouses"

---
 processing/script/master.R                   |  14 +-
 processing/wikiproc/NAMESPACE                |   2 +-
 processing/wikiproc/R/get_no_of_spouses.R    |  44 ------
 processing/wikiproc/R/get_spouse.R           | 133 +++++++++++++++++++
 processing/wikiproc/man/get_no_of_spouses.Rd |  26 ----
 processing/wikiproc/man/get_spouse.Rd        |  28 ++++
 processing/wikiproc/man/match_spouse.Rd      |  38 ++++++
 processing/wikiproc/tests/.DS_Store          | Bin 0 -> 6148 bytes
 rasa/actions.py                              |  18 +--
 rasa/domain.yml                              |   8 +-
 rasa/nlu.md                                  |   9 +-
 rasa/stories.md                              |   8 +-
 12 files changed, 226 insertions(+), 102 deletions(-)
 delete mode 100755 processing/wikiproc/R/get_no_of_spouses.R
 create mode 100755 processing/wikiproc/R/get_spouse.R
 delete mode 100644 processing/wikiproc/man/get_no_of_spouses.Rd
 create mode 100644 processing/wikiproc/man/get_spouse.Rd
 create mode 100644 processing/wikiproc/man/match_spouse.Rd
 create mode 100644 processing/wikiproc/tests/.DS_Store

diff --git a/processing/script/master.R b/processing/script/master.R
index 202ebd2..a70ddf8 100755
--- a/processing/script/master.R
+++ b/processing/script/master.R
@@ -31,7 +31,7 @@ results <- pbapply(articles, 1, function(article) {
 
   ## Data cleaning
 
-  cleaned_text <- wikiproc::clean_html(article[4])
+  cleaned.text <- clean_html(article[4])
 
   ## Data preprocessing/annotating
 
@@ -39,24 +39,18 @@ results <- pbapply(articles, 1, function(article) {
 
   ## Extract information from Text
 
-  no.spouses <- get_no_of_spouses(article[4])
+  spouse_found <- get_spouse(article[4], annotation)
   awards <- get_awards(annotation)
 
-
-  # NOTE: Passes annotation though the changes needing this are not yet merged
-  # bplace <- get_birthplace(cleaned_text)
-
-  # bdate <- get_birthdate(cleaned_text)
-
   ## Create Results
 
-  data.frame(name = article[1],
+  data.frame(Name = article[1],
+             spouse = spouse_found,
              birthplace = NA,
              birthdate = NA,
              day_of_death = NA,
              place_of_death = NA,
              is_alive = NA,
-             num_spouses = no.spouses,
              primary_education = NA,
              university = NA,
              area_of_research = NA,
diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
index 21f89c5..007f345 100644
--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -6,7 +6,7 @@ export(get_awards)
 export(get_birthdate)
 export(get_birthplace)
 export(get_data)
-export(get_no_of_spouses)
+export(get_spouse)
 export(init_nlp)
 importFrom(data.table,"%like%")
 importFrom(magrittr,"%>%")
diff --git a/processing/wikiproc/R/get_no_of_spouses.R b/processing/wikiproc/R/get_no_of_spouses.R
deleted file mode 100755
index 5b688b2..0000000
--- a/processing/wikiproc/R/get_no_of_spouses.R
+++ /dev/null
@@ -1,44 +0,0 @@
-### GetNoOfSpouses.R
-### This extracts the number of spouses from the infobox
-### If no infobox or no information about spouses is found assumes there are none
-### Not for use in production, this does not actually get information from text
-
-# Author: David
-
-#' Reads the number of spouses from the infobox of an wikipedia article
-#'
-#' @param article Wikipedia article in html format
-#'
-#' @return Integer indicating the number of spouses
-#' @export
-#'
-#' @examples
-#' \dontrun{
-#' articles <- get_data()
-#'
-#' no.spouses <- get_no_of_spouses(articles$Text[54])
-#'
-#' no.spouses
-#' }
-get_no_of_spouses <- function(article) {
-
-  # If there is no infobox we assume there were no spouses
-  if(!grepl("vcard", article)) {
-    return(0)
-  }
-
-  infoBox <- get_infobox(article)
-
-  # Get the spouse field
-  spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
-  # Remove everything in parentheses
-  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
-  # Split the strings by newlines to get one spouse per line
-  spouses <- strsplit(spouses, "\n")
-  spouses <- unlist(spouses)
-  if(length(spouses) > 0) {
-    no.spouses <- length(spouses)
-  } else {
-    no.spouses <- 0
-  }
-}
diff --git a/processing/wikiproc/R/get_spouse.R b/processing/wikiproc/R/get_spouse.R
new file mode 100755
index 0000000..ff6c1cf
--- /dev/null
+++ b/processing/wikiproc/R/get_spouse.R
@@ -0,0 +1,133 @@
+### get_spouse.R
+### This extracts the spouse of the physicist entity
+
+# Author: David
+
+
+### Test case to remember: Louis Walter Alvarez
+### TODO: Maybe match pronouns?
+
+
+#' Reads the spouse of a physicist from text if possible
+#'
+#' @param physicist Name of the physicist whose spouse is looked up
+#' @param annotation Annotation object
+#'
+#' @return Name of the spouse from the first pattern that matched, NA if none did
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' spouses <- get_spouse(articles$Text[54], annotation)
+#'
+#' spouses
+#' }
+get_spouse <- function(physicist, annotation){
+  # NOTE(review): physicist is compared against the name found in the text - confirm callers pass a name, not the article
+  # PERSON entities of the annotation, match_spouse checks the found spouse against them
+  entities <- cleanNLP::cnlp_get_entity(annotation)
+  entities <- entities[entities$entity_type == "PERSON", ]$entity
+  
+  # Build parameter objects, entry i of each one describes the i-th matching attempt
+  words <- c("* marry * *",
+             "* marry * *",
+             "* * marry * *",
+             "* marry * *",
+             "* * marry * *",
+             "* marry * * *",
+             "* * marry * * *")
+  tags <- c("PROPN * PROPN PROPN",
+            "NOUN * PROPN PROPN",
+            "PROPN PROPN * PROPN PROPN",
+            "* * * *",
+            "* * * * *",
+            "* * * * *",
+            "* * * * * *")
+  physicist.word.positions <- list(c(1, 1),  # c(first, last) word of the physicist within the match
+                                c(1, 1),
+                                c(1, 2),
+                                c(1, 1),
+                                c(1, 2),
+                                c(1, 1),
+                                c(1, 2))
+  spouse.word.positions <- list(c(3, 4),  # c(first, last) word of the spouse within the match
+                             c(3, 4),
+                             c(4, 5),
+                             c(3, 4),
+                             c(4, 5),
+                             c(4, 6),
+                             c(5, 7))
+
+  # Apply all the patterns to our data.
+  # The patterns with exact POS tags come first, as they yield the best precision.
+  # Names are proper nouns, but a single name before a verb (especially at the start of a sentence)
+  # is not always tagged correctly, so we also check for NOUN in that case.
+  # Two words before the verb should be recognized as PROPN, so we don't bother checking for NOUN there.
+  # The remaining patterns match without specific POS tags and act as a fallback.
+
+  results <- sapply(seq_along(words), function(x) {
+    spouse <- match_spouse(words[x], tags[x], physicist, physicist.word.positions[[x]], 
+                           spouse.word.positions[[x]], annotation, entities)
+  })
+  
+  if(!all(is.na(results))) {
+    # At least one pattern matched, the first non-NA result wins
+    return(na.omit(results)[1])
+  }
+  
+
+  # None of the patterns produced a spouse
+  return(NA)
+}
+
+#' Internal function to match and extract a spouse from text
+#'
+#' @param words Tokens to match for
+#' @param tags **Optional** POS tags to match for, defaults to a wildcard for every token
+#' @param physicist Name of the physicist for whose spouse we're looking
+#' @param physicist.position Where to extract the physicist entity from, as c(first, last) word index
+#' @param spouse.position Where to extract the spouse entity from, as c(first, last) word index
+#' @param annotation Annotation object
+#' @param people Character vector of person names to match the found spouse against
+#'
+#' @return Spouse in title case if found, NA if not
+#' @examples
+#' \dontrun{
+#' # Bad example
+#' spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron", c(1, 1), c(3, 4), annotation, entities)
+#'
+#' spouse
+#' }
+match_spouse <- function(words, tags, physicist, physicist.position, spouse.position, annotation, people) {
+  # Without a tags parameter we build an all-wildcard tag pattern ("* * ... *") ourselves,
+  # one "*" per word of the pattern. Will be fixed upstream in match_pattern so we won't need to
+  if (missing(tags)) {
+    tags <- paste(rep("*", stringr::str_count(words, stringr::fixed(" ")) + 1), collapse = " ")
+  }
+  
+  # Match pattern
+  spouse <- match_pattern(annotation, words, tags, ignore.case = TRUE, use.stems = TRUE, ignore.punct = TRUE)
+  # TODO: Multiple result handling is broken right now, only the first match is used
+  if (length(spouse) > 1) {
+    spouse <- spouse[1]
+  }
+  
+  # We check if the physicist in the matched sentence is the one we're looking for
+  if (length(spouse) == 1 && !is.na(spouse) &&
+      adist(stringr::word(spouse, physicist.position[1], physicist.position[2]), physicist, partial = TRUE) <= 3) {
+    spouse <- stringr::word(spouse, spouse.position[1], spouse.position[2])
+    
+    # Checking if the result is actually a person: the candidate has to occur,
+    # ignoring case, as a literal substring of one of the known person entities.
+    # (fixed = TRUE ignores ignore.case, so both sides are lower-cased instead)
+    if (!is.na(spouse) &&
+        any(grepl(tolower(spouse), tolower(people), fixed = TRUE))) {
+      return(tools::toTitleCase(spouse))
+    }
+  }
+  return(NA)
+  
+}
+
diff --git a/processing/wikiproc/man/get_no_of_spouses.Rd b/processing/wikiproc/man/get_no_of_spouses.Rd
deleted file mode 100644
index bd8a78e..0000000
--- a/processing/wikiproc/man/get_no_of_spouses.Rd
+++ /dev/null
@@ -1,26 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/get_no_of_spouses.R
-\name{get_no_of_spouses}
-\alias{get_no_of_spouses}
-\title{Reads the number of spouses from the infobox of an wikipedia article}
-\usage{
-get_no_of_spouses(article)
-}
-\arguments{
-\item{article}{Wikipedia article in html format}
-}
-\value{
-Integer indicating the number of spouses
-}
-\description{
-Reads the number of spouses from the infobox of an wikipedia article
-}
-\examples{
-\dontrun{
-articles <- get_data()
-
-no.spouses <- get_no_of_spouses(articles$Text[54])
-
-no.spouses
-}
-}
diff --git a/processing/wikiproc/man/get_spouse.Rd b/processing/wikiproc/man/get_spouse.Rd
new file mode 100644
index 0000000..2d32df2
--- /dev/null
+++ b/processing/wikiproc/man/get_spouse.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_spouse.R
+\name{get_spouse}
+\alias{get_spouse}
+\title{Reads the spouse of a physicist from text if possible}
+\usage{
+get_spouse(physicist, annotation)
+}
+\arguments{
+\item{physicist}{Name of the physicist whose spouse is looked up}
+
+\item{annotation}{Annotation object}
+}
+\value{
+Name of the spouse from the first pattern that matched, NA if none did
+}
+\description{
+Reads the spouse of a physicist from text if possible
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+spouses <- get_spouse(articles$Text[54], annotation)
+
+spouses
+}
+}
diff --git a/processing/wikiproc/man/match_spouse.Rd b/processing/wikiproc/man/match_spouse.Rd
new file mode 100644
index 0000000..1c173bf
--- /dev/null
+++ b/processing/wikiproc/man/match_spouse.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_spouse.R
+\name{match_spouse}
+\alias{match_spouse}
+\title{Internal function to match and extract a spouse from text}
+\usage{
+match_spouse(words, tags, physicist, physicist.position, spouse.position,
+  annotation, people)
+}
+\arguments{
+\item{words}{Tokens to match for}
+
+\item{tags}{**Optional** POS tags to match for, defaults to a wildcard for every token}
+
+\item{physicist}{Name of the physicist for whose spouse we're looking}
+
+\item{physicist.position}{Where to extract the physicist entity from, as c(first, last) word index}
+
+\item{spouse.position}{Where to extract the spouse entity from, as c(first, last) word index}
+
+\item{annotation}{Annotation object}
+
+\item{people}{Character vector of person names to match the found spouse against}
+}
+\value{
+Spouse in title case if found, NA if not
+}
+\description{
+Internal function to match and extract a spouse from text
+}
+\examples{
+\dontrun{
+# Bad example
+spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron", c(1, 1), c(3, 4), annotation, entities)
+
+spouse
+}
+}
diff --git a/processing/wikiproc/tests/.DS_Store b/processing/wikiproc/tests/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..07c0a46c0bda99e479d4022d7b93670751f0e29a
GIT binary patch
literal 6148
zcmeHK%}T>S5Z-O8ZYp9Af<5lVTMrE_Ru4j`_25m2=s~4TXt9BsNRt{gR`MG9Mm~YB
z<IL{1SgIEfB4q|<zuEcOWxs^|VT|$iq;E4;VT=h-#DWFQH-d50B`FvWB3EN1Y{)#8
z_-TB<lueHR$N+vj!4fuLF|=R4pFfIxh`sywqaaS|^$%Vsme$rcgeZ&hR{6#qMyZ>4
z$s}%j<16f)dCAZ>_IVVI=C<cw1cS6&**T8F#0$c~SQQ8T0i;}C24O!++tDQK$7&o`
z2ZXSM)vZ*sta)J9WWzq3)@0Ud)oZfZ*q=@<vAbs<o%Ze?pR(ua%d2LW!;h$B+u#h|
zz*t)7gF6mKQFsfnS=}t3kQg8ah=CPmz-)P9bA{JPOCtt|f!{HJ`-22UbPVPi)z$$G
zULP@DLPP-@-x7$@pkpxC2oVsjO96E$H%|<%%fT;Ao?|fAsLL5wGs8G$W^P_6T+I%C
zsnQvDG*U|p5Ce-06m_?P=l{vi@BhUj>JbCPz)CT|OC7h<h9#M^b!l;U)=JPGC<^A~
m8s{ZosG}Hi@hGl>DgnPl1JE&;YXlDnT?7;j)DQ!I%D^X>u~Uoy

literal 0
HcmV?d00001

diff --git a/rasa/actions.py b/rasa/actions.py
index af2b480..0ca0175 100644
--- a/rasa/actions.py
+++ b/rasa/actions.py
@@ -192,34 +192,34 @@ class ActionUtterIsAlive(Action):
 #
 # num spouses
 #
-class ActionSearchNumSpouses(Action):
+class ActionSearchSpouse(Action):
     def name(self):
-        return 'action_search_num_spouses'
+        return 'action_search_spouse'
 
     def run(self, dispatcher, tracker, domain):
         import csv
         import re
         person = tracker.get_slot('physicist')
         name_regex = re.compile(person, re.IGNORECASE)
-        actual_num_spouses = None
+        actual_spouse= None
         with open('data.tsv') as csvfile:
             spamreader = csv.DictReader(csvfile, delimiter='\t')
             for row in spamreader:
                 if name_regex.match(row['name']):
-                    actual_num_spouses = row['num_spouses']
-        return [SlotSet('num_spouses', actual_num_spouses 
-            if actual_num_spouses is not None and actual_num_spouses is not "" 
+                    actual_spouse = row['spouse']
+        return [SlotSet('spouse', actual_spouse 
+            if actual_spouse is not None and actual_spouse is not "" 
             else "not known")]
         
         
 class ActionUtterNumSpouses(Action):
     def name(self):
-        return 'action_utter_num_spouses'
+        return 'action_utter_spouse'  # NOTE(review): class is still named ActionUtterNumSpouses - consider renaming
 
     def run(self, dispatcher, tracker, domain):
         person = tracker.get_slot('physicist')
-        num_spouses = tracker.get_slot('num_spouses')
-        dispatcher.utter_message("The Number of spouses of {} is {}.".format(person, num_spouses))
+        spouse = tracker.get_slot('spouse')  # set by action_search_spouse, "not known" when no value was found
+        dispatcher.utter_message("The spouse of {} is {}.".format(person, spouse))
         return []
 
 #
diff --git a/rasa/domain.yml b/rasa/domain.yml
index 2221181..dddc1af 100644
--- a/rasa/domain.yml
+++ b/rasa/domain.yml
@@ -6,7 +6,7 @@ intents:
   - day_of_death
   - place_of_death
   - is_alive
-  - num_spouses
+  - spouse
   - primary_education
   - university
   - area_of_research
@@ -26,8 +26,8 @@ actions:
   - action_utter_place_of_death
   - action_search_is_alive
   - action_utter_is_alive
-  - action_search_num_spouses
-  - action_utter_num_spouses
+  - action_search_spouse
+  - action_utter_spouse
   - action_search_primary_education
   - action_utter_primary_education
   - action_search_university
@@ -55,7 +55,7 @@ slots:
     type: unfeaturized
   is_alive:
     type: unfeaturized
-  num_spouses:
+  spouse:
     type: unfeaturized
   primary_education:
     type: unfeaturized
diff --git a/rasa/nlu.md b/rasa/nlu.md
index b7cfff8..3b9869d 100644
--- a/rasa/nlu.md
+++ b/rasa/nlu.md
@@ -28,10 +28,11 @@
 - is [Albert Einstein](physicist) still alive
 - is [Galileo Galilei](physicist) dead
 
-## intent:num_spouses
-- how often was [Albert Einstein](physicist) married
-- was [Galileo Galilei](physicist) married
-- how many spouses did have [Albert Einstein](physicist)
+## intent:spouse
+- to whom was [Albert Einstein](physicist) married
+- who was [Galileo Galilei](physicist) spouse
+- who was [Albert Einstein](physicist) wife
+- who was [Galileo Galilei](physicist) husband
 
 ## intent:primary_education
 - where did [Galileo Galilei](physicist) go to school
diff --git a/rasa/stories.md b/rasa/stories.md
index a6defb5..e624184 100644
--- a/rasa/stories.md
+++ b/rasa/stories.md
@@ -31,10 +31,10 @@
   - action_search_is_alive
   - action_utter_is_alive
 
-## num_spouses
-* num_spouses{"physicist": "albert einstein"}
-  - action_search_num_spouses
-  - action_utter_num_spouses
+## spouse
+* spouse{"physicist": "albert einstein"}
+  - action_search_spouse
+  - action_utter_spouse
 
 ## primary_education
 * primary_education{"physicist": "albert einstein"}
-- 
GitLab