From dd64253d00a224eed3e6926df99694e7a11ad319 Mon Sep 17 00:00:00 2001 From: David Fuhry <df15nocu@studserv.uni-leipzig.de> Date: Sat, 26 Jan 2019 16:28:27 +0100 Subject: [PATCH] Resolve "Intend: Make spouses intend get spouse instead of number of spouses" --- processing/script/master.R | 14 +- processing/wikiproc/NAMESPACE | 2 +- processing/wikiproc/R/get_no_of_spouses.R | 44 ------ processing/wikiproc/R/get_spouse.R | 133 +++++++++++++++++++ processing/wikiproc/man/get_no_of_spouses.Rd | 26 ---- processing/wikiproc/man/get_spouse.Rd | 28 ++++ processing/wikiproc/man/match_spouse.Rd | 38 ++++++ processing/wikiproc/tests/.DS_Store | Bin 0 -> 6148 bytes rasa/actions.py | 18 +-- rasa/domain.yml | 8 +- rasa/nlu.md | 9 +- rasa/stories.md | 8 +- 12 files changed, 226 insertions(+), 102 deletions(-) delete mode 100755 processing/wikiproc/R/get_no_of_spouses.R create mode 100755 processing/wikiproc/R/get_spouse.R delete mode 100644 processing/wikiproc/man/get_no_of_spouses.Rd create mode 100644 processing/wikiproc/man/get_spouse.Rd create mode 100644 processing/wikiproc/man/match_spouse.Rd create mode 100644 processing/wikiproc/tests/.DS_Store diff --git a/processing/script/master.R b/processing/script/master.R index 202ebd2..a70ddf8 100755 --- a/processing/script/master.R +++ b/processing/script/master.R @@ -31,7 +31,7 @@ results <- pbapply(articles, 1, function(article) { ## Data cleaning - cleaned_text <- wikiproc::clean_html(article[4]) + cleaned.text <- clean_html(article[4]) ## Data preprocessing/annotating @@ -39,24 +39,18 @@ results <- pbapply(articles, 1, function(article) { ## Extract information from Text - no.spouses <- get_no_of_spouses(article[4]) + spouse_found <- get_spouse(article[4], annotation) awards <- get_awards(annotation) - - # NOTE: Passes annotation though the changes needing this are not yet merged - # bplace <- get_birthplace(cleaned_text) - - # bdate <- get_birthdate(cleaned_text) - ## Create Results - data.frame(name = article[1], + data.frame(Name = article[1], + spouse = spouse_found, birthplace = NA, birthdate = NA, day_of_death = NA, place_of_death = NA, is_alive = NA, - num_spouses = no.spouses, primary_education = NA, university = NA, area_of_research = NA, diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE index 21f89c5..007f345 100644 --- a/processing/wikiproc/NAMESPACE +++ b/processing/wikiproc/NAMESPACE @@ -6,7 +6,7 @@ export(get_awards) export(get_birthdate) export(get_birthplace) export(get_data) -export(get_no_of_spouses) +export(get_spouse) export(init_nlp) importFrom(data.table,"%like%") importFrom(magrittr,"%>%") diff --git a/processing/wikiproc/R/get_no_of_spouses.R b/processing/wikiproc/R/get_no_of_spouses.R deleted file mode 100755 index 5b688b2..0000000 --- a/processing/wikiproc/R/get_no_of_spouses.R +++ /dev/null @@ -1,44 +0,0 @@ -### GetNoOfSpouses.R -### This extracts the number of spouses from the infobox -### If no infobox or no information about spouses is found assumes there are none -### Not for use in production, this does not actually get information from text - -# Author: David - -#' Reads the number of spouses from the infobox of an wikipedia article -#' -#' @param article Wikipedia article in html format -#' -#' @return Integer indicating the number of spouses -#' @export -#' -#' @examples -#' \dontrun{ -#' articles <- get_data() -#' -#' no.spouses <- get_no_of_spouses(articles$Text[54]) -#' -#' no.spouses -#' } -get_no_of_spouses <- function(article) { - - # If there is no infobox we assume there were no spouses - if(!grepl("vcard", article)) { - return(0) - } - - infoBox <- get_infobox(article) - - # Get the spouse field - spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content - # Remove everything in parentheses - spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses) - # Split the strings by newlines to get one spouse per line - spouses <- strsplit(spouses, "\n") - spouses <- unlist(spouses) - if(length(spouses) > 0) { - no.spouses <- length(spouses) - } else { - no.spouses <- 0 - } -} diff --git a/processing/wikiproc/R/get_spouse.R b/processing/wikiproc/R/get_spouse.R new file mode 100755 index 0000000..ff6c1cf --- /dev/null +++ b/processing/wikiproc/R/get_spouse.R @@ -0,0 +1,133 @@ +### get_spouse.R +### This extracts the spouse of the physicist entity + +# Author: David + + +### Test case to remember: Louis Walter Alvarez +### TODO: Maybe match pronouns? + + +#' Reads the spouse of an physicist from text if possible +#' +#' @param article Cleaned article +#' @param annotation Annotation object +#' +#' @return Name of the spouse of the +#' @export +#' +#' @examples +#' \dontrun{ +#' articles <- get_data() +#' +#' spouses <- get_spouse(articles$Text[54], annotation) +#' +#' spouses +#' } +get_spouse <- function(physicist, annotation){ + + # We need this later + entities <- cleanNLP::cnlp_get_entity(annotation) + entities <- entities[entities$entity_type == "PERSON", ]$entity + + # Build parameter objects + words <- c("* marry * *", + "* marry * *", + "* * marry * *", + "* marry * *", + "* * marry * *", + "* marry * * *", + "* * marry * * *") + tags <- c("PROPN * PROPN PROPN", + "NOUN * PROPN PROPN", + "PROPN PROPN * PROPN PROPN", + "* * * *", + "* * * * *", + "* * * * *", + "* * * * * *") + physicist.word.positions <- list(c(1, 1), + c(1, 1), + c(1, 2), + c(1, 1), + c(1, 2), + c(1, 1), + c(1, 2)) + spouse.word.positions <- list(c(3, 4), + c(3, 4), + c(4, 5), + c(3, 4), + c(4, 5), + c(4, 6), + c(5, 7)) + + # Apply all the matchings to our data + # First off we try matching exact patterns, as this will yield the best precision + # While names are proper nouns sometimes if there is only one name before a verb (and especially at the start of a sentence) + # this does not allways get tagged correct, so we check for noun too + # Two words before should be recognized as propn so we won't bother checking for NOUN + # If we don't find anything this way, we try and match without specific pos tags + + results <- sapply(seq_along(words), function(x) { + spouse <- match_spouse(words[x], tags[x], physicist, physicist.word.positions[[x]], + spouse.word.positions[[x]], annotation, entities) + }) + + if(!all(is.na(results))) { + # We got a match + return(na.omit(results)[1]) + } + + + + return(NA) +} + +#' Interna function to match and extract a spouse from text +#' +#' @param words Tokens to match for +#' @param tags **Optional** POS tags to match for +#' @param physicist Name of the physicist for whos spouse we're looking +#' @param physicist.position Where to extract the physicist entity from +#' @param spouse.position Where to extract the spouse entity from +#' @param annotation Anootation object +#' @param people List to match found spouse against +#' +#' @return Spouse if found NA if not +#' @examples +#' \dontrun{ +#' # Bad example +#' spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron", c(1, 1), c(3, 4), entities) +#' +#' spouses +#' } +match_spouse <- function(words, tags, physicist, physicist.position, spouse.position, annotation, people) { + # If there is no tags parameter we need to create the vector ourselves + # Will be fixed upstream in match_pattern so we won't need to + if (missing(tags)) { + tags <- paste(rep("*", stringr::str_count(words, stringr::fixed(" ")) + 1)) + } + + # Match pattern + spouse <- match_pattern(annotation, words, tags, ignore.case = TRUE, use.stems = TRUE, ignore.punct = TRUE) + # TODO: Multiple result handling is broken right now, will fix soon + if (!is.na(spouse) && length(spouse) > 1) { + spouse <- spouse[1] + } + + # We check if the physicist in the matched sentence is the one we're looking for + if (!is.na(spouse) && + adist(stringr::word(spouse, physicist.position[1], physicist.position[2]), physicist, partial = TRUE) <= 3) { + spouse <- stringr::word(spouse, spouse.position[1], spouse.position[2]) + + # Checking if the result is actualy a person. + # This is ugly. Possibly better way: + # tolower(spouse) %in% tolower(people) + if (!is.na(any(grepl(spouse, people, ignore.case = TRUE, fixed = TRUE))) && + any(grepl(spouse, people, ignore.case = TRUE))) { + return(tools::toTitleCase(spouse)) + } + } + return(NA) + +} + diff --git a/processing/wikiproc/man/get_no_of_spouses.Rd b/processing/wikiproc/man/get_no_of_spouses.Rd deleted file mode 100644 index bd8a78e..0000000 --- a/processing/wikiproc/man/get_no_of_spouses.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_no_of_spouses.R -\name{get_no_of_spouses} -\alias{get_no_of_spouses} -\title{Reads the number of spouses from the infobox of an wikipedia article} -\usage{ -get_no_of_spouses(article) -} -\arguments{ -\item{article}{Wikipedia article in html format} -} -\value{ -Integer indicating the number of spouses -} -\description{ -Reads the number of spouses from the infobox of an wikipedia article -} -\examples{ -\dontrun{ -articles <- get_data() - -no.spouses <- get_no_of_spouses(articles$Text[54]) - -no.spouses -} -} diff --git a/processing/wikiproc/man/get_spouse.Rd b/processing/wikiproc/man/get_spouse.Rd new file mode 100644 index 0000000..2d32df2 --- /dev/null +++ b/processing/wikiproc/man/get_spouse.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_spouse.R +\name{get_spouse} +\alias{get_spouse} +\title{Reads the spouse of an physicist from text if possible} +\usage{ +get_spouse(physicist, annotation) +} +\arguments{ +\item{annotation}{Annotation object} + +\item{article}{Cleaned article} +} +\value{ +Name of the spouse of the +} +\description{ +Reads the spouse of an physicist from text if possible +} +\examples{ +\dontrun{ +articles <- get_data() + +spouses <- get_spouse(articles$Text[54], annotation) + +spouses +} +} diff --git a/processing/wikiproc/man/match_spouse.Rd b/processing/wikiproc/man/match_spouse.Rd new file mode 100644 index 0000000..1c173bf --- /dev/null +++ b/processing/wikiproc/man/match_spouse.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_spouse.R +\name{match_spouse} +\alias{match_spouse} +\title{Interna function to match and extract a spouse from text} +\usage{ +match_spouse(words, tags, physicist, physicist.position, spouse.position, + annotation, people) +} +\arguments{ +\item{words}{Tokens to match for} + +\item{tags}{**Optional** POS tags to match for} + +\item{physicist}{Name of the physicist for whos spouse we're looking} + +\item{physicist.position}{Where to extract the physicist entity from} + +\item{spouse.position}{Where to extract the spouse entity from} + +\item{annotation}{Anootation object} + +\item{people}{List to match found spouse against} +} +\value{ +Spouse if found NA if not +} +\description{ +Interna function to match and extract a spouse from text +} +\examples{ +\dontrun{ +# Bad example +spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron", c(1, 1), c(3, 4), entities) + +spouses +} +} diff --git a/processing/wikiproc/tests/.DS_Store b/processing/wikiproc/tests/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..07c0a46c0bda99e479d4022d7b93670751f0e29a GIT binary patch literal 6148 zcmeHK%}T>S5Z-O8ZYp9Af<5lVTMrE_Ru4j`_25m2=s~4TXt9BsNRt{gR`MG9Mm~YB z<IL{1SgIEfB4q|<zuEcOWxs^|VT|$iq;E4;VT=h-#DWFQH-d50B`FvWB3EN1Y{)#8 z_-TB<lueHR$N+vj!4fuLF|=R4pFfIxh`sywqaaS|^$%Vsme$rcgeZ&hR{6#qMyZ>4 z$s}%j<16f)dCAZ>_IVVI=C<cw1cS6&**T8F#0$c~SQQ8T0i;}C24O!++tDQK$7&o` z2ZXSM)vZ*sta)J9WWzq3)@0Ud)oZfZ*q=@<vAbs<o%Ze?pR(ua%d2LW!;h$B+u#h| zz*t)7gF6mKQFsfnS=}t3kQg8ah=CPmz-)P9bA{JPOCtt|f!{HJ`-22UbPVPi)z$$G zULP@DLPP-@-x7$@pkpxC2oVsjO96E$H%|<%%fT;Ao?|fAsLL5wGs8G$W^P_6T+I%C zsnQvDG*U|p5Ce-06m_?P=l{vi@BhUj>JbCPz)CT|OC7h<h9#M^b!l;U)=JPGC<^A~ m8s{ZosG}Hi@hGl>DgnPl1JE&;YXlDnT?7;j)DQ!I%D^X>u~Uoy literal 0 HcmV?d00001 diff --git a/rasa/actions.py b/rasa/actions.py index af2b480..0ca0175 100644 --- a/rasa/actions.py +++ b/rasa/actions.py @@ -192,34 +192,34 @@ class ActionUtterIsAlive(Action): # # num spouses # -class ActionSearchNumSpouses(Action): +class ActionSearchSpouse(Action): def name(self): - return 'action_search_num_spouses' + return 'action_search_spouse' def run(self, dispatcher, tracker, domain): import csv import re person = tracker.get_slot('physicist') name_regex = re.compile(person, re.IGNORECASE) - actual_num_spouses = None + actual_spouse= None with open('data.tsv') as csvfile: spamreader = csv.DictReader(csvfile, delimiter='\t') for row in spamreader: if name_regex.match(row['name']): - actual_num_spouses = row['num_spouses'] - return [SlotSet('num_spouses', actual_num_spouses - if actual_num_spouses is not None and actual_num_spouses is not "" + actual_spouse = row['spouse'] + return [SlotSet('spouse', actual_spouse + if actual_spouse is not None and actual_spouse is not "" else "not known")] class ActionUtterNumSpouses(Action): def name(self): - return 'action_utter_num_spouses' + return 'action_utter_spouse' def run(self, dispatcher, tracker, domain): person = tracker.get_slot('physicist') - num_spouses = tracker.get_slot('num_spouses') - dispatcher.utter_message("The Number of spouses of {} is {}.".format(person, num_spouses)) + spouse = tracker.get_slot('spouse') + dispatcher.utter_message("The spouse of {} is {}.".format(person, spouse)) return [] # diff --git a/rasa/domain.yml b/rasa/domain.yml index 2221181..dddc1af 100644 --- a/rasa/domain.yml +++ b/rasa/domain.yml @@ -6,7 +6,7 @@ intents: - day_of_death - place_of_death - is_alive - - num_spouses + - spouse - primary_education - university - area_of_research @@ -26,8 +26,8 @@ actions: - action_utter_place_of_death - action_search_is_alive - action_utter_is_alive - - action_search_num_spouses - - action_utter_num_spouses + - action_search_spouse + - action_utter_spouse - action_search_primary_education - action_utter_primary_education - action_search_university @@ -55,7 +55,7 @@ slots: type: unfeaturized is_alive: type: unfeaturized - num_spouses: + spouse: type: unfeaturized primary_education: type: unfeaturized diff --git a/rasa/nlu.md b/rasa/nlu.md index b7cfff8..3b9869d 100644 --- a/rasa/nlu.md +++ b/rasa/nlu.md @@ -28,10 +28,11 @@ - is [Albert Einstein](physicist) still alive - is [Galileo Galilei](physicist) dead -## intent:num_spouses -- how often was [Albert Einstein](physicist) married -- was [Galileo Galilei](physicist) married -- how many spouses did have [Albert Einstein](physicist) +## intent:spouse +- to whom was [Albert Einstein](physicist) married +- who was [Galileo Galilei](physicist) spouse +- who was [Albert Einstein](physicist) wife +- who was [Galileo Galilei](physicist) husband ## intent:primary_education - where did [Galileo Galilei](physicist) go to school diff --git a/rasa/stories.md b/rasa/stories.md index a6defb5..e624184 100644 --- a/rasa/stories.md +++ b/rasa/stories.md @@ -31,10 +31,10 @@ - action_search_is_alive - action_utter_is_alive -## num_spouses -* num_spouses{"physicist": "albert einstein"} - - action_search_num_spouses - - action_utter_num_spouses +## spouse +* spouse{"physicist": "albert einstein"} + - action_search_spouse + - action_utter_spouse ## primary_education * primary_education{"physicist": "albert einstein"} -- GitLab