Skip to content
Snippets Groups Projects
Commit 6d48bdb1 authored by Lukas Gehrke's avatar Lukas Gehrke
Browse files

Merge branch '51-intend-make-spouses-intend-get-spouse-instead-of-number-of-spouses' into 'master'

Resolve "Intend: Make spouses intend get spouse instead of number of spouses"

Closes #51

See merge request !56
parents 49451f87 dd64253d
No related branches found
No related tags found
1 merge request!56Resolve "Intend: Make spouses intend get spouse instead of number of spouses"
...@@ -31,7 +31,7 @@ results <- pbapply(articles, 1, function(article) { ...@@ -31,7 +31,7 @@ results <- pbapply(articles, 1, function(article) {
## Data cleaning ## Data cleaning
cleaned_text <- wikiproc::clean_html(article[4]) cleaned.text <- clean_html(article[4])
## Data preprocessing/annotating ## Data preprocessing/annotating
...@@ -39,24 +39,18 @@ results <- pbapply(articles, 1, function(article) { ...@@ -39,24 +39,18 @@ results <- pbapply(articles, 1, function(article) {
## Extract information from Text ## Extract information from Text
no.spouses <- get_no_of_spouses(article[4]) spouse_found <- get_spouse(article[4], annotation)
awards <- get_awards(annotation) awards <- get_awards(annotation)
# NOTE: Passes annotation though the changes needing this are not yet merged
# bplace <- get_birthplace(cleaned_text)
# bdate <- get_birthdate(cleaned_text)
## Create Results ## Create Results
data.frame(name = article[1], data.frame(Name = article[1],
spouse = spouse_found,
birthplace = NA, birthplace = NA,
birthdate = NA, birthdate = NA,
day_of_death = NA, day_of_death = NA,
place_of_death = NA, place_of_death = NA,
is_alive = NA, is_alive = NA,
num_spouses = no.spouses,
primary_education = NA, primary_education = NA,
university = NA, university = NA,
area_of_research = NA, area_of_research = NA,
......
...@@ -6,7 +6,7 @@ export(get_awards) ...@@ -6,7 +6,7 @@ export(get_awards)
export(get_birthdate) export(get_birthdate)
export(get_birthplace) export(get_birthplace)
export(get_data) export(get_data)
export(get_no_of_spouses) export(get_spouse)
export(init_nlp) export(init_nlp)
importFrom(data.table,"%like%") importFrom(data.table,"%like%")
importFrom(magrittr,"%>%") importFrom(magrittr,"%>%")
......
### GetNoOfSpouses.R
### This extracts the number of spouses from the infobox
### If no infobox or no information about spouses is found assumes there are none
### Not for use in production, this does not actually get information from text
# Author: David
#' Reads the number of spouses from the infobox of a Wikipedia article
#'
#' The article is only inspected when it contains the "vcard" marker, which
#' is taken as the sign that an infobox is present. The infobox rows whose
#' description matches "Spouse" are read, parenthesised remarks are removed
#' and the remaining non-empty lines are counted.
#'
#' @param article Wikipedia article in html format
#'
#' @return Integer indicating the number of spouses; `0L` if the article has
#'   no infobox or the infobox lists no spouse
#' @export
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' no.spouses <- get_no_of_spouses(articles$Text[54])
#'
#' no.spouses
#' }
get_no_of_spouses <- function(article) {
  # If there is no infobox we assume there were no spouses
  if (!grepl("vcard", article)) {
    return(0L)
  }

  infoBox <- get_infobox(article)

  # Get the spouse field
  spouses <- infoBox[infoBox$Desc %like% "Spouse", ]$Content

  # Remove everything in parentheses
  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)

  # Split the strings by newlines to get one spouse per line
  spouses <- unlist(strsplit(spouses, "\n"))

  # Blank lines left over from the split are not spouses
  spouses <- spouses[nzchar(trimws(spouses))]

  # length() already yields 0 for an empty result, so no special case is
  # needed; returning it as the last expression makes the value visible
  length(spouses)
}
### get_spouse.R
### This extracts the spouse of the physicist entity
# Author: David
### Test case to remember: Louis Walter Alvarez
### TODO: Maybe match pronouns?
#' Reads the spouse of a physicist from text if possible
#'
#' Runs a fixed, ordered list of "marry" patterns against the annotation and
#' returns the first spouse that \code{match_spouse} accepts. Patterns with
#' explicit part-of-speech tags come first, the untagged fallbacks last.
#'
#' @param physicist Text identifying the physicist. It is handed unchanged to
#'   \code{match_spouse}, which compares the subject of a matched sentence
#'   against it by partial edit distance.
#' @param annotation Annotation object
#'
#' @return Name of the spouse of the physicist, or NA if no pattern matched
#' @export
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' spouses <- get_spouse(articles$Text[54], annotation)
#'
#' spouses
#' }
get_spouse <- function(physicist, annotation){
  # Person entities are used by match_spouse to validate a candidate spouse
  entities <- cleanNLP::cnlp_get_entity(annotation)
  entities <- entities[entities$entity_type == "PERSON", ]$entity

  # Build parameter objects
  # The four vectors/lists below are parallel: element i of each describes
  # pattern i (tokens, POS tags, physicist word range, spouse word range)
  words <- c("* marry * *",
             "* marry * *",
             "* * marry * *",
             "* marry * *",
             "* * marry * *",
             "* marry * * *",
             "* * marry * * *")
  tags <- c("PROPN * PROPN PROPN",
            "NOUN * PROPN PROPN",
            "PROPN PROPN * PROPN PROPN",
            "* * * *",
            "* * * * *",
            "* * * * *",
            "* * * * * *")
  physicist.word.positions <- list(c(1, 1),
                                   c(1, 1),
                                   c(1, 2),
                                   c(1, 1),
                                   c(1, 2),
                                   c(1, 1),
                                   c(1, 2))
  spouse.word.positions <- list(c(3, 4),
                                c(3, 4),
                                c(4, 5),
                                c(3, 4),
                                c(4, 5),
                                c(4, 6),
                                c(5, 7))

  # Apply the matchings to our data, in order
  # First off we try matching exact patterns, as this will yield the best
  # precision. While names are proper nouns, a single name before a verb
  # (especially at the start of a sentence) is not always tagged correctly,
  # so we check for NOUN too. Two words before the verb should be recognized
  # as PROPN, so we don't bother checking for NOUN there.
  # If we don't find anything this way, we try to match without specific
  # POS tags.
  # The first pattern that yields a spouse wins, so later patterns are only
  # evaluated when the earlier ones found nothing.
  for (i in seq_along(words)) {
    spouse <- match_spouse(words[i], tags[i], physicist,
                           physicist.word.positions[[i]],
                           spouse.word.positions[[i]],
                           annotation, entities)
    if (length(spouse) > 0 && !is.na(spouse[1])) {
      # We got a match
      return(spouse[1])
    }
  }

  NA
}
#' Internal function to match and extract a spouse from text
#'
#' Matches one token/tag pattern against the annotation, checks that the
#' subject of the first match is the physicist we are looking for and that
#' the extracted spouse occurs among the known person entities.
#'
#' @param words Tokens to match for, as one space-separated string
#' @param tags **Optional** POS tags to match for, as one space-separated
#'   string; defaults to one wildcard per token in \code{words}
#' @param physicist Name of the physicist for whose spouse we're looking
#' @param physicist.position Where to extract the physicist entity from
#'   (first and last word index within the match)
#' @param spouse.position Where to extract the spouse entity from
#'   (first and last word index within the match)
#' @param annotation Annotation object
#' @param people List to match found spouse against
#'
#' @return Spouse if found NA if not
#' @examples
#' \dontrun{
#' # Bad example
#' spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron",
#'                        c(1, 1), c(3, 4), annotation, entities)
#'
#' spouse
#' }
match_spouse <- function(words, tags, physicist, physicist.position, spouse.position, annotation, people) {
  # If there is no tags parameter we need to create the wildcard string
  # ourselves, in the same space-separated form the callers use
  # Will be fixed upstream in match_pattern so we won't need to
  if (missing(tags)) {
    n.tokens <- stringr::str_count(words, stringr::fixed(" ")) + 1
    tags <- paste(rep("*", n.tokens), collapse = " ")
  }

  # Match pattern
  matched <- match_pattern(annotation, words, tags, ignore.case = TRUE, use.stems = TRUE, ignore.punct = TRUE)

  # TODO: Multiple result handling is broken right now, will fix soon
  # Until then only the first match is considered. The length is checked
  # before is.na() so that a multi-element result never reaches `&&`/`if`.
  if (length(matched) == 0) {
    return(NA)
  }
  matched <- matched[1]
  if (is.na(matched)) {
    return(NA)
  }

  # We check if the physicist in the matched sentence is the one we're
  # looking for. adist() returns a matrix, hence the [1]; isTRUE() turns an
  # NA distance (e.g. the match has too few words) into "no match".
  subject <- stringr::word(matched, physicist.position[1], physicist.position[2])
  if (!isTRUE(adist(subject, physicist, partial = TRUE)[1] <= 3)) {
    return(NA)
  }

  candidate <- stringr::word(matched, spouse.position[1], spouse.position[2])
  if (is.na(candidate) || !nzchar(candidate)) {
    return(NA)
  }

  # Checking if the result is actually a person: case-insensitive substring
  # match against the known entities. The candidate is compared literally
  # (fixed = TRUE) so characters such as "." or "(" in a name are not
  # interpreted as a regular expression.
  if (any(grepl(tolower(candidate), tolower(people), fixed = TRUE))) {
    return(tools::toTitleCase(candidate))
  }

  NA
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_no_of_spouses.R
\name{get_no_of_spouses}
\alias{get_no_of_spouses}
\title{Reads the number of spouses from the infobox of an wikipedia article}
\usage{
get_no_of_spouses(article)
}
\arguments{
\item{article}{Wikipedia article in html format}
}
\value{
Integer indicating the number of spouses
}
\description{
Reads the number of spouses from the infobox of an wikipedia article
}
\examples{
\dontrun{
articles <- get_data()
no.spouses <- get_no_of_spouses(articles$Text[54])
no.spouses
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_spouse.R
\name{get_spouse}
\alias{get_spouse}
\title{Reads the spouse of an physicist from text if possible}
\usage{
get_spouse(physicist, annotation)
}
\arguments{
\item{annotation}{Annotation object}
\item{article}{Cleaned article}
}
\value{
Name of the spouse of the
}
\description{
Reads the spouse of an physicist from text if possible
}
\examples{
\dontrun{
articles <- get_data()
spouses <- get_spouse(articles$Text[54], annotation)
spouses
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_spouse.R
\name{match_spouse}
\alias{match_spouse}
\title{Interna function to match and extract a spouse from text}
\usage{
match_spouse(words, tags, physicist, physicist.position, spouse.position,
annotation, people)
}
\arguments{
\item{words}{Tokens to match for}
\item{tags}{**Optional** POS tags to match for}
\item{physicist}{Name of the physicist for whos spouse we're looking}
\item{physicist.position}{Where to extract the physicist entity from}
\item{spouse.position}{Where to extract the spouse entity from}
\item{annotation}{Anootation object}
\item{people}{List to match found spouse against}
}
\value{
Spouse if found NA if not
}
\description{
Interna function to match and extract a spouse from text
}
\examples{
\dontrun{
# Bad example
spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron", c(1, 1), c(3, 4), entities)
spouses
}
}
File added
...@@ -192,34 +192,34 @@ class ActionUtterIsAlive(Action): ...@@ -192,34 +192,34 @@ class ActionUtterIsAlive(Action):
# #
# num spouses # num spouses
# #
class ActionSearchNumSpouses(Action): class ActionSearchSpouse(Action):
def name(self): def name(self):
return 'action_search_num_spouses' return 'action_search_spouse'
def run(self, dispatcher, tracker, domain): def run(self, dispatcher, tracker, domain):
import csv import csv
import re import re
person = tracker.get_slot('physicist') person = tracker.get_slot('physicist')
name_regex = re.compile(person, re.IGNORECASE) name_regex = re.compile(person, re.IGNORECASE)
actual_num_spouses = None actual_spouse= None
with open('data.tsv') as csvfile: with open('data.tsv') as csvfile:
spamreader = csv.DictReader(csvfile, delimiter='\t') spamreader = csv.DictReader(csvfile, delimiter='\t')
for row in spamreader: for row in spamreader:
if name_regex.match(row['name']): if name_regex.match(row['name']):
actual_num_spouses = row['num_spouses'] actual_spouse = row['spouse']
return [SlotSet('num_spouses', actual_num_spouses return [SlotSet('spouse', actual_spouse
if actual_num_spouses is not None and actual_num_spouses is not "" if actual_spouse is not None and actual_spouse is not ""
else "not known")] else "not known")]
class ActionUtterNumSpouses(Action): class ActionUtterNumSpouses(Action):
def name(self): def name(self):
return 'action_utter_num_spouses' return 'action_utter_spouse'
def run(self, dispatcher, tracker, domain): def run(self, dispatcher, tracker, domain):
person = tracker.get_slot('physicist') person = tracker.get_slot('physicist')
num_spouses = tracker.get_slot('num_spouses') spouse = tracker.get_slot('spouse')
dispatcher.utter_message("The Number of spouses of {} is {}.".format(person, num_spouses)) dispatcher.utter_message("The spouse of {} is {}.".format(person, spouse))
return [] return []
# #
......
...@@ -6,7 +6,7 @@ intents: ...@@ -6,7 +6,7 @@ intents:
- day_of_death - day_of_death
- place_of_death - place_of_death
- is_alive - is_alive
- num_spouses - spouse
- primary_education - primary_education
- university - university
- area_of_research - area_of_research
...@@ -26,8 +26,8 @@ actions: ...@@ -26,8 +26,8 @@ actions:
- action_utter_place_of_death - action_utter_place_of_death
- action_search_is_alive - action_search_is_alive
- action_utter_is_alive - action_utter_is_alive
- action_search_num_spouses - action_search_spouse
- action_utter_num_spouses - action_utter_spouse
- action_search_primary_education - action_search_primary_education
- action_utter_primary_education - action_utter_primary_education
- action_search_university - action_search_university
...@@ -55,7 +55,7 @@ slots: ...@@ -55,7 +55,7 @@ slots:
type: unfeaturized type: unfeaturized
is_alive: is_alive:
type: unfeaturized type: unfeaturized
num_spouses: spouse:
type: unfeaturized type: unfeaturized
primary_education: primary_education:
type: unfeaturized type: unfeaturized
......
...@@ -28,10 +28,11 @@ ...@@ -28,10 +28,11 @@
- is [Albert Einstein](physicist) still alive - is [Albert Einstein](physicist) still alive
- is [Galileo Galilei](physicist) dead - is [Galileo Galilei](physicist) dead
## intent:num_spouses ## intent:spouse
- how often was [Albert Einstein](physicist) married - to whom was [Albert Einstein](physicist) married
- was [Galileo Galilei](physicist) married - who was [Galileo Galilei](physicist) spouse
- how many spouses did have [Albert Einstein](physicist) - who was [Albert Einstein](physicist) wife
- who was [Galileo Galilei](physicist) husband
## intent:primary_education ## intent:primary_education
- where did [Galileo Galilei](physicist) go to school - where did [Galileo Galilei](physicist) go to school
......
...@@ -31,10 +31,10 @@ ...@@ -31,10 +31,10 @@
- action_search_is_alive - action_search_is_alive
- action_utter_is_alive - action_utter_is_alive
## num_spouses ## spouse
* num_spouses{"physicist": "albert einstein"} * spouse{"physicist": "albert einstein"}
- action_search_num_spouses - action_search_spouse
- action_utter_num_spouses - action_utter_spouse
## primary_education ## primary_education
* primary_education{"physicist": "albert einstein"} * primary_education{"physicist": "albert einstein"}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment