Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
### get_spouse.R
### This extracts the spouse of the physicist entity
# Author: David
### Test case to remember: Louis Walter Alvarez
### TODO: Maybe match pronouns?
#' Reads the spouse of an physicist from text if possible
#'
#' @param article Cleaned article
#' @param annotation Annotation object
#'
#' @return Name of the spouse of the
#' @export
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' spouses <- get_spouse(articles$Text[54], annotation)
#'
#' spouses
#' }
get_spouse <- function(physicist, annotation){
# We need this later
entities <- cleanNLP::cnlp_get_entity(annotation)
entities <- entities[entities$entity_type == "PERSON", ]$entity
# Build parameter objects
words <- c("* marry * *",
"* marry * *",
"* * marry * *",
"* marry * *",
"* * marry * *",
"* marry * * *",
"* * marry * * *")
tags <- c("PROPN * PROPN PROPN",
"NOUN * PROPN PROPN",
"PROPN PROPN * PROPN PROPN",
"* * * *",
"* * * * *",
"* * * * *",
"* * * * * *")
physicist.word.positions <- list(c(1, 1),
c(1, 1),
c(1, 2),
c(1, 1),
c(1, 2),
c(1, 1),
c(1, 2))
spouse.word.positions <- list(c(3, 4),
c(3, 4),
c(4, 5),
c(3, 4),
c(4, 5),
c(4, 6),
c(5, 7))
# Apply all the matchings to our data
# First off we try matching exact patterns, as this will yield the best precision
# While names are proper nouns sometimes if there is only one name before a verb (and especially at the start of a sentence)
# this does not allways get tagged correct, so we check for noun too
# Two words before should be recognized as propn so we won't bother checking for NOUN
# If we don't find anything this way, we try and match without specific pos tags
results <- sapply(seq_along(words), function(x) {
spouse <- match_spouse(words[x], tags[x], physicist, physicist.word.positions[[x]],
spouse.word.positions[[x]], annotation, entities)
})
if(!all(is.na(results))) {
# We got a match
return(na.omit(results)[1])
}
return(NA)
}
#' Interna function to match and extract a spouse from text
#'
#' @param words Tokens to match for
#' @param tags **Optional** POS tags to match for
#' @param physicist Name of the physicist for whos spouse we're looking
#' @param physicist.position Where to extract the physicist entity from
#' @param spouse.position Where to extract the spouse entity from
#' @param annotation Anootation object
#' @param people List to match found spouse against
#'
#' @return Spouse if found NA if not
#' @examples
#' \dontrun{
#' # Bad example
#' spouse <- match_spouse("* marry * *", "* * * *", "Jules Aaron", c(1, 1), c(3, 4), entities)
#'
#' spouses
#' }
match_spouse <- function(words, tags, physicist, physicist.position, spouse.position, annotation, people) {
# If there is no tags parameter we need to create the vector ourselves
# Will be fixed upstream in match_pattern so we won't need to
if (missing(tags)) {
tags <- paste(rep("*", stringr::str_count(words, stringr::fixed(" ")) + 1))
}
# Match pattern
spouse <- match_pattern(annotation, words, tags, ignore.case = TRUE, use.stems = TRUE, ignore.punct = TRUE)
# TODO: Multiple result handling is broken right now, will fix soon
if (!is.na(spouse) && length(spouse) > 1) {
spouse <- spouse[1]
}
# We check if the physicist in the matched sentence is the one we're looking for
if (!is.na(spouse) &&
adist(stringr::word(spouse, physicist.position[1], physicist.position[2]), physicist, partial = TRUE) <= 3) {
spouse <- stringr::word(spouse, spouse.position[1], spouse.position[2])
# Checking if the result is actualy a person.
# This is ugly. Possibly better way:
# tolower(spouse) %in% tolower(people)
if (!is.na(any(grepl(spouse, people, ignore.case = TRUE, fixed = TRUE))) &&
any(grepl(spouse, people, ignore.case = TRUE))) {
return(tools::toTitleCase(spouse))
}
}
return(NA)
}