Skip to content
Snippets Groups Projects
Commit 4840c73b authored by David Fuhry's avatar David Fuhry
Browse files

Implementet pattern matching function

parent 9945b3bb
No related branches found
No related tags found
1 merge request!32Resolve "Create pattern matching function"
......@@ -41,3 +41,142 @@ get_infobox <- function(article) {
return(table)
}
### TODO:
### * use.stems also makes the function return stemmed words
### * performance?
### * Testing (!!)
### * NER matching
#' Matches the given pattern to the provided annotations object
#'
#' @param annottions The annotations object genereted by cleanNLP
#' @param text The text tags to match, needs to be of the same length as pos.
#' May be supplied as single string or as vector of strings. Wildcards are supported.
#' @param pos The part of speech tags to match, needs to be of the same length as text.
#' May be supplied as single string or as vector of strings. Wildcards are supported.
#' @param use.stems Wheather to use stemmed words for the matching.
#' Beware that the stem for pronouns is _always_ "-PRON-".
#' Will currently also return stemmed words.
#' @param ignore.case Wheater to ignore case during pattern matching
#' @param ignore.punct Wheather to ignore punctuation for the pattern matching.
#' Note that this refers to the _PUNCT_ pos tag, so it will also ignore any parantheses etc.
#'
#' @return Any matches found in a vector
#'
#' @examples
#' \dontrun{
#' # Find all proper nouns following an and
#'
#' # Both notations are working
#' text <- "and *"
#' pos <- c("*", "PROPN")
#'
#' # Prepare data
#' articles <- get_data()
#' annotations <- create_annotations(clean_html(articles[42,4]), articles[42,2], articles[42,3])
#'
#' results <- match_pattern(annotations, text, pos, ignore.punct = TRUE)
#' results
#' # [1] "and Frequency" "and Technology" "and Gravitation"
#' }
match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case = FALSE, ignore.punct = FALSE) {
# First we need to verify our input
# If we got a single vector we need to expand it
if (length(text) == 1) {
text <- unlist(strsplit(text, " ", fixed = TRUE))
}
if (length(pos) == 1) {
pos <- unlist(strsplit(pos, " ", fixed = TRUE))
}
# We need at the very least to tokens to define a pattern
if (length(text) <= 1 | length(pos) <= 1) {
stop("Error: Input needs to be contain at least two tokens")
}
# Check the length of our inputs matches
if (length(text) != length(pos)) {
stop("Error: Text and pos need to be of equal length")
}
# Let's build the vector we're going to be searching through
# TODO: This way of doing this is probably inefficent
combined <- NA
annot <- cleanNLP::cnlp_get_token(annotations)
# Remove spaces as they would mess up a lot down there
annot <- annot[annot$upos != "SPACE",]
# Remove punctuation if that is desired
if(ignore.punct) {
annot <- annot[annot$upos != "PUNCT",]
}
# Now to actually constructing the vector
if(use.stems) {
# Maybe want to change that seperator, however slashes would mess up regex construction even more
combined <- paste(annot$lemma, annot$upos, sep = "#")
} else {
combined <- paste(annot$word, annot$upos, sep = "#")
}
# We could also use something else to collapse, we would slightly need to modify the query
# but we could skip removing spaces. If that is worth anything?
# Also appending an extra space so we can match the last element
combined <- paste0(paste(combined, collapse = " "), " ")
# Now let us construct the regex
expr <- sapply(seq_along(text), function(x) {
# Create regex vector
re <- character()
# If we get a wildcard we match anything but spaces that comes before a #
# If not we match exactly the word we got
if(text[x] == "*") {
re <- "([^ ])+#"
} else {
re <- paste0(text[x], "#")
}
# Now we do a very similar thing for the pos tags
# Except we know they will only contain capital letters, always
if(pos[x] == "*") {
re <- paste0(re, "([A-Z])+ ")
} else {
re <- paste0(re, paste0(pos[x], " "))
}
})
# Stitch it all together
expr <- paste(expr, collapse = "")
matches <- gregexpr(expr, combined, ignore.case = ignore.case)
# If there was no match we may exit here
if (matches[[1]][1] == -1) {
# Explicit returns are bad, i know. But what to do here?
return(NA)
}
# What do we want to do with the result data? This currently extracts it,
# strips the POS tags of and then goes home happy
matches <- unlist(regmatches(combined, matches))
matches <- sapply(matches, function(match) {
match <- trimws(gsub("#([A-Z])+ ", " ", match), which = "right")
})
unname(matches)
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment