Skip to content
Snippets Groups Projects
Commit d31acde2 authored by Leonard Haas's avatar Leonard Haas
Browse files

Merge branch '33-create-pattern-matching-function' into 'master'

Resolve "Create pattern matching function"

Closes #33

See merge request !32
parents 71c6fe2b 58d5f2d2
No related branches found
No related tags found
1 merge request!32Resolve "Create pattern matching function"
......@@ -40,3 +40,142 @@ get_infobox <- function(article) {
return(table)
}
### TODO:
### * use.stems also makes the function return stemmed words
### * performance?
### * Testing (!!)
### * NER matching
#' Matches the given pattern to the provided annotations object
#'
#' @param annotations The annotations object genereted by cleanNLP
#' @param text The text tags to match, needs to be of the same length as pos.
#' May be supplied as single string or as vector of strings. Wildcards are supported.
#' @param pos The part of speech tags to match, needs to be of the same length as text.
#' May be supplied as single string or as vector of strings. Wildcards are supported.
#' @param use.stems Wheather to use stemmed words for the matching.
#' Beware that the stem for pronouns is _always_ "-PRON-".
#' Will currently also return stemmed words.
#' @param ignore.case Wheater to ignore case during pattern matching
#' @param ignore.punct Wheather to ignore punctuation for the pattern matching.
#' Note that this refers to the _PUNCT_ pos tag, so it will also ignore any parantheses etc.
#'
#' @return Any matches found in the given text
#'
#' @examples
#' \dontrun{
#' # Find all proper nouns following an and
#'
#' # Both notations are working
#' text <- "and *"
#' pos <- c("*", "PROPN")
#'
#' # Prepare data
#' articles <- get_data()
#' annotations <- create_annotations(clean_html(articles[42,4]), articles[42,2], articles[42,3])
#'
#' results <- match_pattern(annotations, text, pos, ignore.punct = TRUE)
#' results
#' # [1] "and Frequency" "and Technology" "and Gravitation"
#' }
match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case = FALSE, ignore.punct = FALSE) {
# First we need to verify our input
# If we got a single vector we need to expand it
if (length(text) == 1) {
text <- unlist(strsplit(text, " ", fixed = TRUE))
}
if (length(pos) == 1) {
pos <- unlist(strsplit(pos, " ", fixed = TRUE))
}
# We need at the very least to tokens to define a pattern
if (length(text) <= 1 | length(pos) <= 1) {
stop("Error: Input needs to be contain at least two tokens")
}
# Check the length of our inputs matches
if (length(text) != length(pos)) {
stop("Error: Text and pos need to be of equal length")
}
# Let's build the vector we're going to be searching through
# TODO: This way of doing this is probably inefficent
combined <- NA
annot <- cleanNLP::cnlp_get_token(annotations)
# Remove spaces as they would mess up a lot down there
annot <- annot[annot$upos != "SPACE",]
# Remove punctuation if that is desired
if(ignore.punct) {
annot <- annot[annot$upos != "PUNCT",]
}
# Now to actually constructing the vector
if(use.stems) {
# Maybe want to change that seperator, however slashes would mess up regex construction even more
combined <- paste(annot$lemma, annot$upos, sep = "#")
} else {
combined <- paste(annot$word, annot$upos, sep = "#")
}
# We could also use something else to collapse, we would slightly need to modify the query
# but we could skip removing spaces. If that is worth anything?
# Also appending an extra space so we can match the last element
combined <- paste0(paste(combined, collapse = " "), " ")
# Now let us construct the regex
expr <- sapply(seq_along(text), function(x) {
# Create regex vector
re <- character()
# If we get a wildcard we match anything but spaces that comes before a #
# If not we match exactly the word we got
if(text[x] == "*") {
re <- "([^ ])+#"
} else {
re <- paste0(text[x], "#")
}
# Now we do a very similar thing for the pos tags
# Except we know they will only contain capital letters, always
if(pos[x] == "*") {
re <- paste0(re, "([A-Z])+ ")
} else {
re <- paste0(re, paste0(pos[x], " "))
}
})
# Stitch it all together
expr <- paste(expr, collapse = "")
matches <- gregexpr(expr, combined, ignore.case = ignore.case)
# If there was no match we may exit here
if (matches[[1]][1] == -1) {
# Explicit returns are bad, i know. But what to do here?
return(NA)
}
# What do we want to do with the result data? This currently extracts it,
# strips the POS tags of and then goes home happy
matches <- unlist(regmatches(combined, matches))
matches <- sapply(matches, function(match) {
match <- trimws(gsub("#([A-Z])+ ", " ", match), which = "right")
})
unname(matches)
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{match_pattern}
\alias{match_pattern}
\title{Matches the given pattern to the provided annotations object}
\usage{
match_pattern(annotations, text, pos, use.stems = FALSE,
ignore.case = FALSE, ignore.punct = FALSE)
}
\arguments{
\item{annotations}{The annotations object genereted by cleanNLP}
\item{text}{The text tags to match, needs to be of the same length as pos.
May be supplied as single string or as vector of strings. Wildcards are supported.}
\item{pos}{The part of speech tags to match, needs to be of the same length as text.
May be supplied as single string or as vector of strings. Wildcards are supported.}
\item{use.stems}{Wheather to use stemmed words for the matching.
Beware that the stem for pronouns is _always_ "-PRON-".
Will currently also return stemmed words.}
\item{ignore.case}{Wheater to ignore case during pattern matching}
\item{ignore.punct}{Wheather to ignore punctuation for the pattern matching.
Note that this refers to the _PUNCT_ pos tag, so it will also ignore any parantheses etc.}
}
\value{
Any matches found in the given text
}
\description{
Matches the given pattern to the provided annotations object
}
\examples{
\dontrun{
# Find all proper nouns following an and
# Both notations are working
text <- "and *"
pos <- c("*", "PROPN")
# Prepare data
articles <- get_data()
annotations <- create_annotations(clean_html(articles[42,4]), articles[42,2], articles[42,3])
results <- match_pattern(annotations, text, pos, ignore.punct = TRUE)
results
# [1] "and Frequency" "and Technology" "and Gravitation"
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment