Merge branch '33-create-pattern-matching-function' into 'master'

Resolve "Create pattern matching function" Closes #33 See merge request !32

Merge branch '33-create-pattern-matching-function' into 'master'
d31acde2 · Leonard Haas · 71c6fe2b · 58d5f2d2 · d31acde2 · d31acde2
Commit d31acde2 authored 6 years ago by Leonard Haas
--- a/processing/wikiproc/R/utils.R
+++ b/processing/wikiproc/R/utils.R
@@ -40,3 +40,142 @@ get_infobox <- function(article) {

  return(table)
 }
+
+
+### TODO:
+### * use.stems also makes the function return stemmed words
+### * performance?
+### * Testing (!!)
+### * NER matching
+
+
+#' Matches the given pattern to the provided annotations object
+#'
+#' @param annotations The annotations object genereted by cleanNLP
+#' @param text The text tags to match, needs to be of the same length as pos.
+#' May be supplied as single string or as vector of strings. Wildcards are supported.
+#' @param pos The part of speech tags to match, needs to be of the same length as text.
+#' May be supplied as single string or as vector of strings. Wildcards are supported.
+#' @param use.stems Wheather to use stemmed words for the matching.
+#' Beware that the stem for pronouns is _always_ "-PRON-".
+#' Will currently also return stemmed words.
+#' @param ignore.case Wheater to ignore case during pattern matching
+#' @param ignore.punct Wheather to ignore punctuation for the pattern matching.
+#' Note that this refers to the _PUNCT_ pos tag, so it will also ignore any parantheses etc.
+#'
+#' @return Any matches found in the given text
+#'
+#' @examples
+#' \dontrun{
+#' # Find all proper nouns following an and
+#' 
+#' # Both notations are working
+#' text <- "and *"
+#' pos <- c("*", "PROPN")
+#' 
+#' # Prepare data
+#' articles <- get_data()
+#' annotations <- create_annotations(clean_html(articles[42,4]), articles[42,2], articles[42,3])
+#' 
+#' results <- match_pattern(annotations, text, pos, ignore.punct = TRUE)
+#' results
+#' # [1] "and Frequency"   "and Technology"  "and Gravitation"
+#' }
+match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case = FALSE, ignore.punct = FALSE) {
+  # First we need to verify our input
+  # If we got a single vector we need to expand it
+  if (length(text) == 1) {
+    text <- unlist(strsplit(text, " ", fixed = TRUE))
+  }
+  if (length(pos) == 1) {
+    pos <- unlist(strsplit(pos, " ", fixed = TRUE))
+  }
+  
+  # We need at the very least to tokens to define a pattern
+  if (length(text) <= 1 | length(pos) <= 1) {
+    stop("Error: Input needs to be contain at least two tokens")
+  }
+  
+  # Check the length of our inputs matches
+  if (length(text) != length(pos)) {
+    stop("Error: Text and pos need to be of equal length")
+  }
+  
+  # Let's build the vector we're going to be searching through
+  # TODO: This way of doing this is probably inefficent
+  
+  combined <- NA
+  annot <- cleanNLP::cnlp_get_token(annotations)
+  
+  # Remove spaces as they would mess up a lot down there
+  
+  annot <- annot[annot$upos != "SPACE",]
+  
+  # Remove punctuation if that is desired
+  
+  if(ignore.punct) {
+    annot <- annot[annot$upos != "PUNCT",]
+  }
+  
+  # Now to actually constructing the vector
+  
+  if(use.stems) {
+    # Maybe want to change that seperator, however slashes would mess up regex construction even more
+    combined <- paste(annot$lemma, annot$upos, sep = "#")
+  } else {
+    combined <- paste(annot$word, annot$upos, sep = "#")
+  }
+  
+  
+  # We could also use something else to collapse, we would slightly need to modify the query
+  # but we could skip removing spaces. If that is worth anything?
+  # Also appending an extra space so we can match the last element
+  combined <- paste0(paste(combined, collapse = " "), " ")
+  
+  # Now let us  construct the regex
+  
+  expr <- sapply(seq_along(text), function(x) {
+    # Create regex vector
+    re <- character()
+    
+    # If we get a wildcard we match anything but spaces that comes before a #
+    # If not we match exactly the word we got
+    if(text[x] == "*") {
+      re <- "([^ ])+#"
+    } else {
+      re <- paste0(text[x], "#")
+    }
+    
+    # Now we do a very similar thing for the pos tags
+    # Except we know they will only contain capital letters, always
+    
+    if(pos[x] == "*") {
+      re <- paste0(re, "([A-Z])+ ")
+    } else {
+      re <- paste0(re, paste0(pos[x], " "))
+    }
+  })
+  
+  # Stitch it all together
+  expr <- paste(expr, collapse = "")
+  
+  
+  matches <- gregexpr(expr, combined, ignore.case = ignore.case)
+  
+  # If there was no match we may exit here
+  if (matches[[1]][1] == -1) {
+    # Explicit returns are bad, i know. But what to do here?
+    return(NA)
+  }
+  
+  # What do we want to do with the result data? This currently extracts it,
+  # strips the POS tags of and then goes home happy
+  matches <- unlist(regmatches(combined, matches))
+  
+  matches <- sapply(matches, function(match) {
+    match <- trimws(gsub("#([A-Z])+ ", " ", match), which = "right")
+  })
+  
+  unname(matches)
+  
+}
--- a/processing/wikiproc/man/match_pattern.Rd
+++ b/processing/wikiproc/man/match_pattern.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{match_pattern}
+\alias{match_pattern}
+\title{Matches the given pattern to the provided annotations object}
+\usage{
+match_pattern(annotations, text, pos, use.stems = FALSE,
+  ignore.case = FALSE, ignore.punct = FALSE)
+}
+\arguments{
+\item{annotations}{The annotations object genereted by cleanNLP}
+
+\item{text}{The text tags to match, needs to be of the same length as pos.
+May be supplied as single string or as vector of strings. Wildcards are supported.}
+
+\item{pos}{The part of speech tags to match, needs to be of the same length as text.
+May be supplied as single string or as vector of strings. Wildcards are supported.}
+
+\item{use.stems}{Wheather to use stemmed words for the matching.
+Beware that the stem for pronouns is _always_ "-PRON-".
+Will currently also return stemmed words.}
+
+\item{ignore.case}{Wheater to ignore case during pattern matching}
+
+\item{ignore.punct}{Wheather to ignore punctuation for the pattern matching.
+Note that this refers to the _PUNCT_ pos tag, so it will also ignore any parantheses etc.}
+}
+\value{
+Any matches found in the given text
+}
+\description{
+Matches the given pattern to the provided annotations object
+}
+\examples{
+\dontrun{
+# Find all proper nouns following an and
+
+# Both notations are working
+text <- "and *"
+pos <- c("*", "PROPN")
+
+# Prepare data
+articles <- get_data()
+annotations <- create_annotations(clean_html(articles[42,4]), articles[42,2], articles[42,3])
+
+results <- match_pattern(annotations, text, pos, ignore.punct = TRUE)
+results
+# [1] "and Frequency"   "and Technology"  "and Gravitation"
+}
+}