diff --git a/processing/wikiproc/R/utils.R b/processing/wikiproc/R/utils.R index f4b088cbf20d54aceaa01103229508a1775ca36b..0725b3a1a1525d93f02ae72bf85888cf6ecc68f0 100644 --- a/processing/wikiproc/R/utils.R +++ b/processing/wikiproc/R/utils.R @@ -40,3 +40,142 @@ get_infobox <- function(article) { return(table) } + + +### TODO: +### * use.stems also makes the function return stemmed words +### * performance? +### * Testing (!!) +### * NER matching + + +#' Matches the given pattern to the provided annotations object +#' +#' @param annotations The annotations object genereted by cleanNLP +#' @param text The text tags to match, needs to be of the same length as pos. +#' May be supplied as single string or as vector of strings. Wildcards are supported. +#' @param pos The part of speech tags to match, needs to be of the same length as text. +#' May be supplied as single string or as vector of strings. Wildcards are supported. +#' @param use.stems Wheather to use stemmed words for the matching. +#' Beware that the stem for pronouns is _always_ "-PRON-". +#' Will currently also return stemmed words. +#' @param ignore.case Wheater to ignore case during pattern matching +#' @param ignore.punct Wheather to ignore punctuation for the pattern matching. +#' Note that this refers to the _PUNCT_ pos tag, so it will also ignore any parantheses etc. +#' +#' @return Any matches found in the given text +#' +#' @examples +#' \dontrun{ +#' # Find all proper nouns following an and +#' +#' # Both notations are working +#' text <- "and *" +#' pos <- c("*", "PROPN") +#' +#' # Prepare data +#' articles <- get_data() +#' annotations <- create_annotations(clean_html(articles[42,4]), articles[42,2], articles[42,3]) +#' +#' results <- match_pattern(annotations, text, pos, ignore.punct = TRUE) +#' results +#' # [1] "and Frequency" "and Technology" "and Gravitation" +#' } +match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case = FALSE, ignore.punct = FALSE) { + # First we need to verify our input + # If we got a single vector we need to expand it + if (length(text) == 1) { + text <- unlist(strsplit(text, " ", fixed = TRUE)) + } + if (length(pos) == 1) { + pos <- unlist(strsplit(pos, " ", fixed = TRUE)) + } + + # We need at the very least to tokens to define a pattern + if (length(text) <= 1 | length(pos) <= 1) { + stop("Error: Input needs to be contain at least two tokens") + } + + # Check the length of our inputs matches + if (length(text) != length(pos)) { + stop("Error: Text and pos need to be of equal length") + } + + # Let's build the vector we're going to be searching through + # TODO: This way of doing this is probably inefficent + + combined <- NA + annot <- cleanNLP::cnlp_get_token(annotations) + + # Remove spaces as they would mess up a lot down there + + annot <- annot[annot$upos != "SPACE",] + + # Remove punctuation if that is desired + + if(ignore.punct) { + annot <- annot[annot$upos != "PUNCT",] + } + + # Now to actually constructing the vector + + if(use.stems) { + # Maybe want to change that seperator, however slashes would mess up regex construction even more + combined <- paste(annot$lemma, annot$upos, sep = "#") + } else { + combined <- paste(annot$word, annot$upos, sep = "#") + } + + + # We could also use something else to collapse, we would slightly need to modify the query + # but we could skip removing spaces. If that is worth anything? + # Also appending an extra space so we can match the last element + combined <- paste0(paste(combined, collapse = " "), " ") + + # Now let us construct the regex + + expr <- sapply(seq_along(text), function(x) { + # Create regex vector + re <- character() + + # If we get a wildcard we match anything but spaces that comes before a # + # If not we match exactly the word we got + if(text[x] == "*") { + re <- "([^ ])+#" + } else { + re <- paste0(text[x], "#") + } + + # Now we do a very similar thing for the pos tags + # Except we know they will only contain capital letters, always + + if(pos[x] == "*") { + re <- paste0(re, "([A-Z])+ ") + } else { + re <- paste0(re, paste0(pos[x], " ")) + } + }) + + # Stitch it all together + expr <- paste(expr, collapse = "") + + + matches <- gregexpr(expr, combined, ignore.case = ignore.case) + + # If there was no match we may exit here + if (matches[[1]][1] == -1) { + # Explicit returns are bad, i know. But what to do here? + return(NA) + } + + # What do we want to do with the result data? This currently extracts it, + # strips the POS tags of and then goes home happy + matches <- unlist(regmatches(combined, matches)) + + matches <- sapply(matches, function(match) { + match <- trimws(gsub("#([A-Z])+ ", " ", match), which = "right") + }) + + unname(matches) + +} diff --git a/processing/wikiproc/man/match_pattern.Rd b/processing/wikiproc/man/match_pattern.Rd new file mode 100644 index 0000000000000000000000000000000000000000..508483240684217486c1da6f20f3f901d271fb84 --- /dev/null +++ b/processing/wikiproc/man/match_pattern.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{match_pattern} +\alias{match_pattern} +\title{Matches the given pattern to the provided annotations object} +\usage{ +match_pattern(annotations, text, pos, use.stems = FALSE, + ignore.case = FALSE, ignore.punct = FALSE) +} +\arguments{ +\item{annotations}{The annotations object genereted by cleanNLP} + +\item{text}{The text tags to match, needs to be of the same length as pos. +May be supplied as single string or as vector of strings. Wildcards are supported.} + +\item{pos}{The part of speech tags to match, needs to be of the same length as text. +May be supplied as single string or as vector of strings. Wildcards are supported.} + +\item{use.stems}{Wheather to use stemmed words for the matching. +Beware that the stem for pronouns is _always_ "-PRON-". +Will currently also return stemmed words.} + +\item{ignore.case}{Wheater to ignore case during pattern matching} + +\item{ignore.punct}{Wheather to ignore punctuation for the pattern matching. +Note that this refers to the _PUNCT_ pos tag, so it will also ignore any parantheses etc.} +} +\value{ +Any matches found in the given text +} +\description{ +Matches the given pattern to the provided annotations object +} +\examples{ +\dontrun{ +# Find all proper nouns following an and + +# Both notations are working +text <- "and *" +pos <- c("*", "PROPN") + +# Prepare data +articles <- get_data() +annotations <- create_annotations(clean_html(articles[42,4]), articles[42,2], articles[42,3]) + +results <- match_pattern(annotations, text, pos, ignore.punct = TRUE) +results +# [1] "and Frequency" "and Technology" "and Gravitation" +} +}