Commit 17960c6b authored by Lucas Schons

Apply styler, fix typo

parent f5773748
1 merge request: !47 Resolve "Improve pattern matching function"
@@ -13,7 +13,7 @@
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4,]
#' infobox[3:4, ]
#' }
get_infobox <- function(article) {
# Read page as html
@@ -27,7 +27,7 @@ get_infobox <- function(article) {
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_remove(.)
# Get the info box
# Will throw an error if there isn't any, so that should be checked beforehand
@@ -70,19 +70,19 @@ get_infobox <- function(article) {
#' @examples
#' \dontrun{
#' # Find all proper nouns following an "and"
#'
#'
#' # Both notations work
#' text <- "and *"
#' pos <- c("*", "PROPN")
#'
#'
#' # Prepare data
#' articles <- get_data()
#' annotations <- create_annotations(
#' clean_html(articles[articles$Title == "Neil Ashby",4]),
#' articles[articles$Title == "Neil Ashby",2],
#' articles[articles$Title == "Neil Ashby",3]
#' clean_html(articles[articles$Title == "Neil Ashby", 4]),
#' articles[articles$Title == "Neil Ashby", 2],
#' articles[articles$Title == "Neil Ashby", 3]
#' )
#'
#'
#' results <- match_pattern(annotations, text, pos, ignore.punct = TRUE)
#' results
#' # [1] "and Frequency" "and Technology" "and Gravitation"
@@ -96,98 +96,96 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
if (length(pos) == 1) {
pos <- unlist(strsplit(pos, " ", fixed = TRUE))
}
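# Illustrative note (not part of the original source): a single space-separated
# pattern string such as pos <- "* PROPN" is split here into c("*", "PROPN"),
# so both notations from the example above end up as one tag per element.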
# We need at the very least to tokens to define a pattern
# We need at the very least two tokens to define a pattern
if (length(text) <= 1 | length(pos) <= 1) {
stop("Error: Input needs to be contain at least two tokens")
}
# Check that the lengths of our inputs match
if (length(text) != length(pos)) {
stop("Error: Text and pos need to be of equal length")
}
# Let's build the vector we're going to be searching through
# TODO: This way of doing this is probably inefficient
combined <- NA
annot <- cleanNLP::cnlp_get_token(annotations)
# Remove spaces as they would mess things up further down
annot <- annot[annot$upos != "SPACE",]
annot <- annot[annot$upos != "SPACE", ]
# Remove punctuation if desired, but keep sentence-ending marks, i.e. full stops
if(ignore.punct) {
annot <- annot[annot$upos != "PUNCT" | annot$word == ".",]
if (ignore.punct) {
annot <- annot[annot$upos != "PUNCT" | annot$word == ".", ]
}
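# Descriptive note (added for clarity): at this point annot holds one row per
# remaining token, with the columns used below: word, lemma and upos.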
# Now to actually construct the vector
if(use.stems) {
if (use.stems) {
combined <- paste(annot$word, annot$lemma, annot$upos, sep = "\u001F")
} else {
combined <- paste(annot$word, annot$upos, sep = "\u001F")
}
# We could also collapse with something other than a space; the query would need
# slight changes, but we could then skip removing spaces. Not sure that is worth anything.
# Also appending an extra space so we can match the last element
combined <- paste0(paste(combined, collapse = " "), " ")
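# Illustrative sketch of what `combined` looks like without stems, assuming a
# fragment tagged Neil/PROPN Ashby/PROPN was/AUX (tags are hypothetical):
# "Neil\u001FPROPN Ashby\u001FPROPN was\u001FAUX "
# With use.stems = TRUE each token becomes word\u001Flemma\u001Fupos instead.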
# Now let us construct the regex
expr <- sapply(seq_along(text), function(x) {
# Create regex vector
re <- character()
# If using stems we want to ignore the first part of the data
# as it contains the original word for later use
if(use.stems) {
if (use.stems) {
# Match everything up to the separator value
re <- "([^ ])+?\u001F"
}
# If we get a wildcard we match anything but spaces up to the separator
# If not we match exactly the word we got
if(text[x] == "*") {
if (text[x] == "*") {
re <- paste0(re, "([^ ])+?\u001F")
} else {
re <- paste0(re, paste0(text[x], "\u001F"))
}
# Now we do a very similar thing for the POS tags,
# except we know they will only ever contain capital letters
if(pos[x] == "*") {
if (pos[x] == "*") {
re <- paste0(re, "([A-Z])+? ")
} else {
re <- paste0(re, paste0(pos[x], " "))
}
})
# Stitch it all together
expr <- paste(expr, collapse = "")
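# Illustrative sketch: for text <- c("and", "*") and pos <- c("*", "PROPN")
# without stems, the assembled regex is
# "and\u001F([A-Z])+? ([^ ])+?\u001FPROPN "
# i.e. the literal word "and" with any POS tag, followed by any word tagged PROPN.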
matches <- gregexpr(expr, combined, ignore.case = ignore.case)
# If there was no match we may exit here
if (matches[[1]][1] == -1) {
# Explicit returns are bad, I know. But what to do here?
return(NA)
}
# What do we want to do with the result data? This currently extracts it,
# strips the POS tags off and then goes home happy
matches <- unlist(regmatches(combined, matches))
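# Illustrative sketch of a raw match before the tags are stripped below
# (assuming "and" is tagged CCONJ): "and\u001FCCONJ Frequency\u001FPROPN ",
# which the gsub calls reduce to "and Frequency".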
if(use.stems) {
if (use.stems) {
matches <- sapply(matches, function(match) {
match <- trimws(gsub("\u001F([^ ])+?\u001F([A-Z])+ ", " ", match), which = "right")
})
@@ -196,8 +194,7 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
match <- trimws(gsub("\u001F([A-Z])+ ", " ", match), which = "right")
})
}
unname(matches)
}
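# Illustrative usage sketch (not part of the original source), reusing the
# `annotations` object from the roxygen example above; the pattern and tags
# here are hypothetical:
# matched <- match_pattern(annotations, "time *", "NOUN PROPN",
#   use.stems = TRUE, ignore.case = TRUE, ignore.punct = TRUE
# )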