Commit 17960c6b authored by Lucas Schons

Apply styler, fix typo

parent f5773748
1 merge request: !47 Resolve "Improve pattern matching function"
@@ -13,7 +13,7 @@
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4,]
#' infobox[3:4, ]
#' }
get_infobox <- function(article) {
# Read page as html
@@ -27,7 +27,7 @@ get_infobox <- function(article) {
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_remove(.)
# Get the info box
# Will throw an error if there isn't any, so that should be checked beforehand
@@ -70,19 +70,19 @@ get_infobox <- function(article) {
#' @examples
#' \dontrun{
#' # Find all proper nouns following an "and"
#'
#'
#' # Both notations work
#' text <- "and *"
#' pos <- c("*", "PROPN")
#'
#'
#' # Prepare data
#' articles <- get_data()
#' annotations <- create_annotations(
#' clean_html(articles[articles$Title == "Neil Ashby",4]),
#' articles[articles$Title == "Neil Ashby",2],
#' articles[articles$Title == "Neil Ashby",3]
#' clean_html(articles[articles$Title == "Neil Ashby", 4]),
#' articles[articles$Title == "Neil Ashby", 2],
#' articles[articles$Title == "Neil Ashby", 3]
#' )
#'
#'
#' results <- match_pattern(annotations, text, pos, ignore.punct = TRUE)
#' results
#' # [1] "and Frequency" "and Technology" "and Gravitation"
@@ -96,98 +96,96 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
if (length(pos) == 1) {
pos <- unlist(strsplit(pos, " ", fixed = TRUE))
}
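# Illustrative note (not part of the original source): a single space-separated
# pattern string such as pos <- "* PROPN" is split here into c("*", "PROPN"),
# so both notations from the example above end up as one tag per element.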
# We need at the very least to tokens to define a pattern
# We need at the very least two tokens to define a pattern
if (length(text) <= 1 | length(pos) <= 1) {
stop("Error: Input needs to be contain at least two tokens")
}
# Check that the lengths of our inputs match
if (length(text) != length(pos)) {
stop("Error: Text and pos need to be of equal length")
}
# Let's build the vector we're going to be searching through
# TODO: This way of doing this is probably inefficient
combined <- NA
annot <- cleanNLP::cnlp_get_token(annotations)
# Remove spaces as they would mess things up further down
annot <- annot[annot$upos != "SPACE",]
annot <- annot[annot$upos != "SPACE", ]
# Remove punctuation if desired, but keep sentence-ending marks, i.e. full stops
if(ignore.punct) {
annot <- annot[annot$upos != "PUNCT" | annot$word == ".",]
if (ignore.punct) {
annot <- annot[annot$upos != "PUNCT" | annot$word == ".", ]
}
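# Descriptive note (added for clarity): at this point annot holds one row per
# remaining token, with the columns used below: word, lemma and upos.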
# Now to actually construct the vector
if(use.stems) {
if (use.stems) {
combined <- paste(annot$word, annot$lemma, annot$upos, sep = "\u001F")
} else {
combined <- paste(annot$word, annot$upos, sep = "\u001F")
}
# We could also collapse with something other than a space; the query would need
# slight changes, but we could then skip removing spaces. Not sure that is worth anything.
# Also appending an extra space so we can match the last element
combined <- paste0(paste(combined, collapse = " "), " ")
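# Illustrative sketch of what `combined` looks like without stems, assuming a
# fragment tagged Neil/PROPN Ashby/PROPN was/AUX (tags are hypothetical):
# "Neil\u001FPROPN Ashby\u001FPROPN was\u001FAUX "
# With use.stems = TRUE each token becomes word\u001Flemma\u001Fupos instead.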
# Now let us construct the regex
expr <- sapply(seq_along(text), function(x) {
# Create regex vector
re <- character()
# If using stems we want to ignore the first part of the data
# as it contains the original word for later use
if(use.stems) {
if (use.stems) {
# Match everything up to the separator value
re <- "([^ ])+?\u001F"
}
# If we get a wildcard we match anything but spaces up to the separator
# If not we match exactly the word we got
if(text[x] == "*") {
if (text[x] == "*") {
re <- paste0(re, "([^ ])+?\u001F")
} else {
re <- paste0(re, paste0(text[x], "\u001F"))
}
# Now we do a very similar thing for the POS tags,
# except we know they will only ever contain capital letters
if(pos[x] == "*") {
if (pos[x] == "*") {
re <- paste0(re, "([A-Z])+? ")
} else {
re <- paste0(re, paste0(pos[x], " "))
}
})
# Stitch it all together
expr <- paste(expr, collapse = "")
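# Illustrative sketch: for text <- c("and", "*") and pos <- c("*", "PROPN")
# without stems, the assembled regex is
# "and\u001F([A-Z])+? ([^ ])+?\u001FPROPN "
# i.e. the literal word "and" with any POS tag, followed by any word tagged PROPN.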
matches <- gregexpr(expr, combined, ignore.case = ignore.case)
# If there was no match we may exit here
if (matches[[1]][1] == -1) {
# Explicit returns are bad, I know. But what to do here?
return(NA)
}
# What do we want to do with the result data? This currently extracts it,
# strips the POS tags off and then goes home happy
matches <- unlist(regmatches(combined, matches))
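# Illustrative sketch of a raw match before the tags are stripped below
# (assuming "and" is tagged CCONJ): "and\u001FCCONJ Frequency\u001FPROPN ",
# which the gsub calls reduce to "and Frequency".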
if(use.stems) {
if (use.stems) {
matches <- sapply(matches, function(match) {
match <- trimws(gsub("\u001F([^ ])+?\u001F([A-Z])+ ", " ", match), which = "right")
})
@@ -196,8 +194,7 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
match <- trimws(gsub("\u001F([A-Z])+ ", " ", match), which = "right")
})
}
unname(matches)
}
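# Illustrative usage sketch (not part of the original source), reusing the
# `annotations` object from the roxygen example above; the pattern and tags
# here are hypothetical:
# matched <- match_pattern(annotations, "time *", "NOUN PROPN",
#   use.stems = TRUE, ignore.case = TRUE, ignore.punct = TRUE
# )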