utils.R

### Utility functions used internally

#' Extract the inforbox contents from wikipedia articles
#'
#' @param article Character vector containing the contents of an wikipedia
#' article as html
#'
#' @return Data frame holding the contents of the table
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4, ]
#' }
get_infobox <- function(article) {
  
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html will erase all <br> tags,
  # this will replace them with line breaks

  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_add_sibling("p", "\n")

  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_remove(.)

  # Get the info box
  # Throws error if no infobox found and returns empty table

  tryCatch({
    info.table <- page %>%
      rvest::html_nodes("table.vcard") %>%
      rvest::html_table(fill = TRUE) %>%
      .[[1]]
      colnames(info.table) <- c("Desc", "Content")
  }, error = function(e) {
    #print("Error: No infobox found")
    info.table <<- NA
  })

  
  return(info.table)
}


### TODO:
### * performance?
###   * should be a bit better now as regex is lazy
### * make the method be usable with only text or pos tags
### * add matching of 'up to 3 arbitrary words'*{1:3} or something like this
### * maybe expose some functionality to use with custom regex?
### * Testing (!!)


#' Matches the given pattern to the provided annotations object
#'
#' @param annotations The annotations object genereted by cleanNLP
#' @param text The text tags to match, needs to be of the same length as pos.
#' May be supplied as single string or as vector of strings. Wildcards are supported.
#' @param pos The part of speech tags to match, needs to be of the same length as text.
#' May be supplied as single string or as vector of strings. Wildcards are supported.
#' @param use.stems Wheather to use stemmed words for the matching.
#' Beware that the stem for pronouns is _always_ "-PRON-".
#' Will currently also return stemmed words.
#' @param ignore.case Wheater to ignore case during pattern matching
#' @param ignore.punct Wheather to ignore punctuation for the pattern matching.
#' Note that this refers to the _PUNCT_ pos tag, so it will also ignore any parantheses etc.
#'
#' @return Any matches found in the given text
#'
#' @examples
#' \dontrun{
#' # Find all proper nouns following an and
#'
#' # Both notations are working
#' text <- "and *"
#' pos <- c("*", "PROPN")
#'
#' # Prepare data
#' articles <- get_data()
#' annotations <- create_annotations(
#'   clean_html(articles[articles$Title == "Neil Ashby", 4]),
#'   articles[articles$Title == "Neil Ashby", 2],
#'   articles[articles$Title == "Neil Ashby", 3]
#' )
#'
#' results <- match_pattern(annotations, text, pos, ignore.punct = TRUE)
#' results
#' # [1] "and Frequency"   "and Technology"  "and Gravitation"
#' }
match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case = TRUE, ignore.punct = FALSE) {
  # First we need to verify our input
  # If we got a single vector we need to expand it
  if (length(text) == 1) {
    text <- unlist(strsplit(text, " ", fixed = TRUE))
  }
  if (length(pos) == 1) {
    pos <- unlist(strsplit(pos, " ", fixed = TRUE))
  }

  # We need at the very least two tokens to define a pattern
  if (length(text) <= 1 | length(pos) <= 1) {
    stop("Error: Input needs to be contain at least two tokens")
  }

  # Check the length of our inputs matches
  if (length(text) != length(pos)) {
    stop("Error: Text and pos need to be of equal length")
  }

  # Let's build the vector we're going to be searching through
  # TODO: This way of doing this is probably inefficent

  combined <- NA
  annot <- cleanNLP::cnlp_get_token(annotations)

  # Remove spaces as they would mess up a lot down there

  annot <- annot[annot$upos != "SPACE", ]

  # Remove punctuation if that is desired but keep sentence ending marks aka full stops

  if (ignore.punct) {
    annot <- annot[annot$upos != "PUNCT" || annot$word == ".", ]
  }

  # Now to actually constructing the vector

  if (use.stems) {
    combined <- paste(annot$word, annot$lemma, annot$upos, sep = "\u001F")
  } else {
    combined <- paste(annot$word, annot$upos, sep = "\u001F")
  }


  # We could also use something else to collapse, we would slightly need to modify the query
  # but we could skip removing spaces. If that is worth anything?
  # Also appending an extra space so we can match the last element
  combined <- paste0(paste(combined, collapse = " "), " ")

  # Now let us  construct the regex

  expr <- sapply(seq_along(text), function(x) {
    # Create regex vector
    re <- character()

    # If using stems we want to ignore the first part of the data
    # as it contains the original word for later use

    if (use.stems) {
      # Match everything up to the seperator value
      re <- "([^ ])+?\u001F"
    }

    # If we get a wildcard we match anything but spaces that comes before a #
    # If not we match exactly the word we got
    if (text[x] == "*") {
      re <- paste0(re, "([^ ])+?\u001F")
    } else {
      re <- paste0(re, paste0(text[x], "\u001F"))
    }

    # Now we do a very similar thing for the pos tags
    # Except we know they will only contain capital letters, always

    if (pos[x] == "*") {
      re <- paste0(re, "([A-Z])+? ")
    } else {
      re <- paste0(re, paste0(pos[x], " "))
    }
  })

  # Stitch it all together
  expr <- paste(expr, collapse = "")


  matches <- gregexpr(expr, combined, ignore.case = ignore.case)

  # If there was no match we may exit here
  if (matches[[1]][1] == -1) {
    # Explicit returns are bad, i know. But what to do here?
    return(NA)
  }

  # What do we want to do with the result data? This currently extracts it,
  # strips the POS tags of and then goes home happy
  matches <- unlist(regmatches(combined, matches))

  if (use.stems) {
    matches <- sapply(matches, function(match) {
      match <- trimws(gsub("\u001F([^ ])+?\u001F([A-Z])+ ", " ", match), which = "right")
    })
  } else {
    matches <- sapply(matches, function(match) {
      match <- trimws(gsub("\u001F([A-Z])+ ", " ", match), which = "right")
    })
  }


  unname(matches)
}