### Utility functions used internally #' Extract the inforbox contents from wikipedia articles #' #' @param article Character vector containing the contents of an wikipedia #' article as html #' #' @return Data frame holding the contents of the table #' #' @examples #' \dontrun{ #' articles <- get_data() #' #' infobox <- get_infobox(articles$Text[54]) #' #' infobox[3:4,] #' } get_infobox <- function(article) { # Read page as html page <- xml2::read_html(article) # Extracting text from the html will erase all <br> tags, # this will replace them with line breaks xml2::xml_find_all(page, ".//br") %>% xml2::xml_add_sibling("p", "\n") xml2::xml_find_all(page, ".//br") %>% xml2::xml_remove() # Get the info box # Will throw an error if there isnt any, so that should be checked beforehand table <- page %>% rvest::html_nodes("table.vcard") %>% rvest::html_table(fill = TRUE) %>% .[[1]] colnames(table) <- c("Desc", "Content") return(table) } ### TODO: ### * use.stems also makes the function return stemmed words ### * performance? ### * Testing (!!) ### * NER matching #' Matches the given pattern to the provided annotations object #' #' @param annotations The annotations object genereted by cleanNLP #' @param text The text tags to match, needs to be of the same length as pos. #' May be supplied as single string or as vector of strings. Wildcards are supported. #' @param pos The part of speech tags to match, needs to be of the same length as text. #' May be supplied as single string or as vector of strings. Wildcards are supported. #' @param use.stems Wheather to use stemmed words for the matching. #' Beware that the stem for pronouns is _always_ "-PRON-". #' Will currently also return stemmed words. #' @param ignore.case Wheater to ignore case during pattern matching #' @param ignore.punct Wheather to ignore punctuation for the pattern matching. #' Note that this refers to the _PUNCT_ pos tag, so it will also ignore any parantheses etc. #' #' @return Any matches found in the given text #' #' @examples #' \dontrun{ #' # Find all proper nouns following an and #' #' # Both notations are working #' text <- "and *" #' pos <- c("*", "PROPN") #' #' # Prepare data #' articles <- get_data() #' annotations <- create_annotations(clean_html(articles[42,4]), articles[42,2], articles[42,3]) #' #' results <- match_pattern(annotations, text, pos, ignore.punct = TRUE) #' results #' # [1] "and Frequency" "and Technology" "and Gravitation" #' } match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case = FALSE, ignore.punct = FALSE) { # First we need to verify our input # If we got a single vector we need to expand it if (length(text) == 1) { text <- unlist(strsplit(text, " ", fixed = TRUE)) } if (length(pos) == 1) { pos <- unlist(strsplit(pos, " ", fixed = TRUE)) } # We need at the very least to tokens to define a pattern if (length(text) <= 1 | length(pos) <= 1) { stop("Error: Input needs to be contain at least two tokens") } # Check the length of our inputs matches if (length(text) != length(pos)) { stop("Error: Text and pos need to be of equal length") } # Let's build the vector we're going to be searching through # TODO: This way of doing this is probably inefficent combined <- NA annot <- cleanNLP::cnlp_get_token(annotations) # Remove spaces as they would mess up a lot down there annot <- annot[annot$upos != "SPACE",] # Remove punctuation if that is desired if(ignore.punct) { annot <- annot[annot$upos != "PUNCT",] } # Now to actually constructing the vector if(use.stems) { # Maybe want to change that seperator, however slashes would mess up regex construction even more combined <- paste(annot$lemma, annot$upos, sep = "#") } else { combined <- paste(annot$word, annot$upos, sep = "#") } # We could also use something else to collapse, we would slightly need to modify the query # but we could skip removing spaces. If that is worth anything? # Also appending an extra space so we can match the last element combined <- paste0(paste(combined, collapse = " "), " ") # Now let us construct the regex expr <- sapply(seq_along(text), function(x) { # Create regex vector re <- character() # If we get a wildcard we match anything but spaces that comes before a # # If not we match exactly the word we got if(text[x] == "*") { re <- "([^ ])+#" } else { re <- paste0(text[x], "#") } # Now we do a very similar thing for the pos tags # Except we know they will only contain capital letters, always if(pos[x] == "*") { re <- paste0(re, "([A-Z])+ ") } else { re <- paste0(re, paste0(pos[x], " ")) } }) # Stitch it all together expr <- paste(expr, collapse = "") matches <- gregexpr(expr, combined, ignore.case = ignore.case) # If there was no match we may exit here if (matches[[1]][1] == -1) { # Explicit returns are bad, i know. But what to do here? return(NA) } # What do we want to do with the result data? This currently extracts it, # strips the POS tags of and then goes home happy matches <- unlist(regmatches(combined, matches)) matches <- sapply(matches, function(match) { match <- trimws(gsub("#([A-Z])+ ", " ", match), which = "right") }) unname(matches) }