### Utility functions used internally

#' Extract the infobox contents from Wikipedia articles
#'
#' @param article Character vector containing the contents of a Wikipedia
#' article as HTML
#'
#' @return Data frame holding the contents of the infobox, or NA if the
#' article contains no infobox
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4, ]
#' }
get_infobox <- function(article) {
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html will erase all <br> tags,
  # so replace them with line breaks first
  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_add_sibling("p", "\n")

  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_remove()

  # Get the info box. If no infobox is found, the subscript below throws
  # an error, which we catch to return NA instead of an empty table
  info.table <- tryCatch({
    info.table <- page %>%
      rvest::html_nodes("table.vcard") %>%
      rvest::html_table(fill = TRUE) %>%
      .[[1]]
    colnames(info.table) <- c("Desc", "Content")
    info.table
  }, error = function(e) NA)

  return(info.table)
}
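
# A rough usage sketch for get_infobox() (the HTML below is made up for
# illustration; assumes xml2, rvest and the magrittr pipe are available,
# as the function itself requires). Wrapped in `if (FALSE)` so it never
# runs when this file is sourced.
if (FALSE) {
  toy.article <- paste0(
    "<html><body><table class='vcard'>",
    "<tr><td>Born</td><td>1946<br>Ohio</td></tr>",
    "<tr><td>Field</td><td>Physics</td></tr>",
    "</table></body></html>"
  )
  get_infobox(toy.article)                   # should yield a Desc/Content data frame
  get_infobox("<html><p>no box</p></html>")  # no vcard table, so NA
}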

### TODO:
### * performance?
###   * should be a bit better now as the regex is lazy
### * make the method usable with only text or only pos tags
### * add matching of 'up to 3 arbitrary words', *{1:3} or something like this
### * maybe expose some functionality to use with custom regex?
### * Testing (!!)

#' Matches the given pattern to the provided annotations object
#'
#' @param annotations The annotations object generated by cleanNLP
#' @param text The text tokens to match; needs to be of the same length as pos.
#' May be supplied as a single string or as a vector of strings. Wildcards are supported.
#' @param pos The part-of-speech tags to match; needs to be of the same length as text.
#' May be supplied as a single string or as a vector of strings. Wildcards are supported.
#' @param use.stems Whether to use stemmed words for the matching.
#' Beware that the stem for pronouns is _always_ "-PRON-".
#' Will currently also return stemmed words.
#' @param ignore.case Whether to ignore case during pattern matching
#' @param ignore.punct Whether to ignore punctuation for the pattern matching.
#' Note that this refers to the _PUNCT_ pos tag, so it will also ignore any parentheses etc.
#'
#' @return Any matches found in the given text
#'
#' @examples
#' \dontrun{
#' # Find all proper nouns following an "and"
#'
#' # Both notations work
#' text <- "and *"
#' pos <- c("*", "PROPN")
#'
#' # Prepare data
#' articles <- get_data()
#' annotations <- create_annotations(
#'   clean_html(articles[articles$Title == "Neil Ashby", 4]),
#'   articles[articles$Title == "Neil Ashby", 2],
#'   articles[articles$Title == "Neil Ashby", 3]
#' )
#'
#' results <- match_pattern(annotations, text, pos, ignore.punct = TRUE)
#' results
#' # [1] "and Frequency" "and Technology" "and Gravitation"
#' }
match_pattern <- function(annotations, text, pos, use.stems = FALSE,
                          ignore.case = TRUE, ignore.punct = FALSE) {
  # First we need to verify our input.
  # If we got a single string, split it into tokens
  if (length(text) == 1) {
    text <- unlist(strsplit(text, " ", fixed = TRUE))
  }
  if (length(pos) == 1) {
    pos <- unlist(strsplit(pos, " ", fixed = TRUE))
  }

  # We need at the very least two tokens to define a pattern
  if (length(text) <= 1 || length(pos) <= 1) {
    stop("Input needs to contain at least two tokens")
  }

  # Check that the lengths of our inputs match
  if (length(text) != length(pos)) {
    stop("text and pos need to be of equal length")
  }

  # Build the vector we're going to be searching through
  # TODO: This way of doing this is probably inefficient
  annot <- cleanNLP::cnlp_get_token(annotations)

  # Remove spaces as they would mess up a lot further down
  annot <- annot[annot$upos != "SPACE", ]

  # Remove punctuation if desired, but keep sentence-ending marks (full stops)
  if (ignore.punct) {
    annot <- annot[annot$upos != "PUNCT" | annot$word == ".", ]
  }

  # Now actually construct the vector
  if (use.stems) {
    combined <- paste(annot$word, annot$lemma, annot$upos, sep = "\u001F")
  } else {
    combined <- paste(annot$word, annot$upos, sep = "\u001F")
  }

  # We could collapse with something other than a space (and then skip
  # removing spaces above), but it is unclear whether that gains anything.
  # Append an extra space so the last element can be matched, too
  combined <- paste0(paste(combined, collapse = " "), " ")
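
  # Shape of `combined` at this point (illustrative, hand-written tags):
  #   "Standards\u001FPROPN and\u001FCCONJ Technology\u001FPROPN "
  # i.e. word and tag joined by the unit separator U+001F and tokens joined
  # by single spaces. With use.stems = TRUE each token carries an extra
  # lemma field between word and tag, e.g. "running\u001Frun\u001FVERB".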

  # Now let us construct the regex
  expr <- sapply(seq_along(text), function(x) {
    # Build up the regex for this token
    re <- character()

    # If using stems we want to skip the first field of each token,
    # as it contains the original word kept for later use
    if (use.stems) {
      # Match everything up to the separator
      re <- "([^ ])+?\u001F"
    }

    # A wildcard matches anything but spaces up to the next separator;
    # otherwise we match exactly the word we got
    if (text[x] == "*") {
      re <- paste0(re, "([^ ])+?\u001F")
    } else {
      re <- paste0(re, text[x], "\u001F")
    }

    # Now we do a very similar thing for the pos tags,
    # except we know they will only ever contain capital letters
    if (pos[x] == "*") {
      re <- paste0(re, "([A-Z])+? ")
    } else {
      re <- paste0(re, pos[x], " ")
    }
  })

  # Stitch it all together
  expr <- paste(expr, collapse = "")

  matches <- gregexpr(expr, combined, ignore.case = ignore.case)

  # If there was no match we may exit here
  # (Explicit returns are bad, I know. But what to do here?)
  if (matches[[1]][1] == -1) {
    return(NA)
  }

  # What do we want to do with the result data? This currently extracts it,
  # strips the POS tags off and then goes home happy
  matches <- unlist(regmatches(combined, matches))

  if (use.stems) {
    matches <- sapply(matches, function(match) {
      trimws(gsub("\u001F([^ ])+?\u001F([A-Z])+ ", " ", match), which = "right")
    })
  } else {
    matches <- sapply(matches, function(match) {
      trimws(gsub("\u001F([A-Z])+ ", " ", match), which = "right")
    })
  }

  unname(matches)
}
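
# A minimal sketch of the matching core, independent of cleanNLP: the token
# stream and the regex below are hand-written to mirror what match_pattern()
# builds internally for text = "and *", pos = "* PROPN". Wrapped in
# `if (FALSE)` so it never runs when this file is sourced.
if (FALSE) {
  combined <- "Standards\u001FPROPN and\u001FCCONJ Technology\u001FPROPN "
  expr <- "and\u001F([A-Z])+? ([^ ])+?\u001FPROPN "

  m <- gregexpr(expr, combined, ignore.case = TRUE)
  matches <- unlist(regmatches(combined, m))
  trimws(gsub("\u001F([A-Z])+ ", " ", matches), which = "right")
  # expected: "and Technology"
}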