### Utility functions used internally

#' Extract the infobox contents from a Wikipedia article
#'
#' Parses the article HTML, turns every `<br>` tag into a line break so that
#' multi-line cells survive text extraction, and returns the first table
#' matching the CSS selector `table.vcard` as a two-column table.
#'
#' @param article Character vector containing the contents of a Wikipedia
#'   article as html
#'
#' @return Data frame holding the contents of the table, with the columns
#'   renamed to `Desc` and `Content`. An error is raised if the article
#'   contains no `table.vcard` element.
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4,]
#' }
get_infobox <- function(article) {
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html would erase all <br> tags, so put a
  # paragraph holding a line break after each one and then drop the tag.
  line_breaks <- xml2::xml_find_all(page, ".//br")
  xml2::xml_add_sibling(line_breaks, "p", "\n")
  xml2::xml_remove(line_breaks)

  # Locate the info box and fail with a descriptive message when there is
  # none, instead of an opaque "subscript out of bounds" error.
  infobox_nodes <- rvest::html_nodes(page, "table.vcard")
  if (length(infobox_nodes) == 0L) {
    stop("No infobox (table.vcard) found in the article", call. = FALSE)
  }

  # Only the first matching table is used, so only that one is parsed.
  infobox <- rvest::html_table(infobox_nodes[[1]], fill = TRUE)

  # NOTE(review): assumes the parsed table has exactly two columns -- confirm
  # against real articles.
  colnames(infobox) <- c("Desc", "Content")

  infobox
}