utils.R

### Utility functions used internally


#' Extract the infobox contents from a Wikipedia article
#'
#' Parses the article html, replaces every \code{<br>} tag with a paragraph
#' holding a line break, and returns the first \code{table.vcard} table of
#' the page as a data frame.
#'
#' @param article Character vector containing the contents of a Wikipedia
#' article as html
#'
#' @return Data frame holding the contents of the table, with its columns
#' named \code{Desc} and \code{Content}
#'
#' @details An error is signalled when the article contains no
#' \code{table.vcard} element, so callers should check for an infobox
#' beforehand or handle the error.
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4,]
#' }
get_infobox <- function(article) {
  # Read page as html
  page <- read_html(article)

  # Extracting text from the html will erase all <br> tags, so a paragraph
  # holding a line break is inserted after each one before the tag itself
  # is dropped. Both calls modify `page` in place; their return values are
  # not needed.
  breaks <- xml_find_all(page, ".//br")
  xml_add_sibling(breaks, "p", "\n")
  xml_remove(breaks)

  # Locate the info box. Fail with a descriptive message here rather than
  # with the "subscript out of bounds" error that indexing an empty list of
  # tables would produce further down.
  infobox_nodes <- html_nodes(page, "table.vcard")
  if (length(infobox_nodes) == 0) {
    stop("No infobox (table.vcard) found in article", call. = FALSE)
  }

  # Only the first matching table is used
  infobox <- html_table(infobox_nodes, fill = TRUE)[[1]]

  colnames(infobox) <- c("Desc", "Content")

  infobox
}