Skip to content
Snippets Groups Projects
utils.R 999 B
Newer Older
David Fuhry's avatar
David Fuhry committed
### Utility functions used internally


#' Extract the infobox contents from Wikipedia articles
#'
#' Parses the article html, replaces every `<br>` tag with a paragraph that
#' holds a line break, and converts the first `table.vcard` element of the
#' page into a data frame.
#'
#' @param article Character vector containing the contents of a Wikipedia
#' article as html
#'
#' @return Data frame holding the contents of the table, with the columns
#' named `Desc` and `Content`
#'
#' @details An error is raised if the article does not contain a
#' `table.vcard` element, so callers that cannot guarantee an infobox should
#' check beforehand or handle the error.
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4, ]
#' }
get_infobox <- function(article) {
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html will erase all <br> tags, so a paragraph
  # holding a line break is inserted next to each of them before the tags
  # themselves are dropped. xml2 modifies the document in place, which is
  # why the results of these two calls are not assigned.
  line_breaks <- xml2::xml_find_all(page, ".//br")
  xml2::xml_add_sibling(line_breaks, "p", "\n")
  xml2::xml_remove(line_breaks)

  # Get the info box; fail with a descriptive message when there is none
  # instead of an opaque "subscript out of bounds"
  infobox_nodes <- rvest::html_nodes(page, "table.vcard")

  if (length(infobox_nodes) == 0) {
    stop("No infobox (table.vcard) found in article", call. = FALSE)
  }

  # Only the first match is used, so only that one is converted
  infobox <- rvest::html_table(infobox_nodes[[1]], fill = TRUE)

  colnames(infobox) <- c("Desc", "Content")

  infobox
}