### Utility functions used internally
#' Extract the infobox contents from Wikipedia articles
#'
#' Parses the article HTML, inserts an explicit line break after every
#' `<br>` tag (the tags themselves are lost when text is extracted), and
#' returns the first `table.vcard` element of the page as a data frame.
#'
#' @param article Character string containing the contents of a Wikipedia
#'   article as HTML.
#'
#' @return Data frame holding the contents of the infobox table, with its
#'   columns renamed to `Desc` and `Content` (assumes the infobox parses
#'   into exactly two columns).
#'
#' @details Signals an error if the article contains no `table.vcard`
#'   element, so callers should check for an infobox beforehand.
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4, ]
#' }
get_infobox <- function(article) {
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html will erase all <br> tags,
  # this will replace them with line breaks.
  # xml2 nodes are modified by reference, so `page` is updated in place and
  # the result of the pipe does not need to be assigned.
  xml2::xml_find_all(page, ".//br") %>%
    xml2::xml_add_sibling("p", "\n")

  # Get the info box (the first table carrying the "vcard" class)
  tables <- page %>%
    html_nodes("table.vcard") %>%
    html_table(fill = TRUE)

  # Fail with a clear message instead of an opaque "subscript out of bounds"
  if (length(tables) == 0) {
    stop("No infobox (table.vcard) found in article", call. = FALSE)
  }

  table <- tables[[1]]
  colnames(table) <- c("Desc", "Content")
  table
}