#!/usr/bin/env Rscript

# Author: Lucas

#' Clean a html formatted wikipedia page.
#' Nodes of interest from the DOM are extracted and then cleaned from all html
#' tags and annotations.
#'
#' @param html URL linking to a Wikipedia page, or an html-formatted document.
#' @return Plain-text document containing only the main text of the given Wikipedia page.
clean_html <- function(html) {
  # 1. Read data from a URL or from html-formatted text.
  # 2. Extract the nodes carrying the main content
  #    (ignore infoboxes, lists of literature, ...).
  # 3. Collapse the vector of nodes into a single string.
  # 4. Remove html tags.
  # 5. Normalize the resulting plain text:
  #    - remove annotations ([123])
  #    - remove whitespace after newlines
  #    - remove whitespace before punctuation
  #    - replace multiple newlines with a single newline
  #    - collapse runs of spaces and trim a leading space
  #
  # The pipeline result is the function value (returned visibly, unlike the
  # previous `result <- ...` form, whose value was returned invisibly).
  xml2::read_html(html) %>%
    rvest::html_nodes(css = "h3:nth-child(13) , h4 , p+ h3 , p") %>%
    stringi::stri_flatten(collapse = " ") %>%
    textclean::replace_html() %>%
    # Drop reference annotations such as "[123]"; non-greedy so unrelated
    # bracketed spans are not merged together.
    gsub("\\[.+?\\]", "", .) %>%
    gsub("\n ", "\n", .) %>%
    gsub(" *([.!?:,'’])", "\\1", .) %>%
    gsub("\n *\n+", "\n", .) %>%
    gsub("  +", " ", .) %>%
    # Anchored at the start: strip only a *leading* space. The unanchored
    # `sub(" ", "", .)` removed the first space anywhere in the text.
    sub("^ ", "", .)
}