#!/usr/bin/env Rscript

# Author: Lucas

#' Clean a html formatted wikipedia page.
#' Nodes of interest from the DOM are extracted and then cleaned from all html
#' tags and annotations.
#'
#' @param html URL linking to a Wikipedia page, or an html-formatted document.
#' @return Plain-text document containing only the main text of the given Wikipedia page.
clean_html <- function(html) {
  # 1. Read data from a URL or from html-formatted text.
  # 2. Extract the nodes carrying the main content
  #    (ignore infoboxes, lists of literature, ...).
  # 3. Collapse the vector of nodes into a single string.
  # 4. Remove html tags.
  # 5. Normalize the resulting plain text:
  #    - remove annotations ([123])
  #    - remove whitespace after newlines
  #    - remove whitespace before punctuation
  #    - replace multiple newlines with a single newline
  #    - collapse runs of spaces and trim a leading space
  #
  # The pipeline result is the function value (returned visibly, unlike the
  # previous `result <- ...` form, whose value was returned invisibly).
  xml2::read_html(html) %>%
    rvest::html_nodes(css = "h3:nth-child(13) , h4 , p+ h3 , p") %>%
    stringi::stri_flatten(collapse = " ") %>%
    textclean::replace_html() %>%
    # Drop reference annotations such as "[123]"; non-greedy so unrelated
    # bracketed spans are not merged together.
    gsub("\\[.+?\\]", "", .) %>%
    gsub("\n ", "\n", .) %>%
    gsub(" *([.!?:,'’])", "\\1", .) %>%
    gsub("\n *\n+", "\n", .) %>%
    gsub("  +", " ", .) %>%
    # Anchored at the start: strip only a *leading* space. The unanchored
    # `sub(" ", "", .)` removed the first space anywhere in the text.
    sub("^ ", "", .)
}