Newer
Older
#!/usr/bin/env Rscript
# Author: Lucas
#' Nodes of interest are extracted from the DOM and then stripped of all HTML
#' tags and annotations.
#' @param html URL linking to a Wikipedia web page, or an HTML-formatted document.
#' @return Plain-text document containing only the main text of the given Wikipedia page.
# 1. read data from url or html-formatted text
# 2. extract nodes containing main information (ignore infoboxes, list of literature, ...)
# 3. collapse vectors into a single one
# 4. remove html tags
# 5. - remove annotations ([123]) and trim whitespace
# - remove whitespace after newline
# - remove whitespace before punctuation
# - replace multiple newlines with single newline
result <- xml2::read_html(html) %>%
rvest::html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
stringi::stri_flatten(collapse = " ") %>%
textclean::replace_html() %>%
gsub("\n ", "\n", .) %>%
gsub(" *([.!?:,'’])", "\\1", .) %>%
gsub("\n *\n+", "\n", .) %>%