-
Lucas Schons authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
clean_html.R 1.15 KiB
#!/usr/bin/env Rscript
# Author: Lucas
#' Clean an HTML-formatted Wikipedia page.
#'
#' Nodes of interest from the DOM are extracted and then cleaned from all
#' HTML tags and annotations.
#'
#' @export
#' @param html URL linking to a Wikipedia webpage, or an HTML-formatted
#'   document (anything `xml2::read_html()` accepts).
#' @return Plaintext character string containing only the main text of the
#'   given Wikipedia page.
clean_html <- function(html) {
  # 1. Read data from URL or HTML-formatted text.
  # 2. Extract nodes carrying the main content (ignore infoboxes,
  #    list of literature, ...).
  # 3. Collapse the vector of nodes into a single string.
  # 4. Remove HTML tags.
  # 5. - Remove annotations ("[123]") and trim whitespace:
  #    - remove whitespace after newlines,
  #    - remove whitespace before punctuation,
  #    - replace multiple newlines with a single newline.
  # NOTE: no assignment on the last expression — `result <- ...` would make
  # the return value invisible to callers.
  xml2::read_html(html) %>%
    rvest::html_nodes(css = "h3:nth-child(13) , h4 , p+ h3 , p") %>%
    stringi::stri_flatten(collapse = " ") %>%
    textclean::replace_html() %>%
    gsub("\\[.+?\\]", "", .) %>%
    # "\n +" strips ALL whitespace after a newline; the previous "\n "
    # pattern removed only one space, leaving residue after wider indents.
    gsub("\n +", "\n", .) %>%
    gsub(" *([.!?:,'’])", "\\1", .) %>%
    gsub("\n *\n+", "\n", .) %>%
    gsub(" +", " ", .) %>%
    # Anchored trim of leading whitespace only; an unanchored sub(" ", ...)
    # would delete the first space found anywhere in the text.
    sub("^ +", "", .)
}