Newer
Older
#!/usr/bin/env Rscript
# Author: Lucas
library(rvest)
library(stringi)
library(textclean)
#' Nodes of interest are extracted from the DOM and then stripped of all HTML
#' tags and annotations.
#' @param html URL linking to a Wikipedia webpage, or an HTML-formatted document.
#' @return Plaintext document containing only the main text of the given Wikipedia page.
cleanHtml <- function(html) {
# 1. read data from a URL or HTML-formatted text
# 2. extract nodes containing main information (ignore infoboxes, list of literature, ...)
# 3. collapse the vector of nodes into a single string
# 4. remove HTML tags
# 5. - remove annotations ([123]) and collapse repeated spaces
#    - remove whitespace after newline
#    - remove whitespace before punctuation
#    - replace multiple newlines with a single newline
result <- read_html(html) %>%
html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
stri_flatten(collapse = " ") %>%
replace_html() %>%
gsub("\\[\\d*\\]", "", .) %>%
gsub(" +", " ", .) %>%
gsub("\n ", "\n", .) %>%
gsub(" *([.!?:,'’])", "\\1", .) %>%
gsub("\n *\n+", "\n", .) %>%
sub(" ", "", .)