#!/usr/bin/env Rscript
# Author: Lucas

library(rvest)
library(stringi)
library(textclean)

#' Clean an HTML-formatted Wikipedia page.
#'
#' Nodes of interest are extracted from the DOM and then stripped of all
#' HTML tags and annotations.
#'
#' @param html URL linking to a Wikipedia webpage, or an HTML-formatted
#'   document.
#' @return Plain-text document containing only the main text of the given
#'   Wikipedia page.
cleanHtml <- function(html) {
  # Pipeline:
  # 1. Read data from the URL or HTML-formatted text.
  # 2. Extract the nodes carrying the main content (ignore infoboxes,
  #    lists of literature, ...).
  #    NOTE(review): "h3:nth-child(13)" is tied to one specific page
  #    layout -- confirm it generalises to other articles.
  # 3. Collapse the node vector into a single string.
  # 4. Strip HTML tags.
  # 5. Normalise the plain text:
  #    - remove annotation markers such as "[123]"
  #    - squeeze runs of spaces, drop a space after each newline
  #    - drop spaces before punctuation
  #    - collapse blank lines into a single newline
  #    - trim leading whitespace
  read_html(html) %>%
    html_nodes(css = "h3:nth-child(13) , h4 , p+ h3 , p") %>%
    stri_flatten(collapse = " ") %>%
    replace_html() %>%
    gsub("\\[\\d*\\]", "", .) %>%
    gsub(" +", " ", .) %>%
    gsub("\n ", "\n", .) %>%
    gsub(" *([.!?:,'’])", "\\1", .) %>%
    gsub("\n *\n+", "\n", .) %>%
    # Bug fix: the original `sub(" ", "", .)` deleted the FIRST space
    # found anywhere in the text; anchoring trims only leading whitespace.
    sub("^\\s+", "", .)
}