Skip to content
Snippets Groups Projects
Commit 71c6fe2b authored by David Fuhry's avatar David Fuhry :fist:
Browse files

Merge branch '32-add-unit-tests-for-cleanhtml-r' into 'master'

Resolve "Add unit tests for clean_html.R"

Closes #32

See merge request !34
parents 964ec2c8 ed4cb9d8
No related branches found
No related tags found
1 merge request!34Resolve "Add unit tests for clean_html.R"
......@@ -23,10 +23,10 @@ clean_html <- function(html) {
rvest::html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
stringi::stri_flatten(collapse = " ") %>%
textclean::replace_html() %>%
gsub("\\[\\d*\\]", "", .) %>%
gsub(" +", " ", .) %>%
gsub("\\[.+?\\]", "", .) %>%
gsub("\n ", "\n", .) %>%
gsub(" *([.!?:,'’])", "\\1", .) %>%
gsub("\n *\n+", "\n", .) %>%
gsub(" +", " ", .) %>%
sub(" ", "", .)
}
......@@ -37,7 +37,7 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
cat("Downloading list from wikipedia... ")
page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists")
cat("Done.\n")
......@@ -46,8 +46,8 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
cat("Processing data:\n")
physicists <- page %>%
html_nodes(".mw-parser-output li a") %>%
html_attr("href")
rvest::html_nodes(".mw-parser-output li a") %>%
rvest::html_attr("href")
# Clean the list
......@@ -84,9 +84,9 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
# Check if the article is a redirect page
if (grepl(".redirectText", article$parse$text$`*`)) {
# Get the real article name
pname <- read_html(article$parse$text$`*`) %>%
html_nodes(".redirectText a") %>%
html_attr("href")
pname <- xml2::read_html(article$parse$text$`*`) %>%
rvest::html_nodes(".redirectText a") %>%
rvest::html_attr("href")
panme <- gsub("_", " ", pname)
......
#' Initialize the nlp backend
#'
#'
#' A wrapper used to set the python environment and call cnlp_init
#'
#' @param type Type of python env to use, either "conda" or "python"
......@@ -34,14 +34,14 @@ init_nlp <- function(type, value) {
#' @return Annotation object for use with cleanNLP methods
#' @export
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
# Generate filename, for some reason there paste0 will pad the article id with leading whitespaces
# To prevent this we stip 'em again
filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
# Check if there is a cached version of the annotations for this article in this specific revision
if(use.cache & file.exists(filename)) {
res <- tryCatch({
data <- readRDS(filename)
......@@ -53,9 +53,9 @@ create_annotations <- function(text, article.id, article.rev.id, use.cache = TRU
}
annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
# Write cache if desired
if(write.cache) {
if (!dir.exists("data")) {
dir.create("data")
......@@ -71,4 +71,4 @@ create_annotations <- function(text, article.id, article.rev.id, use.cache = TRU
# But then again, it suggests snake case for variables...
return(annotation)
}
\ No newline at end of file
}
### Utility functions used internally
#' Extract the inforbox contents from wikipedia articles
#'
#' @param article Character vector containing the contents of an wikipedia
#' article as html
#'
#' @return Data frame holding the contents of the table
#'
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#'
#' infobox[3:4,]
#' }
get_infobox <- function(article) {
# Read page as html
page <- xml2::read_html(article)
# Extracting text from the html will erase all <br> tags,
# this will replace them with line breaks
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_add_sibling("p", "\n")
xml2::xml_find_all(page, ".//br") %>%
xml2::xml_remove(.)
# Get the info box
# Will throw an error if there isnt any, so that should be checked beforehand
table <- page %>%
rvest::html_nodes("table.vcard") %>%
rvest::html_table(fill = TRUE) %>%
.[[1]]
colnames(table) <- c("Desc", "Content")
return(table)
}
......@@ -70,7 +70,7 @@
<h2><span class="mw-headline" id="Scientific_efforts_and_peer_recognition">Scientific efforts and peer recognition</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Hasan_Abdullayev&amp;action=edit&amp;section=4" title="Edit section: Scientific efforts and peer recognition">edit</a><span class="mw-editsection-bracket">]</span></span></h2>
<p>Academician Hasan Abdullayev is the author of 28 monographs, several scientific textbooks, approximately six hundred scientific journal articles. He holds 585 patents from USSR (including 171 secret and 65 top secret patents for technologies with military applications),<sup id="cite_ref-19" class="reference"><a href="#cite_note-19">&#91;19&#93;</a></sup> and 35 foreign patents from France, Germany, Great Britain, Japan, Sweden, Italy, Bulgaria, India, and U.S. (United States Patent 3,472,652).<sup id="cite_ref-20" class="reference"><a href="#cite_note-20">&#91;20&#93;</a></sup>
</p><p>Academician Abdullayev received highest praise from his colleagues, including <a href="/wiki/Nobel_Prize" title="Nobel Prize">Nobel Prize</a> winner academician <a href="/wiki/Zhores_Alferov" title="Zhores Alferov">Zhores Alferov</a>, <a href="/wiki/Nobel_Prize" title="Nobel Prize">Nobel Prize</a> winner academician <a href="/wiki/Alexander_Prokhorov" title="Alexander Prokhorov">Alexander Prokhorov</a>, <a href="/wiki/Kurchatov_Institute" title="Kurchatov Institute">Kurchatov Institute</a> President and Director <a href="/wiki/Evgeny_Velikhov" title="Evgeny Velikhov">Evgeny Velikhov</a>, academician <a href="/w/index.php?title=Bentsion_Vul&amp;action=edit&amp;redlink=1" class="new" title="Bentsion Vul (page does not exist)">Bentsion Vul</a>, academician Vladimir Tuchkevich,<sup id="cite_ref-21" class="reference"><a href="#cite_note-21">&#91;21&#93;</a></sup> academician <a href="/wiki/Sergey_Kapitsa" title="Sergey Kapitsa">Sergey Kapitsa</a>, academician <a href="/wiki/Roald_Sagdeev" title="Roald Sagdeev">Roald Sagdeev</a>, Nobel Prize winner professor <a href="/wiki/Rudolf_Ludwig_Mossbauer" class="mw-redirect" title="Rudolf Ludwig Mossbauer">Rudolf Ludwig Mossbauer</a>, academician <a href="/wiki/Nikolay_Bogolyubov" title="Nikolay Bogolyubov">Nikolay Bogolyubov</a>,<sup id="cite_ref-GSE_22-0" class="reference"><a href="#cite_note-GSE-22">&#91;22&#93;</a></sup> Soviet Academy of Sciences Presidents academician Alexander Nesmeyanov, academician <a href="/wiki/Anatoly_Petrovich_Alexandrov" class="mw-redirect" title="Anatoly Petrovich Alexandrov">Anatoly Petrovich Alexandrov</a>, academician <a href="/wiki/Mstislav_Keldysh" title="Mstislav Keldysh">Mstislav Keldysh</a><sup id="cite_ref-23" class="reference"><a href="#cite_note-23">&#91;23&#93;</a></sup> and other Soviet and foreign scientists.
</p><p>According to a 2008 article, "Academician Abdullayev was called the Father of Physics in Azerbaijan and one of the Founders of the School of Semiconductor Research in the Soviet Union by such authoritative scientists as academicians Zh.Alferov, Yu.Gulyaev, L.Kurbatov, V.Isakov, Professor D.Nasledov, and others. In fact, the <a href="/wiki/Great_Soviet_Encyclopedia" title="Great Soviet Encyclopedia">Great Soviet Encyclopedia</a>, the most authoritative Soviet encyclopedia - the Soviet equivalent of the Encyclopædia Britannica in the West, listed the names of scientists, making the greatest contributions to the development of semiconductor electronics and microelectronics in this order: A.F.Ioffe (who was Abdullayev's mentor during his postdoctoral studies in Leningrad), N.P.Sazhin, Ya.I.Frenkel, B.M.Vul, V.M.Tuchkevich, H.B.Abdullayev, Zh.I.Alferov, L.V.Keldish, and others (Third Edition, 1970, page 351).<sup id="cite_ref-24" class="reference"><a href="#cite_note-24">&#91;24&#93;</a></sup> Thus, already in 1970, this encyclopedia put academician Abdullayev as the sixth most influential scientist in semi-conductor research, higher than such giants as Academicians Alferov and Keldish!"<sup id="cite_ref-GSE_22-1" class="reference"><a href="#cite_note-GSE-22">&#91;22&#93;</a></sup>
</p><p>According to a 2008 article, "Academician Abdullayev was called the Father of Physics in Azerbaijan and one of the Founders of the School of Semiconductor Research in the Soviet Union by such authoritative scientists as academicians Zh.Alferov, Yu.Gulyaev, L.Kurbatov, V.Isakov, Professor D.Nasledov, and others. In fact, the <a href="/wiki/Great_Soviet_Encyclopedia" title="Great Soviet Encyclopedia">Great Soviet Encyclopedia</a>, the most authoritative Soviet encyclopedia - the Soviet equivalent of the Encyclopædia Britannica in the West, listed the names of scientists, making the greatest contributions to the development of semiconductor electronics and microelectronics in this order: A.F.Ioffe (who was Abdullayev's mentor &#91; edit &#93; during his postdoctoral studies in Leningrad), N.P.Sazhin, Ya.I.Frenkel, B.M.Vul, V.M.Tuchkevich, H.B.Abdullayev, Zh.I.Alferov, L.V.Keldish, and others (Third Edition, 1970, page 351).<sup id="cite_ref-24" class="reference"><a href="#cite_note-24">&#91;24&#93;</a></sup> Thus, already in 1970, this encyclopedia put academician Abdullayev as the sixth most influential scientist in semi-conductor research, higher than such giants as Academicians Alferov and Keldish!"<sup id="cite_ref-GSE_22-1" class="reference"><a href="#cite_note-GSE-22">&#91;22&#93;</a></sup>
</p><p>Academician Abdullayev was recognized as the top expert on the chemical element <a href="/wiki/Selenium" title="Selenium">selenium</a>, and thus entrusted authoring the article on selenium in the third (final) edition of the top scientific reference publication - the <a href="/wiki/Great_Soviet_Encyclopedia" title="Great Soviet Encyclopedia">Great Soviet Encyclopedia</a>.<sup id="cite_ref-25" class="reference"><a href="#cite_note-25">&#91;25&#93;</a></sup>
</p>
<h2><span class="mw-headline" id="Publications">Publications</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Hasan_Abdullayev&amp;action=edit&amp;section=5" title="Edit section: Publications">edit</a><span class="mw-editsection-bracket">]</span></span></h2>
......@@ -175,7 +175,7 @@
<li><span class="nowrap"> <a href="/wiki/WorldCat_Identities" class="mw-redirect" title="WorldCat Identities">WorldCat Identities</a> (via VIAF): <a rel="nofollow" class="external text" href="https://www.worldcat.org/identities/containsVIAFID/109565605">109565605</a></span></li></ul>
</div></td></tr></tbody></table></div>
<!--
<!--
NewPP limit report
Parsed by mw1244
Cached time: 20190111175009
......
......@@ -4,8 +4,9 @@ test_that("html cleansing works", {
filename_raw <- "article-4-raw.html"
filename_cleansed <- "article-4-cleansed.txt"
html <- readChar(filename_raw, file.info(filename_raw)$size)
expected <- readChar(filename_cleansed, file.info(filename_cleansed)$size)
actual <- clean_html(html)
expected <- gsub("\\s", "", readChar(filename_cleansed, file.info(filename_cleansed)$size))
actual <- gsub("\\s", "", clean_html(html))
expect_equal(expected, actual)
})
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment