diff --git a/processing/wikiproc/R/clean_html.R b/processing/wikiproc/R/clean_html.R index d0a57c237ce4edef7b3ad9bb359b348e4994b4ce..bf0006c1c74a6a0b4bbe196ba6c9d821ae15b824 100644 --- a/processing/wikiproc/R/clean_html.R +++ b/processing/wikiproc/R/clean_html.R @@ -23,10 +23,10 @@ clean_html <- function(html) { rvest::html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>% stringi::stri_flatten(collapse = " ") %>% textclean::replace_html() %>% - gsub("\\[\\d*\\]", "", .) %>% - gsub(" +", " ", .) %>% + gsub("\\[.+?\\]", "", .) %>% gsub("\n ", "\n", .) %>% gsub(" *([.!?:,'’])", "\\1", .) %>% gsub("\n *\n+", "\n", .) %>% + gsub(" +", " ", .) %>% sub(" ", "", .) } diff --git a/processing/wikiproc/R/get_data.R b/processing/wikiproc/R/get_data.R index abcf0094a43ee04b8385e6e978258170777b0f7b..1050189120a85492d34b8dda19d6ff414db55b6f 100644 --- a/processing/wikiproc/R/get_data.R +++ b/processing/wikiproc/R/get_data.R @@ -37,7 +37,7 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") { cat("Downloading list from wikipedia... ") - page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists") + page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists") cat("Done.\n") @@ -46,8 +46,8 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") { cat("Processing data:\n") physicists <- page %>% - html_nodes(".mw-parser-output li a") %>% - html_attr("href") + rvest::html_nodes(".mw-parser-output li a") %>% + rvest::html_attr("href") # Clean the list @@ -84,9 +84,9 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") { # Check if the article is a redirect page if (grepl(".redirectText", article$parse$text$`*`)) { # Get the real article name - pname <- read_html(article$parse$text$`*`) %>% - html_nodes(".redirectText a") %>% - html_attr("href") + pname <- xml2::read_html(article$parse$text$`*`) %>% + rvest::html_nodes(".redirectText a") %>% + rvest::html_attr("href") panme <- gsub("_", " ", pname) diff --git a/processing/wikiproc/R/nlp_annotate.R b/processing/wikiproc/R/nlp_annotate.R index c83921ec93dcc21b20cbebec03c1b458a2bd8609..4059ed6085d329a0fdf5dd7f77e0b6e8ff8297bf 100644 --- a/processing/wikiproc/R/nlp_annotate.R +++ b/processing/wikiproc/R/nlp_annotate.R @@ -1,5 +1,5 @@ #' Initialize the nlp backend -#' +#' #' A wrapper used to set the python environment and call cnlp_init #' #' @param type Type of python env to use, either "conda" or "python" @@ -34,14 +34,14 @@ init_nlp <- function(type, value) { #' @return Annotation object for use with cleanNLP methods #' @export create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") { - + # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces # To prevent this we stip 'em again - + filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE) - + # Check if there is a cached version of the annotations for this article in this specific revision - + if(use.cache & file.exists(filename)) { res <- tryCatch({ data <- readRDS(filename) @@ -53,9 +53,9 @@ create_annotations <- function(text, article.id, article.rev.id, use.cache = TRU } annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE) - + # Write cache if desired - + if(write.cache) { if (!dir.exists("data")) { dir.create("data") @@ -71,4 +71,4 @@ create_annotations <- function(text, article.id, article.rev.id, use.cache = TRU # But then again, it suggests snake case for variables... return(annotation) -} \ No newline at end of file +} diff --git a/processing/wikiproc/R/utils.R b/processing/wikiproc/R/utils.R index baffaa9e2da4f03d74159e10cbffb294be4451db..f4b088cbf20d54aceaa01103229508a1775ca36b 100644 --- a/processing/wikiproc/R/utils.R +++ b/processing/wikiproc/R/utils.R @@ -1,43 +1,42 @@ ### Utility functions used internally - #' Extract the inforbox contents from wikipedia articles #' #' @param article Character vector containing the contents of an wikipedia #' article as html #' #' @return Data frame holding the contents of the table -#' +#' #' @examples #' \dontrun{ #' articles <- get_data() -#' +#' #' infobox <- get_infobox(articles$Text[54]) -#' +#' #' infobox[3:4,] #' } get_infobox <- function(article) { # Read page as html page <- xml2::read_html(article) - + # Extracting text from the html will erase all <br> tags, # this will replace them with line breaks - + xml2::xml_find_all(page, ".//br") %>% xml2::xml_add_sibling("p", "\n") - + xml2::xml_find_all(page, ".//br") %>% xml2::xml_remove(.) # Get the info box # Will throw an error if there isnt any, so that should be checked beforehand - + table <- page %>% rvest::html_nodes("table.vcard") %>% rvest::html_table(fill = TRUE) %>% .[[1]] - + colnames(table) <- c("Desc", "Content") - + return(table) } diff --git a/processing/wikiproc/tests/testthat/article-4-raw.html b/processing/wikiproc/tests/testthat/article-4-raw.html index 6a27f0570b2c6be7b2603b84b81f24cd590f3cb9..ee9ad2b67c414f265c1e0f75420d1a444a811731 100644 --- a/processing/wikiproc/tests/testthat/article-4-raw.html +++ b/processing/wikiproc/tests/testthat/article-4-raw.html @@ -70,7 +70,7 @@ <h2><span class="mw-headline" id="Scientific_efforts_and_peer_recognition">Scientific efforts and peer recognition</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Hasan_Abdullayev&action=edit&section=4" title="Edit section: Scientific efforts and peer recognition">edit</a><span class="mw-editsection-bracket">]</span></span></h2> <p>Academician Hasan Abdullayev is the author of 28 monographs, several scientific textbooks, approximately six hundred scientific journal articles. He holds 585 patents from USSR (including 171 secret and 65 top secret patents for technologies with military applications),<sup id="cite_ref-19" class="reference"><a href="#cite_note-19">[19]</a></sup> and 35 foreign patents from France, Germany, Great Britain, Japan, Sweden, Italy, Bulgaria, India, and U.S. (United States Patent 3,472,652).<sup id="cite_ref-20" class="reference"><a href="#cite_note-20">[20]</a></sup> </p><p>Academician Abdullayev received highest praise from his colleagues, including <a href="/wiki/Nobel_Prize" title="Nobel Prize">Nobel Prize</a> winner academician <a href="/wiki/Zhores_Alferov" title="Zhores Alferov">Zhores Alferov</a>, <a href="/wiki/Nobel_Prize" title="Nobel Prize">Nobel Prize</a> winner academician <a href="/wiki/Alexander_Prokhorov" title="Alexander Prokhorov">Alexander Prokhorov</a>, <a href="/wiki/Kurchatov_Institute" title="Kurchatov Institute">Kurchatov Institute</a> President and Director <a href="/wiki/Evgeny_Velikhov" title="Evgeny Velikhov">Evgeny Velikhov</a>, academician <a href="/w/index.php?title=Bentsion_Vul&action=edit&redlink=1" class="new" title="Bentsion Vul (page does not exist)">Bentsion Vul</a>, academician Vladimir Tuchkevich,<sup id="cite_ref-21" class="reference"><a href="#cite_note-21">[21]</a></sup> academician <a href="/wiki/Sergey_Kapitsa" title="Sergey Kapitsa">Sergey Kapitsa</a>, academician <a href="/wiki/Roald_Sagdeev" title="Roald Sagdeev">Roald Sagdeev</a>, Nobel Prize winner professor <a href="/wiki/Rudolf_Ludwig_Mossbauer" class="mw-redirect" title="Rudolf Ludwig Mossbauer">Rudolf Ludwig Mossbauer</a>, academician <a href="/wiki/Nikolay_Bogolyubov" title="Nikolay Bogolyubov">Nikolay Bogolyubov</a>,<sup id="cite_ref-GSE_22-0" class="reference"><a href="#cite_note-GSE-22">[22]</a></sup> Soviet Academy of Sciences Presidents academician Alexander Nesmeyanov, academician <a href="/wiki/Anatoly_Petrovich_Alexandrov" class="mw-redirect" title="Anatoly Petrovich Alexandrov">Anatoly Petrovich Alexandrov</a>, academician <a href="/wiki/Mstislav_Keldysh" title="Mstislav Keldysh">Mstislav Keldysh</a><sup id="cite_ref-23" class="reference"><a href="#cite_note-23">[23]</a></sup> and other Soviet and foreign scientists. -</p><p>According to a 2008 article, "Academician Abdullayev was called the Father of Physics in Azerbaijan and one of the Founders of the School of Semiconductor Research in the Soviet Union by such authoritative scientists as academicians Zh.Alferov, Yu.Gulyaev, L.Kurbatov, V.Isakov, Professor D.Nasledov, and others. In fact, the <a href="/wiki/Great_Soviet_Encyclopedia" title="Great Soviet Encyclopedia">Great Soviet Encyclopedia</a>, the most authoritative Soviet encyclopedia - the Soviet equivalent of the Encyclopædia Britannica in the West, listed the names of scientists, making the greatest contributions to the development of semiconductor electronics and microelectronics in this order: A.F.Ioffe (who was Abdullayev's mentor during his postdoctoral studies in Leningrad), N.P.Sazhin, Ya.I.Frenkel, B.M.Vul, V.M.Tuchkevich, H.B.Abdullayev, Zh.I.Alferov, L.V.Keldish, and others (Third Edition, 1970, page 351).<sup id="cite_ref-24" class="reference"><a href="#cite_note-24">[24]</a></sup> Thus, already in 1970, this encyclopedia put academician Abdullayev as the sixth most influential scientist in semi-conductor research, higher than such giants as Academicians Alferov and Keldish!"<sup id="cite_ref-GSE_22-1" class="reference"><a href="#cite_note-GSE-22">[22]</a></sup> +</p><p>According to a 2008 article, "Academician Abdullayev was called the Father of Physics in Azerbaijan and one of the Founders of the School of Semiconductor Research in the Soviet Union by such authoritative scientists as academicians Zh.Alferov, Yu.Gulyaev, L.Kurbatov, V.Isakov, Professor D.Nasledov, and others. In fact, the <a href="/wiki/Great_Soviet_Encyclopedia" title="Great Soviet Encyclopedia">Great Soviet Encyclopedia</a>, the most authoritative Soviet encyclopedia - the Soviet equivalent of the Encyclopædia Britannica in the West, listed the names of scientists, making the greatest contributions to the development of semiconductor electronics and microelectronics in this order: A.F.Ioffe (who was Abdullayev's mentor [ edit ] during his postdoctoral studies in Leningrad), N.P.Sazhin, Ya.I.Frenkel, B.M.Vul, V.M.Tuchkevich, H.B.Abdullayev, Zh.I.Alferov, L.V.Keldish, and others (Third Edition, 1970, page 351).<sup id="cite_ref-24" class="reference"><a href="#cite_note-24">[24]</a></sup> Thus, already in 1970, this encyclopedia put academician Abdullayev as the sixth most influential scientist in semi-conductor research, higher than such giants as Academicians Alferov and Keldish!"<sup id="cite_ref-GSE_22-1" class="reference"><a href="#cite_note-GSE-22">[22]</a></sup> </p><p>Academician Abdullayev was recognized as the top expert on the chemical element <a href="/wiki/Selenium" title="Selenium">selenium</a>, and thus entrusted authoring the article on selenium in the third (final) edition of the top scientific reference publication - the <a href="/wiki/Great_Soviet_Encyclopedia" title="Great Soviet Encyclopedia">Great Soviet Encyclopedia</a>.<sup id="cite_ref-25" class="reference"><a href="#cite_note-25">[25]</a></sup> </p> <h2><span class="mw-headline" id="Publications">Publications</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Hasan_Abdullayev&action=edit&section=5" title="Edit section: Publications">edit</a><span class="mw-editsection-bracket">]</span></span></h2> @@ -175,7 +175,7 @@ <li><span class="nowrap"> <a href="/wiki/WorldCat_Identities" class="mw-redirect" title="WorldCat Identities">WorldCat Identities</a> (via VIAF): <a rel="nofollow" class="external text" href="https://www.worldcat.org/identities/containsVIAFID/109565605">109565605</a></span></li></ul> </div></td></tr></tbody></table></div> -<!-- +<!-- NewPP limit report Parsed by mw1244 Cached time: 20190111175009 diff --git a/processing/wikiproc/tests/testthat/test-clean_html.R b/processing/wikiproc/tests/testthat/test-clean_html.R index 042e273c5e6859075004997dd82284d569490e2c..467230cbc0a48f4a9f1c2a0ed0f79df05e62c5a3 100644 --- a/processing/wikiproc/tests/testthat/test-clean_html.R +++ b/processing/wikiproc/tests/testthat/test-clean_html.R @@ -4,8 +4,9 @@ test_that("html cleansing works", { filename_raw <- "article-4-raw.html" filename_cleansed <- "article-4-cleansed.txt" html <- readChar(filename_raw, file.info(filename_raw)$size) - expected <- readChar(filename_cleansed, file.info(filename_cleansed)$size) - actual <- clean_html(html) + + expected <- gsub("\\s", "", readChar(filename_cleansed, file.info(filename_cleansed)$size)) + actual <- gsub("\\s", "", clean_html(html)) expect_equal(expected, actual) })