diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
index 23ace1cca47f2bd6bfdf5be4aea3a32779c38baa..cec814d403dbe165ac9ea64126faec2e3c772a0e 100644
--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -7,9 +7,5 @@ export(get_birthplace)
 export(get_data)
 export(get_no_of_spouses)
 export(init_nlp)
-import(rvest)
 importFrom(data.table,"%like%")
-importFrom(xml2,read_html)
-importFrom(xml2,xml_add_sibling)
-importFrom(xml2,xml_find_all)
-importFrom(xml2,xml_remove)
+importFrom(magrittr,"%>%")
diff --git a/processing/wikiproc/R/create_annotations.R b/processing/wikiproc/R/create_annotations.R
deleted file mode 100644
index bbe1642aadbed0be45eb7fa391bef6c719c4bad3..0000000000000000000000000000000000000000
--- a/processing/wikiproc/R/create_annotations.R
+++ /dev/null
@@ -1,42 +0,0 @@
-library(cleanNLP)
-
-#' @export
-create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
-
-  # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces
-  # To prevent this we stip 'em again
-
-  filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
-
-  # Check if there is a cached version of the annotations for this article in this specific revision
-
-  if(use.cache & file.exists(filename)) {
-    res <- tryCatch({
-      data <- readRDS(filename)
-      data
-    }, error = function (e) {
-      cat("Cached data seems to be corrupted, redoing annotation.\n")
-    })
-    return(res)
-  }
-
-  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
-
-  # Write cache if desired
-
-  if(write.cache) {
-    if (!dir.exists("data")) {
-      dir.create("data")
-    }
-    if (!dir.exists("data/annotations")) {
-      dir.create("data/annotations")
-    }
-    saveRDS(annotation, filename)
-  }
-
-  # Return data
-  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
-  # But then again, it suggests snake case for variables...
-
-  return(annotation)
-}
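The deleted create_annotations.R boiled down to a read-through RDS cache keyed by article id and revision id. For reference, a minimal standalone sketch of that idiom; cached() and compute are illustrative names, not part of wikiproc or of this patch:

# Return the cached RDS if present and readable; otherwise compute,
# optionally persist, and return the fresh result.
cached <- function(cache.file, compute, use.cache = TRUE, write.cache = FALSE) {
  if (use.cache && file.exists(cache.file)) {
    res <- tryCatch(readRDS(cache.file),
                    error = function(e) NULL)  # corrupted cache: fall through and recompute
    if (!is.null(res)) return(res)
  }
  result <- compute()
  if (write.cache) {
    dir.create(dirname(cache.file), recursive = TRUE, showWarnings = FALSE)
    saveRDS(result, cache.file)
  }
  result
}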
diff --git a/processing/wikiproc/R/get_birthdate.R b/processing/wikiproc/R/get_birthdate.R
index 3288e2da7904d6d7cf6a8c52eff3444a5f735e18..09f243e43ac8b035282127d22b130adc66b4c988 100644
--- a/processing/wikiproc/R/get_birthdate.R
+++ b/processing/wikiproc/R/get_birthdate.R
@@ -2,11 +2,6 @@
 
 # Author: Lukas
 
-library(rvest)
-library(stringr)
-library(data.table)
-library(xml2)
-
 #' Extract birthdate from infobox
 #' Will try to get infobox as table and extract birthdate
 #' from 'Born'-entry
@@ -21,7 +16,7 @@ get_birthdate <- function(article) {
   if(grepl("vcard", article)) {
 
     # Check if there is an infobox
-    infoBox <- getInfoBox(article)
+    infoBox <- get_infobox(article)
 
     # Get the Born field
     birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
@@ -53,33 +48,6 @@ get_birthdate <- function(article) {
   }
 }
 
-### Converts info box to table
-getInfoBox <- function(article) {
-  # Read page as html
-  page <- xml2::read_html(article)
-
-  # Extracting text from the html will erase all <br> tags,
-  # This will replace them with line breaks
-
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_add_sibling("p", "\n")
-
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_remove()
-
-  # Get the info box
-  # Will throw an error if there isnt any, so that should be checked beforehand
-
-  table <- page %>%
-    rvest::html_nodes("table.vcard") %>%
-    rvest::html_table(fill = TRUE) %>%
-    .[[1]]
-
-  colnames(table) <- c("Desc", "Content")
-
-  return(table)
-}
-
 #' Get Introduction Text from Wikipedia page that contains birthdate
 #'
 #' @param article article in HTML-format
diff --git a/processing/wikiproc/R/get_birthplace.R b/processing/wikiproc/R/get_birthplace.R
index 61839e07faef37f592b51e62f8605071d975dee3..e0310bdad7882b1f3a404115eb1637fb0d9cd52b 100644
--- a/processing/wikiproc/R/get_birthplace.R
+++ b/processing/wikiproc/R/get_birthplace.R
@@ -2,12 +2,6 @@
 
 # Author: Lukas
 
-## librarys
-
-library(rvest)
-library(stringr)
-library(data.table)
-
 #' This script extracts Birthplace from physicist texts
 #' Try to get the infobox and extract the birthplace
 #' If there is no infobox, 0 will be returned as
@@ -24,7 +18,7 @@
   }
 
   # Use infobox to get Birthplace
-  infoBox <- getInfoBox(article)
+  infoBox <- get_infobox(article)
 
   # Get 'Born' field
   birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
@@ -36,30 +30,3 @@
 
   # return birthplace
   return(birthplace)
 }
-
-### Converts info box to table
-getInfoBox <- function(article) {
-  # Read page as html
-  page <- xml2::read_html(article)
-
-  # Extracting text from the html will erase all <br> tags,
-  # this will replace them with line breaks
-
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_add_sibling("p", "\n")
-
-  xml2::xml_find_all(page, ".//br") %>%
-    xml2::xml_remove()
-
-  # Get the info box
-  # Will throw an error if there isnt any, so that should be checked beforehand
-
-  table <- page %>%
-    rvest::html_nodes("table.vcard") %>%
-    rvest::html_table(fill = TRUE) %>%
-    .[[1]]
-
-  colnames(table) <- c("Desc", "Content")
-
-  return(table)
-}
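Both birth-data extractors now delegate to the shared get_infobox() in utils.R and pick out the relevant row with data.table's %like% (a grepl-based operator). A minimal sketch of that lookup as it runs inside the package; html stands in for an article's HTML string:

library(data.table)  # provides %like%

infobox <- get_infobox(html)   # two-column data frame: Desc, Content
born <- infobox[infobox$Desc %like% "Born", ]$Content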
diff --git a/processing/wikiproc/R/get_data.R b/processing/wikiproc/R/get_data.R
index d5dc1411e7b139563145a729371efb746d66b622..abcf0094a43ee04b8385e6e978258170777b0f7b 100644
--- a/processing/wikiproc/R/get_data.R
+++ b/processing/wikiproc/R/get_data.R
@@ -10,7 +10,6 @@
 
 #' Retrieve wikipedia articles about physicists
 #'
-#' @export
 #' @param use.cache Use cached data if it exists over downloading new data
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
@@ -38,7 +37,7 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 
   cat("Downloading list from wikipedia... ")
 
-  page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists")
+  page <- read_html("https://en.wikipedia.org/wiki/List_of_physicists")
 
   cat("Done.\n")
 
@@ -47,8 +46,8 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
   cat("Processing data:\n")
 
   physicists <- page %>%
-    rvest::html_nodes(".mw-parser-output li a") %>%
-    rvest::html_attr("href")
+    html_nodes(".mw-parser-output li a") %>%
+    html_attr("href")
 
   # Clean the list
 
@@ -85,9 +84,9 @@ get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
     # Check if the article is a redirect page
     if (grepl(".redirectText", article$parse$text$`*`)) {
       # Get the real article name
-      pname <- xml2::read_html(article$parse$text$`*`) %>%
-        rvest::html_nodes(".redirectText a") %>%
-        rvest::html_attr("href")
+      pname <- read_html(article$parse$text$`*`) %>%
+        html_nodes(".redirectText a") %>%
+        html_attr("href")
 
       pname <- gsub("_", " ", pname)
 
diff --git a/processing/wikiproc/R/import_packages.R b/processing/wikiproc/R/import_packages.R
index 03025a3e2cbb9f89380516124e74c48f8b4b95c2..06a0402535bb8b1c9b12d458706eaaf3fba58584 100644
--- a/processing/wikiproc/R/import_packages.R
+++ b/processing/wikiproc/R/import_packages.R
@@ -5,7 +5,6 @@
 ### if some functions are used frequently you may just import them
 ### with something like @importFrom reshape2 melt cast
 
-#' @import rvest
-#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
 #' @importFrom data.table %like%
+#' @importFrom magrittr %>%
 NULL
\ No newline at end of file
diff --git a/processing/wikiproc/R/utils.R b/processing/wikiproc/R/utils.R
index a518f16f9fa62497250c8d64f87cf9dc2bb012ab..6a3bc32063ab1f2e2d40bb2b9bcb89fa0bd8ce0e 100644
--- a/processing/wikiproc/R/utils.R
+++ b/processing/wikiproc/R/utils.R
@@ -18,23 +18,23 @@
 #' }
 get_infobox <- function(article) {
   # Read page as html
-  page <- read_html(article)
+  page <- xml2::read_html(article)
 
   # Extracting text from the html will erase all <br> tags,
   # this will replace them with line breaks
 
-  xml_find_all(page, ".//br") %>%
-    xml_add_sibling("p", "\n")
+  xml2::xml_find_all(page, ".//br") %>%
+    xml2::xml_add_sibling("p", "\n")
 
-  xml_find_all(page, ".//br") %>%
-    xml_remove()
+  xml2::xml_find_all(page, ".//br") %>%
+    xml2::xml_remove()
 
   # Get the info box
   # Will throw an error if there isnt any, so that should be checked beforehand
 
   table <- page %>%
-    html_nodes("table.vcard") %>%
-    html_table(fill = TRUE) %>%
+    rvest::html_nodes("table.vcard") %>%
+    rvest::html_table(fill = TRUE) %>%
     .[[1]]
 
   colnames(table) <- c("Desc", "Content")
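The <br> handling that get_infobox() keeps (and that the two deleted copies above duplicated) exists because text extraction silently drops <br> tags, which would fuse the date and place inside a Born cell. A self-contained illustration of the trick, using a toy snippet rather than a real article:

library(magrittr)

page <- xml2::read_html("<td>14 March 1879<br/>Ulm, Germany</td>")

# Add a <p> node containing a newline next to every <br>, then remove the
# <br> itself, so the extracted text keeps a break where the tag used to be.
xml2::xml_find_all(page, ".//br") %>%
  xml2::xml_add_sibling("p", "\n")
xml2::xml_find_all(page, ".//br") %>%
  xml2::xml_remove()

rvest::html_text(rvest::html_node(page, "td"))
# [1] "14 March 1879\nUlm, Germany"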
diff --git a/rasa/example/Makefile b/rasa/Makefile
similarity index 100%
rename from rasa/example/Makefile
rename to rasa/Makefile
diff --git a/rasa/example/README.md b/rasa/README.md
similarity index 100%
rename from rasa/example/README.md
rename to rasa/README.md
diff --git a/rasa/example/actions.py b/rasa/actions.py
similarity index 100%
rename from rasa/example/actions.py
rename to rasa/actions.py
diff --git a/rasa/example/bot.py b/rasa/bot.py
similarity index 100%
rename from rasa/example/bot.py
rename to rasa/bot.py
diff --git a/rasa/example/data.tsv b/rasa/data.tsv
similarity index 100%
rename from rasa/example/data.tsv
rename to rasa/data.tsv
diff --git a/rasa/example/domain.yml b/rasa/domain.yml
similarity index 100%
rename from rasa/example/domain.yml
rename to rasa/domain.yml
diff --git a/rasa/example/endpoints.yml b/rasa/endpoints.yml
similarity index 100%
rename from rasa/example/endpoints.yml
rename to rasa/endpoints.yml
diff --git a/rasa/example/nlu.md b/rasa/nlu.md
similarity index 100%
rename from rasa/example/nlu.md
rename to rasa/nlu.md
diff --git a/rasa/example/nlu_config.yml b/rasa/nlu_config.yml
similarity index 100%
rename from rasa/example/nlu_config.yml
rename to rasa/nlu_config.yml
diff --git a/rasa/example/nlu_config.yml.spacy b/rasa/nlu_config.yml.spacy
similarity index 100%
rename from rasa/example/nlu_config.yml.spacy
rename to rasa/nlu_config.yml.spacy
diff --git a/rasa/example/policy.py b/rasa/policy.py
similarity index 100%
rename from rasa/example/policy.py
rename to rasa/policy.py
diff --git a/rasa/example/stories.md b/rasa/stories.md
similarity index 100%
rename from rasa/example/stories.md
rename to rasa/stories.md