diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE index 5e2056db2f27b4dfd24924442b85fe72621177c8..23ace1cca47f2bd6bfdf5be4aea3a32779c38baa 100644 --- a/processing/wikiproc/NAMESPACE +++ b/processing/wikiproc/NAMESPACE @@ -1,6 +1,9 @@ # Generated by roxygen2: do not edit by hand +export(clean_html) export(create_annotations) +export(get_birthdate) +export(get_birthplace) export(get_data) export(get_no_of_spouses) export(init_nlp) diff --git a/processing/wikiproc/R/GetNoOfSpouses.R b/processing/wikiproc/R/GetNoOfSpouses.R deleted file mode 100755 index d5882ceff7127d73eb011724dd43791b2466d61c..0000000000000000000000000000000000000000 --- a/processing/wikiproc/R/GetNoOfSpouses.R +++ /dev/null @@ -1,63 +0,0 @@ -### GetNoOfSpouses.R -### This extracts the number of spouses from the infobox -### If no infobox or no information about spouses is found assumes there are none -### Not for use in production, this does not actually get information from text - -# Author: David - -## Librarys - -library(rvest) -library(data.table) -library(xml2) - -### Get number of spouses -#' @export -getNoOfSpouses <- function(article) { - - # If there is no infobox we assume there were no spouses - if(!grepl("vcard", article)) { - return(0) - } - - infoBox <- getInfoBox(article) - - # Get the spouse field - spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content - # Remove everything in parentheses - spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses) - # Split the strings by newlines to get one spouse per line - spouses <- base::strsplit(spouses, "\n") - spouses <- base::unlist(spouses) - if(length(spouses) > 0) { - return(length(spouses)) - } - return(0) -} - -### Converts info box to table -getInfoBox <- function(article) { - # Read page as html - page <- xml2::read_html(article) - - # Extracting text from the html will erase all <br> tags, - # this will replace them with line breaks - - xml2::xml_find_all(page, ".//br") %>% - xml2::xml_add_sibling("p", "\n") - - xml2::xml_find_all(page, ".//br") %>% - xml2::xml_remove() - - # Get the info box - # Will throw an error if there isnt any, so that should be checked beforehand - - table <- page %>% - rvest::html_nodes("table.vcard") %>% - rvest::html_table(fill = TRUE) %>% - .[[1]] - - colnames(table) <- c("Desc", "Content") - - return(table) -} diff --git a/processing/wikiproc/R/CleanHtml.R b/processing/wikiproc/R/clean_html.R similarity index 97% rename from processing/wikiproc/R/CleanHtml.R rename to processing/wikiproc/R/clean_html.R index e541a1d86aa0aac0e65e806ca0d5a4e5ff1c98cc..d0a57c237ce4edef7b3ad9bb359b348e4994b4ce 100644 --- a/processing/wikiproc/R/CleanHtml.R +++ b/processing/wikiproc/R/clean_html.R @@ -9,7 +9,7 @@ #' @export #' @param html Url linking to a wikipedia webpage or a html formatted document. #' @return Plaintext document containing only the maintext of the give wikipedia page. -cleanHtml <- function(html) { +clean_html <- function(html) { # 1. read data from url or html-formatted text # 2 .extract nodes containing main information (ignore infoboxes, list of literature, ...) diff --git a/processing/wikiproc/R/createAnnotations.R b/processing/wikiproc/R/create_annotations.R similarity index 91% rename from processing/wikiproc/R/createAnnotations.R rename to processing/wikiproc/R/create_annotations.R index 6deb6a538e7d39472bc5d25a00f93a227ebeac12..bbe1642aadbed0be45eb7fa391bef6c719c4bad3 100644 --- a/processing/wikiproc/R/createAnnotations.R +++ b/processing/wikiproc/R/create_annotations.R @@ -1,7 +1,7 @@ library(cleanNLP) #' @export -createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) { +create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) { # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces # To prevent this we stip 'em again diff --git a/processing/wikiproc/R/GetBirthdate.R b/processing/wikiproc/R/get_birthdate.R similarity index 98% rename from processing/wikiproc/R/GetBirthdate.R rename to processing/wikiproc/R/get_birthdate.R index dd980b57e69162967f842d62abcea3d508598fae..3288e2da7904d6d7cf6a8c52eff3444a5f735e18 100644 --- a/processing/wikiproc/R/GetBirthdate.R +++ b/processing/wikiproc/R/get_birthdate.R @@ -16,7 +16,7 @@ library(xml2) #' @export #' @param article Article in HTML-format #' @return String birthdate as string|NULL -getBirthdate <- function(article) { +get_birthdate <- function(article) { if(grepl("vcard", article)) { diff --git a/processing/wikiproc/R/GetBirthplace.R b/processing/wikiproc/R/get_birthplace.R similarity index 97% rename from processing/wikiproc/R/GetBirthplace.R rename to processing/wikiproc/R/get_birthplace.R index 652217f87e0a9e0348f2fd15f6a3a1d92b6244e5..61839e07faef37f592b51e62f8605071d975dee3 100644 --- a/processing/wikiproc/R/GetBirthplace.R +++ b/processing/wikiproc/R/get_birthplace.R @@ -16,7 +16,7 @@ library(data.table) #' @export #' @param article Article in HTML-format #' @return String with birthplace of the physicist|0 -getBirthplace <- function(article) { +get_birthplace <- function(article) { # If there is no infobox we return 0 if(!grepl("vcard", article)) { diff --git a/processing/wikiproc/man/cleanHtml.Rd b/processing/wikiproc/man/clean_html.Rd similarity index 84% rename from processing/wikiproc/man/cleanHtml.Rd rename to processing/wikiproc/man/clean_html.Rd index 7247852e83e59f5f6c7ba54df2f90692b7f98a9c..3f49f4837d4e3ae7caab15fef5fc5180906d2f5e 100644 --- a/processing/wikiproc/man/cleanHtml.Rd +++ b/processing/wikiproc/man/clean_html.Rd @@ -1,12 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CleanHtml.R -\name{cleanHtml} -\alias{cleanHtml} +% Please edit documentation in R/clean_html.R +\name{clean_html} +\alias{clean_html} \title{Clean a html formatted wikipedia page. Nodes of interest from the DOM are extracted and then cleaned from all html tags and annotations.} \usage{ -cleanHtml(html) +clean_html(html) } \arguments{ \item{html}{Url linking to a wikipedia webpage or a html formatted document.} diff --git a/processing/wikiproc/man/getIntroduction.Rd b/processing/wikiproc/man/getIntroduction.Rd index 3dfe196b3bab7bf139554fa2d49042c0e3fec93b..5778a54c4f1ea2fab27a1344565267b8469cc967 100644 --- a/processing/wikiproc/man/getIntroduction.Rd +++ b/processing/wikiproc/man/getIntroduction.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/GetBirthdate.R +% Please edit documentation in R/get_birthdate.R \name{getIntroduction} \alias{getIntroduction} \title{Get Introduction Text from Wikipedia page that contains birthdate} diff --git a/processing/wikiproc/man/getBirthdate.Rd b/processing/wikiproc/man/get_birthdate.Rd similarity index 82% rename from processing/wikiproc/man/getBirthdate.Rd rename to processing/wikiproc/man/get_birthdate.Rd index a614ade2049e244a96e56eda90730b3e1a352abf..1e77780e95a0ad417d6fb3a2171ae9cad270075e 100644 --- a/processing/wikiproc/man/getBirthdate.Rd +++ b/processing/wikiproc/man/get_birthdate.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/GetBirthdate.R -\name{getBirthdate} -\alias{getBirthdate} +% Please edit documentation in R/get_birthdate.R +\name{get_birthdate} +\alias{get_birthdate} \title{Extract birthdate from infobox Will try to get infobox as table and extract birthdate from 'Born'-entry If there is no infobox, first paragraph of the article will be checked for birthdate} \usage{ -getBirthdate(article) +get_birthdate(article) } \arguments{ \item{article}{Article in HTML-format} diff --git a/processing/wikiproc/man/getBirthplace.Rd b/processing/wikiproc/man/get_birthplace.Rd similarity index 82% rename from processing/wikiproc/man/getBirthplace.Rd rename to processing/wikiproc/man/get_birthplace.Rd index b1b083083922001d92b76f97cfb38ec7ca0049bf..0db95fed89b30e3377511901914a0766b139ba81 100644 --- a/processing/wikiproc/man/getBirthplace.Rd +++ b/processing/wikiproc/man/get_birthplace.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/GetBirthplace.R -\name{getBirthplace} -\alias{getBirthplace} +% Please edit documentation in R/get_birthplace.R +\name{get_birthplace} +\alias{get_birthplace} \title{This script extracts Birthplace from physicist texts Try to get the infobox and extract the birthplace If there is no infobox, 0 will be returned as birthplace is hard to extract from text} \usage{ -getBirthplace(article) +get_birthplace(article) } \arguments{ \item{article}{Article in HTML-format}