From 189788339374e0a2aa9a68322824fcb6cb103209 Mon Sep 17 00:00:00 2001 From: Lulu Roth <ls80zyse@studserv.uni-leipzig.de> Date: Sat, 12 Jan 2019 17:03:45 +0100 Subject: [PATCH] Rename files snake style --- processing/wikiproc/NAMESPACE | 3 + processing/wikiproc/R/GetNoOfSpouses.R | 63 ------------------- .../wikiproc/R/{CleanHtml.R => clean_html.R} | 2 +- ...eateAnnotations.R => create_annotations.R} | 2 +- .../R/{GetBirthdate.R => get_birthdate.R} | 2 +- .../R/{GetBirthplace.R => get_birthplace.R} | 2 +- .../man/{cleanHtml.Rd => clean_html.Rd} | 8 +-- processing/wikiproc/man/getIntroduction.Rd | 2 +- .../man/{getBirthdate.Rd => get_birthdate.Rd} | 8 +-- .../{getBirthplace.Rd => get_birthplace.Rd} | 8 +-- 10 files changed, 20 insertions(+), 80 deletions(-) delete mode 100755 processing/wikiproc/R/GetNoOfSpouses.R rename processing/wikiproc/R/{CleanHtml.R => clean_html.R} (97%) rename processing/wikiproc/R/{createAnnotations.R => create_annotations.R} (91%) rename processing/wikiproc/R/{GetBirthdate.R => get_birthdate.R} (98%) rename processing/wikiproc/R/{GetBirthplace.R => get_birthplace.R} (97%) rename processing/wikiproc/man/{cleanHtml.Rd => clean_html.Rd} (84%) rename processing/wikiproc/man/{getBirthdate.Rd => get_birthdate.Rd} (82%) rename processing/wikiproc/man/{getBirthplace.Rd => get_birthplace.Rd} (82%) diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE index 5e2056d..23ace1c 100644 --- a/processing/wikiproc/NAMESPACE +++ b/processing/wikiproc/NAMESPACE @@ -1,6 +1,9 @@ # Generated by roxygen2: do not edit by hand +export(clean_html) export(create_annotations) +export(get_birthdate) +export(get_birthplace) export(get_data) export(get_no_of_spouses) export(init_nlp) diff --git a/processing/wikiproc/R/GetNoOfSpouses.R b/processing/wikiproc/R/GetNoOfSpouses.R deleted file mode 100755 index d5882ce..0000000 --- a/processing/wikiproc/R/GetNoOfSpouses.R +++ /dev/null @@ -1,63 +0,0 @@ -### GetNoOfSpouses.R -### This extracts the number of spouses from the infobox -### If no infobox or no information about spouses is found assumes there are none -### Not for use in production, this does not actually get information from text - -# Author: David - -## Librarys - -library(rvest) -library(data.table) -library(xml2) - -### Get number of spouses -#' @export -getNoOfSpouses <- function(article) { - - # If there is no infobox we assume there were no spouses - if(!grepl("vcard", article)) { - return(0) - } - - infoBox <- getInfoBox(article) - - # Get the spouse field - spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content - # Remove everything in parentheses - spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses) - # Split the strings by newlines to get one spouse per line - spouses <- base::strsplit(spouses, "\n") - spouses <- base::unlist(spouses) - if(length(spouses) > 0) { - return(length(spouses)) - } - return(0) -} - -### Converts info box to table -getInfoBox <- function(article) { - # Read page as html - page <- xml2::read_html(article) - - # Extracting text from the html will erase all <br> tags, - # this will replace them with line breaks - - xml2::xml_find_all(page, ".//br") %>% - xml2::xml_add_sibling("p", "\n") - - xml2::xml_find_all(page, ".//br") %>% - xml2::xml_remove() - - # Get the info box - # Will throw an error if there isnt any, so that should be checked beforehand - - table <- page %>% - rvest::html_nodes("table.vcard") %>% - rvest::html_table(fill = TRUE) %>% - .[[1]] - - colnames(table) <- c("Desc", "Content") - - return(table) -} diff --git a/processing/wikiproc/R/CleanHtml.R b/processing/wikiproc/R/clean_html.R similarity index 97% rename from processing/wikiproc/R/CleanHtml.R rename to processing/wikiproc/R/clean_html.R index e541a1d..d0a57c2 100644 --- a/processing/wikiproc/R/CleanHtml.R +++ b/processing/wikiproc/R/clean_html.R @@ -9,7 +9,7 @@ #' @export #' @param html Url linking to a wikipedia webpage or a html formatted document. #' @return Plaintext document containing only the maintext of the give wikipedia page. -cleanHtml <- function(html) { +clean_html <- function(html) { # 1. read data from url or html-formatted text # 2 .extract nodes containing main information (ignore infoboxes, list of literature, ...) diff --git a/processing/wikiproc/R/createAnnotations.R b/processing/wikiproc/R/create_annotations.R similarity index 91% rename from processing/wikiproc/R/createAnnotations.R rename to processing/wikiproc/R/create_annotations.R index 6deb6a5..bbe1642 100644 --- a/processing/wikiproc/R/createAnnotations.R +++ b/processing/wikiproc/R/create_annotations.R @@ -1,7 +1,7 @@ library(cleanNLP) #' @export -createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) { +create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) { # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces # To prevent this we stip 'em again diff --git a/processing/wikiproc/R/GetBirthdate.R b/processing/wikiproc/R/get_birthdate.R similarity index 98% rename from processing/wikiproc/R/GetBirthdate.R rename to processing/wikiproc/R/get_birthdate.R index dd980b5..3288e2d 100644 --- a/processing/wikiproc/R/GetBirthdate.R +++ b/processing/wikiproc/R/get_birthdate.R @@ -16,7 +16,7 @@ library(xml2) #' @export #' @param article Article in HTML-format #' @return String birthdate as string|NULL -getBirthdate <- function(article) { +get_birthdate <- function(article) { if(grepl("vcard", article)) { diff --git a/processing/wikiproc/R/GetBirthplace.R b/processing/wikiproc/R/get_birthplace.R similarity index 97% rename from processing/wikiproc/R/GetBirthplace.R rename to processing/wikiproc/R/get_birthplace.R index 652217f..61839e0 100644 --- a/processing/wikiproc/R/GetBirthplace.R +++ b/processing/wikiproc/R/get_birthplace.R @@ -16,7 +16,7 @@ library(data.table) #' @export #' @param article Article in HTML-format #' @return String with birthplace of the physicist|0 -getBirthplace <- function(article) { +get_birthplace <- function(article) { # If there is no infobox we return 0 if(!grepl("vcard", article)) { diff --git a/processing/wikiproc/man/cleanHtml.Rd b/processing/wikiproc/man/clean_html.Rd similarity index 84% rename from processing/wikiproc/man/cleanHtml.Rd rename to processing/wikiproc/man/clean_html.Rd index 7247852..3f49f48 100644 --- a/processing/wikiproc/man/cleanHtml.Rd +++ b/processing/wikiproc/man/clean_html.Rd @@ -1,12 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CleanHtml.R -\name{cleanHtml} -\alias{cleanHtml} +% Please edit documentation in R/clean_html.R +\name{clean_html} +\alias{clean_html} \title{Clean a html formatted wikipedia page. Nodes of interest from the DOM are extracted and then cleaned from all html tags and annotations.} \usage{ -cleanHtml(html) +clean_html(html) } \arguments{ \item{html}{Url linking to a wikipedia webpage or a html formatted document.} diff --git a/processing/wikiproc/man/getIntroduction.Rd b/processing/wikiproc/man/getIntroduction.Rd index 3dfe196..5778a54 100644 --- a/processing/wikiproc/man/getIntroduction.Rd +++ b/processing/wikiproc/man/getIntroduction.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/GetBirthdate.R +% Please edit documentation in R/get_birthdate.R \name{getIntroduction} \alias{getIntroduction} \title{Get Introduction Text from Wikipedia page that contains birthdate} diff --git a/processing/wikiproc/man/getBirthdate.Rd b/processing/wikiproc/man/get_birthdate.Rd similarity index 82% rename from processing/wikiproc/man/getBirthdate.Rd rename to processing/wikiproc/man/get_birthdate.Rd index a614ade..1e77780 100644 --- a/processing/wikiproc/man/getBirthdate.Rd +++ b/processing/wikiproc/man/get_birthdate.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/GetBirthdate.R -\name{getBirthdate} -\alias{getBirthdate} +% Please edit documentation in R/get_birthdate.R +\name{get_birthdate} +\alias{get_birthdate} \title{Extract birthdate from infobox Will try to get infobox as table and extract birthdate from 'Born'-entry If there is no infobox, first paragraph of the article will be checked for birthdate} \usage{ -getBirthdate(article) +get_birthdate(article) } \arguments{ \item{article}{Article in HTML-format} diff --git a/processing/wikiproc/man/getBirthplace.Rd b/processing/wikiproc/man/get_birthplace.Rd similarity index 82% rename from processing/wikiproc/man/getBirthplace.Rd rename to processing/wikiproc/man/get_birthplace.Rd index b1b0830..0db95fe 100644 --- a/processing/wikiproc/man/getBirthplace.Rd +++ b/processing/wikiproc/man/get_birthplace.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/GetBirthplace.R -\name{getBirthplace} -\alias{getBirthplace} +% Please edit documentation in R/get_birthplace.R +\name{get_birthplace} +\alias{get_birthplace} \title{This script extracts Birthplace from physicist texts Try to get the infobox and extract the birthplace If there is no infobox, 0 will be returned as birthplace is hard to extract from text} \usage{ -getBirthplace(article) +get_birthplace(article) } \arguments{ \item{article}{Article in HTML-format} -- GitLab