Commit 80c1245f authored by Lucas Schons


Merge branch '32-add-unit-tests-for-cleanhtml-r' of git.informatik.uni-leipzig.de:text-mining-chatbot/wiki-rasa into 32-add-unit-tests-for-cleanhtml-r
parents 002a88f1 1c9037f0
3 merge requests: !34 Resolve "Add unit tests for clean_html.R", !28 WIP: Resolve "Create pattern matching function", !27 Resolve "Add unit tests for cleanHtml.R"
Showing with 300 additions and 45 deletions
#!/usr/bin/env Rscript
### Provides functionality to use NER, POS and Dependency Grammars
## Author: David
cat("Initializing spacy backend...\n")
# It's important to do this prior to loading any python related stuff
reticulate::use_condaenv("spcy", required = TRUE)
# Load libraries
library(cleanNLP)
# Init nlp models
cnlp_init_spacy(entity_flag = TRUE)
cat("Done.\n")
@@ -3,10 +3,13 @@
 ### This script consolidates everything
 library(pbapply)
+library(rvest)
 library(wikiproc)
 library(rprojroot)

+## Set up nlp
+init_nlp("conda", "spcy")

 ## Fetch data
 cat("Starting data import...\n")
@@ -15,7 +18,7 @@ cat("Starting data import...\n")
 project_root <- find_root(has_file("README.md"))
 data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
-articles <- wikiproc::getData(use.cache = TRUE, data.dir = data_dir)
+articles <- get_data(use.cache = TRUE, data.dir = data_dir)

 ## Data processing
@@ -31,11 +34,11 @@ results <- pbapply(articles, 1, function(article) {
 ## Data preprocessing/annotating
-# annotation <- createAnnotations(cleaned.text, article[2], article[3])
+annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)

 ## Extract information from Text
-no.spouses <- wikiproc::getNoOfSpouses(article[4])
+no.spouses <- get_no_of_spouses(article[4])

 ## Create Results
...
 # Generated by roxygen2: do not edit by hand
-export(cleanHtml)
-export(createAnnotations)
-export(getBirthdate)
-export(getBirthplace)
-export(getData)
-export(getNoOfSpouses)
+export(create_annotations)
+export(get_data)
+export(get_no_of_spouses)
+export(init_nlp)
+import(rvest)
+importFrom(data.table,"%like%")
+importFrom(xml2,read_html)
+importFrom(xml2,xml_add_sibling)
+importFrom(xml2,xml_find_all)
+importFrom(xml2,xml_remove)
@@ -2,11 +2,6 @@
 # Author: Lucas

-library(rvest)
-library(stringi)
-library(textclean)
-library(xml2)

 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
...
@@ -2,10 +2,6 @@
 # Author: David

-library(WikipediR) # For querying wikipedia
-library(rvest) # For getting the list of physicists
-library(xml2)

 ## Though we could get the pages within the category 'physicists' with something like this
 ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
 ## this gives us only about 50 pages.
@@ -19,7 +15,8 @@ library(xml2)
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+#' @export
+get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
 dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
...
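
For orientation, a hypothetical call mirroring the processing script; the cache flags are illustrative, and the return columns are the ones documented above:

articles <- get_data(use.cache = TRUE, write.cache = TRUE, data.dir = "data")
str(articles)  # title, id, revision id and html-formatted full text per article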
### get_no_of_spouses.R
### This extracts the number of spouses from the infobox.
### If no infobox or no information about spouses is found, it assumes there are none.
### Not for use in production, this does not actually get information from text.
# Author: David
#' Reads the number of spouses from the infobox of a Wikipedia article
#'
#' @param article Wikipedia article in html format
#'
#' @return Integer indicating the number of spouses
#' @export
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' no.spouses <- get_no_of_spouses(articles$Text[54])
#'
#' no.spouses
#' }
get_no_of_spouses <- function(article) {
  # If there is no infobox we assume there were no spouses
  if (!grepl("vcard", article)) {
    return(0)
  }
  infoBox <- get_infobox(article)
  # Get the spouse field
  spouses <- infoBox[infoBox$Desc %like% "Spouse", ]$Content
  # Remove everything in parentheses
  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
  # Split the strings by newlines to get one spouse per line
  spouses <- strsplit(spouses, "\n")
  spouses <- unlist(spouses)
  if (length(spouses) > 0) {
    return(length(spouses))
  }
  return(0)
}
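
To make the counting concrete, here is a small illustration of the parenthesis-stripping and splitting steps on a made-up infobox value (the names are hypothetical, not taken from the data):

spouses <- "Jane Doe\n(m. 1903; div. 1919)\nErika Mustermann\n(m. 1920)"
spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)  # drops the "(m. ...)" annotations
unlist(strsplit(spouses, "\n"))                    # c("Jane Doe", "Erika Mustermann"), so 2 spouses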
### File used to automatically create package imports with roxygen2
### Note that it is discouraged to import many packages fully, to avoid name conflicts
### If possible, reference functions directly, e.g. reshape2::melt()
### There is a (very) minor performance penalty for ::;
### if some functions are used frequently you may just import them
### with something like @importFrom reshape2 melt cast
#' @import rvest
#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
#' @importFrom data.table %like%
NULL
\ No newline at end of file
#' Initialize the nlp backend
#'
#' A wrapper used to set the python environment and call cnlp_init
#'
#' @param type Type of python env to use, either "conda" or "python"
#' @param value Connection string: if using a conda environment, the name of it;
#'   if using python directly, the path to the python executable
#'
#' @return Does not return data
#' @export
#'
#' @examples
#' \dontrun{
#' init_nlp("conda", "spcy")
#' }
init_nlp <- function(type, value) {
  if (type == "conda") {
    reticulate::use_condaenv(value, required = TRUE)
  } else if (type == "python") {
    reticulate::use_python(value, required = TRUE)
  }
  cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
}
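
As the processing script notes, this must run before anything else initializes python. A hypothetical session start, using the "spcy" environment name from this repository:

library(wikiproc)
init_nlp("conda", "spcy")  # bind reticulate to the conda env, then init the spacy backend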
#' Create annotations for the given text
#'
#' @param text Text to annotate
#' @param article.id ArticleID used for caching
#' @param article.rev.id ArticleRevisionID used for caching
#' @param use.cache Should cached data be used
#' @param write.cache Should the generated annotations be cached
#' @param data.dir Directory the data should be read from and/or written to
#'
#' @return Annotation object for use with cleanNLP methods
#' @export
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
  # Generate filename; for some reason paste0 will pad the article id with leading whitespace,
  # so we strip it again
  filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
  # Check if there is a cached version of the annotations for this article in this specific revision
  if (use.cache & file.exists(filename)) {
    res <- tryCatch({
      readRDS(filename)
    }, error = function(e) {
      # Return NULL so we fall through and redo the annotation
      cat("Cached data seems to be corrupted, redoing annotation.\n")
      NULL
    })
    if (!is.null(res)) {
      return(res)
    }
  }
  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
  # Write cache if desired, creating the cache directories under data.dir if necessary
  if (write.cache) {
    annotations.dir <- paste(data.dir, "annotations", sep = .Platform$file.sep)
    if (!dir.exists(data.dir)) {
      dir.create(data.dir)
    }
    if (!dir.exists(annotations.dir)) {
      dir.create(annotations.dir)
    }
    saveRDS(annotation, filename)
  }
  # Return data
  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
  # But then again, it suggests snake case for variables...
  return(annotation)
}
\ No newline at end of file
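
A sketch of a typical call, assuming init_nlp() has already been run; the ids are made up:

ann <- create_annotations("Some article text.", article.id = 42,
                          article.rev.id = 123456, write.cache = TRUE)
# A second call with the same ids would be served from data/annotations/42-123456.RDS
tokens <- cleanNLP::cnlp_get_token(ann)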
### Utility functions used internally
#' Extract the infobox contents from wikipedia articles
#'
#' @param article Character vector containing the contents of a Wikipedia
#'   article as html
#'
#' @return Data frame holding the contents of the table
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4,]
#' }
get_infobox <- function(article) {
  # Read page as html
  page <- read_html(article)
  # Extracting text from the html will erase all <br> tags,
  # this will replace them with line breaks
  xml_find_all(page, ".//br") %>%
    xml_add_sibling("p", "\n")
  xml_find_all(page, ".//br") %>%
    xml_remove()
  # Get the info box
  # Will throw an error if there isn't any, so that should be checked beforehand
  table <- page %>%
    html_nodes("table.vcard") %>%
    html_table(fill = TRUE) %>%
    .[[1]]
  colnames(table) <- c("Desc", "Content")
  return(table)
}
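
To see why the <br> handling matters, a minimal hypothetical vcard table; without the inserted line breaks, html_table() would run multi-value cells together:

library(xml2)
library(rvest)  # for html_nodes/html_table and %>%

html <- "<table class='vcard'><tr><td>Spouse</td><td>Jane Doe<br>Erika Mustermann</td></tr></table>"
page <- read_html(html)
xml_find_all(page, ".//br") %>% xml_add_sibling("p", "\n")
xml_find_all(page, ".//br") %>% xml_remove()
html_table(html_nodes(page, "table.vcard"), fill = TRUE)[[1]]
# The spouse cell now holds "Jane Doe\nErika Mustermann" instead of "Jane DoeErika Mustermann"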
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nlp_annotate.R
\name{create_annotations}
\alias{create_annotations}
\title{Create annotations for the given text}
\usage{
create_annotations(text, article.id, article.rev.id, use.cache = TRUE,
write.cache = FALSE, data.dir = "data")
}
\arguments{
\item{text}{Text to annotate}
\item{article.id}{ArticleID used for caching}
\item{article.rev.id}{ArticleRevisionID used for caching}
\item{use.cache}{Should cached data be used}
\item{write.cache}{Should the generated annotations be cached}
\item{data.dir}{Directory the data should be read from and/or written to}
}
\value{
Annotation object for use with cleanNLP methods
}
\description{
Create annotations for the given text
}
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/GetData.R
+% Please edit documentation in R/get_data.R
-\name{getData}
+\name{get_data}
-\alias{getData}
+\alias{get_data}
 \title{Retrieve wikipedia articles about physicists}
 \usage{
-getData(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
+get_data(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
 }
 \arguments{
 \item{use.cache}{Use cached data if it exists over downloading new data}
...
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{get_infobox}
\alias{get_infobox}
\title{Extract the infobox contents from wikipedia articles}
\usage{
get_infobox(article)
}
\arguments{
\item{article}{Character vector containing the contents of a Wikipedia
article as html}
}
\value{
Data frame holding the contents of the table
}
\description{
Extract the infobox contents from wikipedia articles
}
\examples{
\dontrun{
articles <- get_data()
infobox <- get_infobox(articles$Text[54])
infobox[3:4,]
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_no_of_spouses.R
\name{get_no_of_spouses}
\alias{get_no_of_spouses}
\title{Reads the number of spouses from the infobox of a Wikipedia article}
\usage{
get_no_of_spouses(article)
}
\arguments{
\item{article}{Wikipedia article in html format}
}
\value{
Integer indicating the number of spouses
}
\description{
Reads the number of spouses from the infobox of a Wikipedia article
}
\examples{
\dontrun{
articles <- get_data()
no.spouses <- get_no_of_spouses(articles$Text[54])
no.spouses
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nlp_annotate.R
\name{init_nlp}
\alias{init_nlp}
\title{Initialize the nlp backend}
\usage{
init_nlp(type, value)
}
\arguments{
\item{type}{Type of python env to use, either "conda" or "python"}
\item{value}{Connection string: if using a conda environment, the name of it;
if using python directly, the path to the python executable}
}
\value{
Does not return data
}
\description{
A wrapper used to set the python environment and call cnlp_init
}
\examples{
\dontrun{
init_nlp("conda", "spcy")
}
}