Commit 96d9b438 authored by David Fuhry

Refactoring

* Added roxygen comments
* Moved nlp init into package
* Fixed various bugs
* Added import_packages.R and replaced all other imports with explicit ones
* Converted my methods to snake case
* Some minor stuff
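To illustrate the renamed entry points, here is a minimal usage sketch, mirroring the main.R hunk and the roxygen examples in this commit (the conda environment name "spcy" and the data directory layout are the ones used here; the article index is illustrative):

library(wikiproc)
library(rprojroot)

# Set up the spaCy backend through the package (replaces the old standalone init script)
init_nlp("conda", "spcy")

# Fetch articles (from cache if available) and read the spouse count for one of them
data_dir <- file.path(find_root(has_file("README.md")), "data")
articles <- get_data(use.cache = TRUE, data.dir = data_dir)
no.spouses <- get_no_of_spouses(articles$Text[54])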
parent 49e8eff0
3 merge requests: !34 Resolve "Add unit tests for clean_html.R", !28 WIP: Resolve "Create pattern matching function", !27 Resolve "Add unit tests for cleanHtml.R"
Showing changed files with 309 additions and 45 deletions
#!/usr/bin/env Rscript
### Provides functionality to use NER, POS and Dependency Grammars
## Author: David
cat("Initializing spacy backend...\n")
# It's important to do this prior to loading any Python-related packages
reticulate::use_condaenv("spcy", required = TRUE)
# Load libraries
library(cleanNLP)
# Init nlp models
cnlp_init_spacy(entity_flag = TRUE)
cat("Done.\n")
@@ -3,10 +3,13 @@
### This script consolidates everything
library(pbapply)
library(rvest)
library(wikiproc)
library(rprojroot)
## Set up nlp
init_nlp("conda", "spcy")
## Fetch data
cat("Starting data import...\n")
@@ -15,7 +18,7 @@ cat("Starting data import...\n")
project_root <- find_root(has_file("README.md"))
data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)
articles <- get_data(use.cache = TRUE, data.dir = data_dir)
## Data processing
@@ -31,11 +34,11 @@ results <- pbapply(articles, 1, function(article) {
## Data preprocessing/annotating
# annotation <- createAnnotations(cleaned.text, article[2], article[3])
annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)
## Extract information from Text
no.spouses <- wikiproc:::getNoOfSpouses(article[4])
no.spouses <- get_no_of_spouses(article[4])
## Create Results
......
# Generated by roxygen2: do not edit by hand
export(create_annotations)
export(get_data)
export(get_no_of_spouses)
export(init_nlp)
import(rvest)
importFrom(data.table,"%like%")
importFrom(xml2,read_html)
importFrom(xml2,xml_add_sibling)
importFrom(xml2,xml_find_all)
importFrom(xml2,xml_remove)
@@ -2,10 +2,6 @@
# Author: Lucas
library(rvest)
library(stringi)
library(textclean)
#' Clean an HTML-formatted Wikipedia page.
#' Nodes of interest from the DOM are extracted and then stripped of all HTML
#' tags and annotations.
@@ -24,8 +20,8 @@ cleanHtml <- function(html) {
# - replace multiple newlines with single newline
result <- read_html(html) %>%
html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
stri_flatten(collapse = " ") %>%
replace_html() %>%
stringi::stri_flatten(collapse = " ") %>%
textclean::replace_html() %>%
gsub("\\[\\d*\\]", "", .) %>%
gsub(" +", " ", .) %>%
gsub("\n ", "\n", .) %>%
......
@@ -2,10 +2,6 @@
# Author: David
library(WikipediR) # For querying wikipedia
library(rvest) # For getting the list of physicists
library(xml2)
## Though we could get the pages within the category 'physicists' with something like this
## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
## this gives us only about 50 pages.
@@ -18,7 +14,8 @@ library(xml2)
#' @param write.cache Write downloaded results into cache for use on future calls
#' @param data.dir Directory the data should be read from and/or written to
#' @return data.frame containing the title, page ID, revision ID and HTML-formatted full text
getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
#' @export
get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
@@ -81,9 +78,9 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
# Call the wikipedia api for each entry in our list
articles <- pblapply(physicists, function(x) {
articles <- pbapply::pblapply(physicists, function(x) {
res <- tryCatch({
article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
# Check if the article is a redirect page
if (grepl(".redirectText", article$parse$text$`*`)) {
# Get the real article name
@@ -101,7 +98,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
Encoding(tmp) <- "UTF-8"
pname <- tmp
article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
}
data.frame(Title = article$parse$title,
PageID = article$parse$pageid,
......
@@ -5,21 +5,29 @@
# Author: David
## Libraries
library(rvest)
library(data.table)
### Get number of spouses
getNoOfSpouses <- function(article) {
#' Reads the number of spouses from the infobox of a Wikipedia article
#'
#' @param article Wikipedia article in html format
#'
#' @return Integer indicating the number of spouses
#' @export
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' no.spouses <- get_no_of_spouses(articles$Text[54])
#'
#' no.spouses
#' }
get_no_of_spouses <- function(article) {
# If there is no infobox we assume there were no spouses
if(!grepl("vcard", article)) {
return(0)
}
infoBox <- getInfoBox(article)
infoBox <- get_infobox(article)
# Get the spouse field
spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
@@ -33,30 +41,3 @@ getNoOfSpouses <- function(article) {
}
return(0)
}
### Converts info box to table
getInfoBox <- function(article) {
# Read page as html
page <- read_html(article)
# Extracting text from the html will erase all <br> tags,
# this will replace them with line breaks
xml_find_all(page, ".//br") %>%
xml_add_sibling("p", "\n")
xml_find_all(page, ".//br") %>%
xml_remove()
# Get the info box
# Will throw an error if there isn't any, so that should be checked beforehand
table <- page %>%
html_nodes("table.vcard") %>%
html_table(fill = TRUE) %>%
.[[1]]
colnames(table) <- c("Desc", "Content")
return(table)
}
### File used to automatically create package imports with roxygen2
### Note that importing many packages in full is discouraged, to avoid name conflicts
### If possible, reference functions directly, e.g. reshape2::melt()
### There is a (very) minor performance penalty for ::;
### if some functions are used frequently you may just import them
### with something like @importFrom reshape2 melt cast
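### roxygen2 turns the @import/@importFrom tags below into NAMESPACE directives
### such as import(rvest) and importFrom(data.table,"%like%"); see the NAMESPACE
### hunk earlier in this commit.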
#' @import rvest
#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
#' @importFrom data.table %like%
NULL
\ No newline at end of file
library(cleanNLP)
#' Initialize the nlp backend
#'
#' A wrapper used to set the Python environment and call cnlp_init_spacy
#'
#' @param type Type of Python environment to use, either "conda" or "python"
#' @param value Connection string: the name of the conda environment,
#' or the path to the Python executable if using Python directly
#'
#' @return Does not return data
#' @export
#'
#' @examples
#' \dontrun{
#' init_nlp("conda", "spcy")
#' }
init_nlp <- function(type, value) {
if (type == "conda") {
reticulate::use_condaenv(value, required = TRUE)
} else if (type == "python") {
reticulate::use_python(value, required = TRUE)
}
cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
}
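# A hedged example of the "python" branch: pass the path to a Python interpreter
# instead of a conda environment name (the path below is illustrative only, not
# taken from this repository):
# init_nlp("python", "/usr/bin/python3")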
createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
#' Create annotations for the given text
#'
#' @param text Text to annotate
#' @param article.id Article ID used for caching
#' @param article.rev.id Article revision ID used for caching
#' @param use.cache Should cached data be used
#' @param write.cache Should the generated annotations be cached
#' @param data.dir Directory the data should be read from and/or written to
#'
#' @return Annotation object for use with cleanNLP methods
#' @export
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
# Generate the filename; for some reason paste0 will pad the article id with leading whitespace,
# so we strip it again
filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
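# e.g. "data/annotations/53043-886325168.RDS" (IDs here are purely illustrative)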
# Check if there is a cached version of the annotations for this article in this specific revision
@@ -19,7 +52,7 @@ createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE
return(res)
}
annotation <- cnlp_annotate(text, as_strings = TRUE)
annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
# Write cache if desired
......
### Utility functions used internally
#' Extract the infobox contents from Wikipedia articles
#'
#' @param article Character vector containing the contents of a Wikipedia
#' article as HTML
#'
#' @return Data frame holding the contents of the table
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4,]
#' }
get_infobox <- function(article) {
# Read page as html
page <- read_html(article)
# Extracting text from the html will erase all <br> tags,
# this will replace them with line breaks
xml_find_all(page, ".//br") %>%
xml_add_sibling("p", "\n")
xml_find_all(page, ".//br") %>%
xml_remove()
# Get the info box
# Will throw an error if there isn't any, so that should be checked beforehand
table <- page %>%
html_nodes("table.vcard") %>%
html_table(fill = TRUE) %>%
.[[1]]
colnames(table) <- c("Desc", "Content")
return(table)
}
@@ -2,7 +2,7 @@
% Please edit documentation in R/CleanHtml.R
\name{cleanHtml}
\alias{cleanHtml}
\title{Clean an HTML-formatted Wikipedia page.
Nodes of interest from the DOM are extracted and then stripped of all HTML
tags and annotations.}
\usage{
@@ -15,7 +15,7 @@ cleanHtml(html)
Plain text document containing only the main text of the given Wikipedia page.
}
\description{
Clean an HTML-formatted Wikipedia page.
Nodes of interest from the DOM are extracted and then stripped of all HTML
tags and annotations.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nlp_annotate.R
\name{create_annotations}
\alias{create_annotations}
\title{Create annotations for the given text}
\usage{
create_annotations(text, article.id, article.rev.id, use.cache = TRUE,
write.cache = FALSE, data.dir = "data")
}
\arguments{
\item{text}{Text to annotate}
\item{article.id}{Article ID used for caching}
\item{article.rev.id}{Article revision ID used for caching}
\item{use.cache}{Should cached data be used}
\item{write.cache}{Should the generated annotations be cached}
\item{data.dir}{Directory the data should be read from and/or written to}
}
\value{
Annotation object for use with cleanNLP methods
}
\description{
Create annotations for the given text
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/GetData.R
\name{getData}
\alias{getData}
% Please edit documentation in R/get_data.R
\name{get_data}
\alias{get_data}
\title{Retrieve wikipedia articles about physicists}
\usage{
getData(use.cache = TRUE, write.cache = FALSE)
get_data(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
}
\arguments{
\item{use.cache}{Use cached data if it exists over downloading new data}
\item{write.cache}{Write downloaded results into cache for use on future calls}
\item{data.dir}{Directory the data should be read from and/or written to}
}
\value{
data.frame containing the title, page ID, revision ID and HTML-formatted full text
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{get_infobox}
\alias{get_infobox}
\title{Extract the infobox contents from Wikipedia articles}
\usage{
get_infobox(article)
}
\arguments{
\item{article}{Character vector containing the contents of a Wikipedia
article as HTML}
}
\value{
Data frame holding the contents of the table
}
\description{
Extract the infobox contents from Wikipedia articles
}
\examples{
\dontrun{
articles <- get_data()
infobox <- get_infobox(articles$Text[54])
infobox[3:4,]
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_no_of_spouses.R
\name{get_no_of_spouses}
\alias{get_no_of_spouses}
\title{Reads the number of spouses from the infobox of a Wikipedia article}
\usage{
get_no_of_spouses(article)
}
\arguments{
\item{article}{Wikipedia article in html format}
}
\value{
Integer indicating the number of spouses
}
\description{
Reads the number of spouses from the infobox of a Wikipedia article
}
\examples{
\dontrun{
articles <- get_data()
no.spouses <- get_no_of_spouses(articles$Text[54])
no.spouses
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nlp_annotate.R
\name{init_nlp}
\alias{init_nlp}
\title{Initialize the nlp backend}
\usage{
init_nlp(type, value)
}
\arguments{
\item{type}{Type of Python environment to use, either "conda" or "python"}
\item{value}{Connection string: the name of the conda environment,
or the path to the Python executable if using Python directly}
}
\value{
Does not return data
}
\description{
A wrapper used to set the Python environment and call cnlp_init_spacy
}
\examples{
\dontrun{
init_nlp("conda", "spcy")
}
}