Skip to content
Snippets Groups Projects
Commit 815047b4 authored by Lucas Schons's avatar Lucas Schons
Browse files

Refactor text processing

* Create R package containing function definitions
* Create directory processing containing scripts
* fix some typos
* general cleanup
parent 10fcd152
No related branches found
No related tags found
3 merge requests!34Resolve "Add unit tests for clean_html.R",!28WIP: Resolve "Create pattern matching function",!27Resolve "Add unit tests for cleanHtml.R"
Showing
with 169 additions and 52 deletions
#!/usr/bin/env Rscript #!/usr/bin/env Rscript
### This script consolidates everything ### This script consolidates everything
## Librarys
library(pbapply) library(pbapply)
library(rvest)
#library(SomeLibrary) library(wikiproc)
## Load Scripts
cat("Sourcing R scripts... ")
source("r/GetData.R")
source("r/GetNoOfSpouses.R")
source("r/CleanHtml.R")
source("r/ProcessNER.R")
#source("r/getSomethingElse.R")
cat("Done.\n")
## Fetch data ## Fetch data
cat("Starting data import...\n") cat("Starting data import...\n")
articles <- getData(use.cache = TRUE) articles <- getData(use.cache = FALSE, write.cache = TRUE, data.dir = "../../data/")
## Data processing ## Data processing
...@@ -41,16 +26,12 @@ results <- pbapply(articles, 1, function(article) { ...@@ -41,16 +26,12 @@ results <- pbapply(articles, 1, function(article) {
## Data preprocessing/annotating ## Data preprocessing/annotating
annotation <- createAnnotations(cleaned.text, article[2], article[3]) # annotation <- createAnnotations(cleaned.text, article[2], article[3])
## Extract information from Text ## Extract information from Text
no.spouses <- getNoOfSpouses(article[4]) no.spouses <- getNoOfSpouses(article[4])
# someFact <- getFactFromTextFunctioN(annotated.text)
# someOtherFact <- getOtherFactFromText(data$Text)
## Create Results ## Create Results
data.frame(Name = article[1], data.frame(Name = article[1],
......
#!/usr/bin/env Rscript
### Provides functionality to use NER, POS and Dependency Grammars
## Author: David
# One-time initialization of the spaCy backend used by cleanNLP.
cat("Initializing spacy backend...\n")
# It's important to do this prior to loading any python related stuff
# NOTE(review): "spcy" is presumably the name of a pre-created conda env — confirm it exists.
reticulate::use_condaenv("spcy", required = TRUE)
# Load libraries
library(cleanNLP)
# Init nlp models
# entity_flag = TRUE presumably enables entity annotations in the spaCy pipeline — see cleanNLP docs.
cnlp_init_spacy(entity_flag = TRUE)
cat("Done.\n")
File moved
^wikiproc\.Rproj$
^\.Rproj\.user$
Package: wikiproc
Title: Process Wikipedia Articles About Physicists
Version: 0.0.0.9000
Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre"))
Description: Provides functions to download Wikipedia articles about physicists,
    clean their HTML content, and extract facts such as birthdate, birthplace
    and number of spouses.
Depends: R (>= 3.5.2)
License: What license is it under?
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.1
Suggests:
testthat
# Generated by roxygen2: do not edit by hand
File moved
File moved
File moved
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
library(WikipediR) # For querying wikipedia library(WikipediR) # For querying wikipedia
library(rvest) # For getting the list of physicits library(rvest) # For getting the list of physicits
library(xml2)
## Though we could get the pages within the category 'physicists' with something like this ## Though we could get the pages within the category 'physicists' with something like this
## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
...@@ -15,12 +16,16 @@ library(rvest) # For getting the list of physicits ...@@ -15,12 +16,16 @@ library(rvest) # For getting the list of physicits
#' #'
#' @param use.cache Use cached data if it exists over downloading new data #' @param use.cache Use cached data if it exists over downloading new data
#' @param write.cache Write downloaded results into cache for use on future calls #' @param write.cache Write downloaded results into cache for use on future calls
#' @param data.dir Directory the data should be read from and/or written to
#' @return data.frame containing the title, id, revisionID and html-formatted full text #' @return data.frame containing the title, id, revisionID and html-formatted full text
getData <- function(use.cache = TRUE, write.cache = FALSE) { getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data/") {
# Build the cache file paths from data.dir.
# Bug fix: paste() defaults to sep = " ", so the original
# paste(data.dir, "articles.RDS", "") produced "data/ articles.RDS " —
# a path with embedded/trailing spaces that never matches the real cache
# file. paste0() concatenates without separators.
# data.dir is expected to end with a trailing slash (default "data/").
dest.articlesRDS <- paste0(data.dir, "articles.RDS")
dest.articlesCSV <- paste0(data.dir, "articles.csv")
### First we check if the data already exists and try to load it if it does ### First we check if the data already exists and try to load it if it does
if(file.exists("data/articles.RDS") & use.cache ) { if(file.exists(dest.articlesRDS) && use.cache ) {
res <- tryCatch({ res <- tryCatch({
data <- readRDS("data/articles.RDS") data <- readRDS(dest.articlesRDS)
cat("Found chached data to use, import finished.\n") cat("Found chached data to use, import finished.\n")
data data
}, error = function (e) { }, error = function (e) {
...@@ -118,12 +123,12 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) { ...@@ -118,12 +123,12 @@ getData <- function(use.cache = TRUE, write.cache = FALSE) {
# Write result if desired # Write result if desired
if(write.cache) { if(write.cache) {
if (!dir.exists("data")) { if (!dir.exists(data.dir)) {
dir.create("data") dir.create(data.dir)
} }
cat("Writing data to files... ") cat("Writing data to files... ")
write.table(articles, "data/articles.csv") write.table(articles, dest.articlesCSV)
saveRDS(articles, "data/articles.RDS") saveRDS(articles, dest.articlesRDS)
cat("Done.\n") cat("Done.\n")
} }
......
File moved
#!/usr/bin/env Rscript
### Provides functionality to use NER, POS and Dependency Grammars
## Author: David
# One-time initialization of the spaCy backend used by cleanNLP.
cat("Initializing spacy backend...\n")
# It's important to do this prior to loading any python related stuff
# NOTE(review): "spcy" is presumably the name of a pre-created conda env — confirm it exists.
reticulate::use_condaenv("spcy", required = TRUE)
# Load libraries
# (Fixed: the line was duplicated onto itself — "library(cleanNLP) library(cleanNLP)"
# is a parse error in R.)
library(cleanNLP)
# Init nlp models
cnlp_init_spacy(entity_flag = TRUE)
cat("Done.\n")
createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) { createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
# Generate filename, for some reason there paste0 will pad the article id with leading whitespaces # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces
# To prevent this we stip 'em again # To prevent this we stip 'em again
filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE) filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
# Check if there is a cached version of the annotations for this article in this specific revision # Check if there is a cached version of the annotations for this article in this specific revision
if(use.cache & file.exists(filename)) { if(use.cache & file.exists(filename)) {
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/CleanHtml.R
\name{cleanHtml}
\alias{cleanHtml}
\title{Clean a html formatted wikipedia page.
Nodes of interest from the DOM are extracted and then cleaned from all html
tags and annotations.}
\usage{
cleanHtml(html)
}
\arguments{
\item{html}{Url linking to a wikipedia webpage or a html formatted document.}
}
\value{
Plaintext document containing only the main text of the given Wikipedia page.
}
\description{
Clean a html formatted wikipedia page.
Nodes of interest from the DOM are extracted and then cleaned from all html
tags and annotations.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/GetBirthdate.R
\name{getBirthdate}
\alias{getBirthdate}
\title{Extract birthdate from infobox
Will try to get infobox as table and extract birthdate
from 'Born'-entry
If there is no infobox, first paragraph of the article
will be checked for birthdate}
\usage{
getBirthdate(article)
}
\arguments{
\item{article}{Article in HTML-format}
}
\value{
String birthdate as string|NULL
}
\description{
Extract birthdate from infobox
Will try to get infobox as table and extract birthdate
from 'Born'-entry
If there is no infobox, first paragraph of the article
will be checked for birthdate
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/GetBirthplace.R
\name{getBirthplace}
\alias{getBirthplace}
\title{This script extracts Birthplace from physicist texts
Try to get the infobox and extract the birthplace
If there is no infobox, 0 will be returned as
birthplace is hard to extract from text}
\usage{
getBirthplace(article)
}
\arguments{
\item{article}{Article in HTML-format}
}
\value{
String with birthplace of the physicist|0
}
\description{
This script extracts Birthplace from physicist texts
Try to get the infobox and extract the birthplace
If there is no infobox, 0 will be returned as
birthplace is hard to extract from text
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/GetData.R
\name{getData}
\alias{getData}
\title{Retrieve wikipedia articles about physicists}
\usage{
getData(use.cache = TRUE, write.cache = FALSE, data.dir = "data/")
}
\arguments{
\item{use.cache}{Use cached data if it exists over downloading new data}

\item{write.cache}{Write downloaded results into cache for use on future calls}

\item{data.dir}{Directory the data should be read from and/or written to}
}
\value{
data.frame containing the title, id, revisionID and html-formatted full text
}
\description{
Retrieve wikipedia articles about physicists
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/GetBirthdate.R
\name{getIntroduction}
\alias{getIntroduction}
\title{Get Introduction Text from Wikipedia page that contains birthdate}
\usage{
getIntroduction(article)
}
\arguments{
\item{article}{article in HTML-format}
}
\value{
string introduction text from wikipedia article
}
\description{
Get Introduction Text from Wikipedia page that contains birthdate
}
# Standard testthat entry point: runs every test file under tests/testthat/
# for the wikiproc package (invoked by R CMD check or devtools::test()).
library(testthat)
library(wikiproc)
test_check("wikiproc")
# Unit tests for cleanHtml() (R/CleanHtml.R).
context("test-cleanhtml")
# NOTE(review): this is the default scaffold test ("multiplication works");
# it exercises nothing in the package. Replace with real assertions against
# cleanHtml() output.
test_that("multiplication works", {
expect_equal(2 * 2, 4)
})
# Test cleanHtml function.
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment