diff --git a/processing/bin/ProcessNER.R b/processing/bin/ProcessNER.R
deleted file mode 100644
index 775ffb17ffb198b7707bfd51b8e23880cbf2cddd..0000000000000000000000000000000000000000
--- a/processing/bin/ProcessNER.R
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env Rscript
-
-### Provides functionality to use NER, POS and Dependency Grammars
-
-## Author: David
-
-cat("Initializing spacy backend...\n")
-
-# It's important to do this prior to loading any python related stuff
-
-reticulate::use_condaenv("spcy", required = TRUE)
-
-# Load librarys
-
-library(cleanNLP)
-
-# Init nlp models
-
-cnlp_init_spacy(entity_flag = TRUE)
-
-cat("Done.\n")
diff --git a/processing/bin/Master.R b/processing/script/master.R
similarity index 81%
rename from processing/bin/Master.R
rename to processing/script/master.R
index 0f476101b3c5fffeff755cd0623afd3947df3141..6e8121bef24989d341c1abf34c0017dd079a992c 100755
--- a/processing/bin/Master.R
+++ b/processing/script/master.R
@@ -3,10 +3,13 @@
 ### This script consolidates everything
 
 library(pbapply)
-library(rvest)
 library(wikiproc)
 library(rprojroot)
 
+## Set up nlp
+
+init_nlp("conda", "spcy")
+
 ## Fetch data
 
 cat("Starting data import...\n")
@@ -15,7 +18,7 @@ cat("Starting data import...\n")
 project_root <- find_root(has_file("README.md"))
 data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
 
-articles <- wikiproc::getData(use.cache = TRUE, data.dir = data_dir)
+articles <- get_data(use.cache = TRUE, data.dir = data_dir)
 
 ## Data processing
 
@@ -31,11 +34,11 @@ results <- pbapply(articles, 1, function(article) {
 
   ## Data preprocessing/annotating
 
-  # annotation <- createAnnotations(cleaned.text, article[2], article[3])
+  annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)
 
   ## Extract information from Text
 
-  no.spouses <- wikiproc::getNoOfSpouses(article[4])
+  no.spouses <- get_no_of_spouses(article[4])
 
   ## Create Results
 
diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
index f5ebafbf596ac4f41f745b7c83baff9b7edd80d3..5e2056db2f27b4dfd24924442b85fe72621177c8 100644
--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -1,8 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
-export(cleanHtml)
-export(createAnnotations)
-export(getBirthdate)
-export(getBirthplace)
-export(getData)
-export(getNoOfSpouses)
+export(create_annotations)
+export(get_data)
+export(get_no_of_spouses)
+export(init_nlp)
+import(rvest)
+importFrom(data.table,"%like%")
+importFrom(xml2,read_html)
+importFrom(xml2,xml_add_sibling)
+importFrom(xml2,xml_find_all)
+importFrom(xml2,xml_remove)
diff --git a/processing/wikiproc/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R
index b78b74f8e8a0b0badfc313e729e39659236f6463..e541a1d86aa0aac0e65e806ca0d5a4e5ff1c98cc 100644
--- a/processing/wikiproc/R/CleanHtml.R
+++ b/processing/wikiproc/R/CleanHtml.R
@@ -2,11 +2,6 @@
 
 # Author: Lucas
 
-library(rvest)
-library(stringi)
-library(textclean)
-library(xml2)
-
 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
diff --git a/processing/wikiproc/R/GetData.R b/processing/wikiproc/R/get_data.R
similarity index 95%
rename from processing/wikiproc/R/GetData.R
rename to processing/wikiproc/R/get_data.R
index 2300cb676ad71130d2b789bd97cf58803a513453..d5dc1411e7b139563145a729371efb746d66b622 100644
--- a/processing/wikiproc/R/GetData.R
+++ b/processing/wikiproc/R/get_data.R
@@ -2,10 +2,6 @@
 
 # Author: David
 
-library(WikipediR) # For querying wikipedia
-library(rvest) # For getting the list of physicits
-library(xml2)
-
 ## Though we could get the pages within the category 'physicists' with something like this
 ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
 ## this gives us only about 50 pages.
@@ -19,7 +15,8 @@
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+#' @export
+get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 
   dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
   dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
diff --git a/processing/wikiproc/R/get_no_of_spouses.R b/processing/wikiproc/R/get_no_of_spouses.R
new file mode 100755
index 0000000000000000000000000000000000000000..c0fb31eeec13d9e4724da79a6ec7a3a35253e40b
--- /dev/null
+++ b/processing/wikiproc/R/get_no_of_spouses.R
@@ -0,0 +1,43 @@
+### get_no_of_spouses.R
+### Extracts the number of spouses from the infobox.
+### If no infobox or no information about spouses is found, it assumes there are none.
+### Not for use in production; this does not actually get information from text.
+
+# Author: David
+
+#' Reads the number of spouses from the infobox of a Wikipedia article
+#'
+#' @param article Wikipedia article in html format
+#'
+#' @return Integer indicating the number of spouses
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' no.spouses <- get_no_of_spouses(articles$Text[54])
+#'
+#' no.spouses
+#' }
+get_no_of_spouses <- function(article) {
+
+  # If there is no infobox we assume there were no spouses
+  if (!grepl("vcard", article)) {
+    return(0)
+  }
+
+  infoBox <- get_infobox(article)
+
+  # Get the spouse field
+  spouses <- infoBox[infoBox$Desc %like% "Spouse", ]$Content
+  # Remove everything in parentheses
+  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
+  # Split the strings by newlines to get one spouse per line
+  spouses <- strsplit(spouses, "\n")
+  spouses <- unlist(spouses)
+  if (length(spouses) > 0) {
+    return(length(spouses))
+  }
+  return(0)
+}
diff --git a/processing/wikiproc/R/import_packages.R b/processing/wikiproc/R/import_packages.R
new file mode 100644
index 0000000000000000000000000000000000000000..03025a3e2cbb9f89380516124e74c48f8b4b95c2
--- /dev/null
+++ b/processing/wikiproc/R/import_packages.R
@@ -0,0 +1,11 @@
+### File used to automatically create package imports with roxygen2
+### Note that it is discouraged to import many packages fully, to avoid name conflicts
+### If possible, reference functions directly, e.g. reshape2::melt()
+### There is a (very) minor performance penalty for ::;
+### if some functions are used frequently you may just import them
+### with something like @importFrom reshape2 melt cast
+
+#' @import rvest
+#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
+#' @importFrom data.table %like%
+NULL
\ No newline at end of file
diff --git a/processing/wikiproc/R/nlp_annotate.R b/processing/wikiproc/R/nlp_annotate.R
new file mode 100644
index 0000000000000000000000000000000000000000..c83921ec93dcc21b20cbebec03c1b458a2bd8609
--- /dev/null
+++ b/processing/wikiproc/R/nlp_annotate.R
@@ -0,0 +1,74 @@
+#' Initialize the nlp backend
+#'
+#' A wrapper used to set the python environment and call cnlp_init
+#'
+#' @param type Type of python env to use, either "conda" or "python"
+#' @param value Connection string: if using a conda environment, its name;
+#'   if using python directly, the path to the python executable
+#'
+#' @return Does not return data
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' init_nlp("conda", "spcy")
+#' }
+init_nlp <- function(type, value) {
+  if (type == "conda") {
+    reticulate::use_condaenv(value, required = TRUE)
+  } else if (type == "python") {
+    reticulate::use_python(value, required = TRUE)
+  }
+  cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
+}
+
+#' Create annotations for the given text
+#'
+#' @param text Text to annotate
+#' @param article.id Article ID used for caching
+#' @param article.rev.id Article revision ID used for caching
+#' @param use.cache Should cached data be used
+#' @param write.cache Should the generated annotations be cached
+#' @param data.dir Directory the data should be read from and/or written to
+#'
+#' @return Annotation object for use with cleanNLP methods
+#' @export
+create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+
+  # Generate the cache filename; paste0 may pad the article id with leading whitespace
+  # (likely because apply() format-pads numbers when coercing rows), so strip spaces again
+
+  filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
+
+  # Check if there is a cached version of the annotations for this article in this specific revision
+
+  if (use.cache && file.exists(filename)) {
+    res <- tryCatch({
+      readRDS(filename)
+    }, error = function(e) {
+      cat("Cached data seems to be corrupted, redoing annotation.\n")
+      NULL
+    })
+    if (!is.null(res)) return(res)
+  }
+
+  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
+
+  # Write cache if desired
+
+  if (write.cache) {
+    if (!dir.exists(data.dir)) {
+      dir.create(data.dir)
+    }
+    if (!dir.exists(paste(data.dir, "annotations", sep = .Platform$file.sep))) {
+      dir.create(paste(data.dir, "annotations", sep = .Platform$file.sep))
+    }
+    saveRDS(annotation, filename)
+  }
+
+  # Return data
+  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
+  # But then again, it suggests snake case for variables...
+
+  return(annotation)
+}
\ No newline at end of file
diff --git a/processing/wikiproc/R/utils.R b/processing/wikiproc/R/utils.R
new file mode 100644
index 0000000000000000000000000000000000000000..a518f16f9fa62497250c8d64f87cf9dc2bb012ab
--- /dev/null
+++ b/processing/wikiproc/R/utils.R
@@ -0,0 +1,43 @@
+### Utility functions used internally
+
+
+#' Extract the infobox contents from Wikipedia articles
+#'
+#' @param article Character vector containing the contents of a Wikipedia
+#'   article as html
+#'
+#' @return Data frame holding the contents of the table
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' infobox <- get_infobox(articles$Text[54])
+#'
+#' infobox[3:4, ]
+#' }
+get_infobox <- function(article) {
+  # Read page as html
+  page <- read_html(article)
+
+  # Extracting text from the html will erase all <br> tags;
+  # this replaces them with line breaks
+
+  xml_find_all(page, ".//br") %>%
+    xml_add_sibling("p", "\n")
+
+  xml_find_all(page, ".//br") %>%
+    xml_remove()
+
+  # Get the info box
+  # Will throw an error if there isn't any, so that should be checked beforehand
+
+  table <- page %>%
+    html_nodes("table.vcard") %>%
+    html_table(fill = TRUE) %>%
+    .[[1]]
+
+  colnames(table) <- c("Desc", "Content")
+
+  return(table)
+}
diff --git a/processing/wikiproc/man/create_annotations.Rd b/processing/wikiproc/man/create_annotations.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..305b279d56dea9811b9c147f0912a005bb1f808e
--- /dev/null
+++ b/processing/wikiproc/man/create_annotations.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\name{create_annotations}
+\alias{create_annotations}
+\title{Create annotations for the given text}
+\usage{
+create_annotations(text, article.id, article.rev.id, use.cache = TRUE,
+  write.cache = FALSE, data.dir = "data")
+}
+\arguments{
+\item{text}{Text to annotate}
+
+\item{article.id}{Article ID used for caching}
+
+\item{article.rev.id}{Article revision ID used for caching}
+
+\item{use.cache}{Should cached data be used}
+
+\item{write.cache}{Should the generated annotations be cached}
+
+\item{data.dir}{Directory the data should be read from and/or written to}
+}
+\value{
+Annotation object for use with cleanNLP methods
+}
+\description{
+Create annotations for the given text
+}
diff --git a/processing/wikiproc/man/getData.Rd b/processing/wikiproc/man/get_data.Rd
similarity index 77%
rename from processing/wikiproc/man/getData.Rd
rename to processing/wikiproc/man/get_data.Rd
index ec865807ac52c5cc079ccda9998e01632f97e969..cec7d173ad8abf28d0811a9f6ed2a00cc3f77b13 100644
--- a/processing/wikiproc/man/getData.Rd
+++ b/processing/wikiproc/man/get_data.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/GetData.R
-\name{getData}
-\alias{getData}
+% Please edit documentation in R/get_data.R
+\name{get_data}
+\alias{get_data}
 \title{Retrieve wikipedia articles about physicists}
 \usage{
-getData(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
+get_data(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
 }
 \arguments{
 \item{use.cache}{Use cached data if it exists over downloading new data}
diff --git a/processing/wikiproc/man/get_infobox.Rd b/processing/wikiproc/man/get_infobox.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..ef8d03180df7bce6c794ee1f7377e22de96c06af
--- /dev/null
+++ b/processing/wikiproc/man/get_infobox.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{get_infobox}
+\alias{get_infobox}
+\title{Extract the infobox contents from Wikipedia articles}
+\usage{
+get_infobox(article)
+}
+\arguments{
+\item{article}{Character vector containing the contents of a Wikipedia
+article as html}
+}
+\value{
+Data frame holding the contents of the table
+}
+\description{
+Extract the infobox contents from Wikipedia articles
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+infobox <- get_infobox(articles$Text[54])
+
+infobox[3:4, ]
+}
+}
diff --git a/processing/wikiproc/man/get_no_of_spouses.Rd b/processing/wikiproc/man/get_no_of_spouses.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..131c526cb56e28994b3295be2ada26343ecfb103
--- /dev/null
+++ b/processing/wikiproc/man/get_no_of_spouses.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_no_of_spouses.R
+\name{get_no_of_spouses}
+\alias{get_no_of_spouses}
+\title{Reads the number of spouses from the infobox of a Wikipedia article}
+\usage{
+get_no_of_spouses(article)
+}
+\arguments{
+\item{article}{Wikipedia article in html format}
+}
+\value{
+Integer indicating the number of spouses
+}
+\description{
+Reads the number of spouses from the infobox of a Wikipedia article
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+no.spouses <- get_no_of_spouses(articles$Text[54])
+
+no.spouses
+}
+}
diff --git a/processing/wikiproc/man/init_nlp.Rd b/processing/wikiproc/man/init_nlp.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..47644aaed461189eefcea0cfaf43dfcbcb1f045e
--- /dev/null
+++ b/processing/wikiproc/man/init_nlp.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\name{init_nlp}
+\alias{init_nlp}
+\title{Initialize the nlp backend}
+\usage{
+init_nlp(type, value)
+}
+\arguments{
+\item{type}{Type of python env to use, either "conda" or "python"}
+
+\item{value}{Connection string: if using a conda environment, its name;
+if using python directly, the path to the python executable}
+}
+\value{
+Does not return data
+}
+\description{
+A wrapper used to set the python environment and call cnlp_init
+}
+\examples{
+\dontrun{
+init_nlp("conda", "spcy")
+}
+}
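
Taken together, the patch replaces the old ProcessNER.R bootstrap with the exported init_nlp() and renames the camelCase API to snake_case. Below is a minimal end-to-end usage sketch of the refactored calls, assuming the "spcy" conda environment from the diff exists; the $Text column is taken from the roxygen examples, while the id and revision-id columns are addressed positionally (columns 2 and 3), mirroring the article[2]/article[3] indices in master.R:

    library(wikiproc)

    # Point reticulate at the conda env and initialize the cleanNLP/spacy backend
    init_nlp("conda", "spcy")

    # Fetch articles (title, id, revision id, html full text), using the cache if present
    articles <- get_data(use.cache = TRUE, data.dir = "data")

    # Annotate one article and cache the result under <data.dir>/annotations/
    annotation <- create_annotations(articles$Text[54], articles[54, 2],
                                     articles[54, 3], write.cache = TRUE)

    # Rough spouse count read from the infobox (explicitly not text-based extraction)
    no.spouses <- get_no_of_spouses(articles$Text[54])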