diff --git a/processing/bin/ProcessNER.R b/processing/bin/ProcessNER.R
deleted file mode 100644
index 775ffb17ffb198b7707bfd51b8e23880cbf2cddd..0000000000000000000000000000000000000000
--- a/processing/bin/ProcessNER.R
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env Rscript
-
-### Provides functionality to use NER, POS and Dependency Grammars
-
-## Author: David
-
-cat("Initializing spacy backend...\n")
-
-# It's important to do this prior to loading any python related stuff
-
-reticulate::use_condaenv("spcy", required = TRUE)
-
-# Load librarys
-
-library(cleanNLP)
-
-# Init nlp models
-
-cnlp_init_spacy(entity_flag = TRUE)
-
-cat("Done.\n")
diff --git a/processing/bin/Master.R b/processing/script/master.R
similarity index 81%
rename from processing/bin/Master.R
rename to processing/script/master.R
index 8e338d5d65a3ada707fbabdd63d3e6f8b8a620fe..22cd34728e20a5eb1906d2a562431af670a3c2ca 100755
--- a/processing/bin/Master.R
+++ b/processing/script/master.R
@@ -3,10 +3,13 @@
 ### This script consolidates everything
 
 library(pbapply)
-library(rvest)
 library(wikiproc)
 library(rprojroot)
 
+## Set up nlp
+
+init_nlp("conda", "spcy")
+
 ## Fetch data
 
 cat("Starting data import...\n")
@@ -15,7 +18,7 @@ cat("Starting data import...\n")
 
 project_root <- find_root(has_file("README.md"))
 data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
 
-articles <- wikiproc:::getData(use.cache = TRUE, data.dir = data_dir)
+articles <- get_data(use.cache = TRUE, data.dir = data_dir)
 
 ## Data processing
@@ -31,11 +34,11 @@ results <- pbapply(articles, 1, function(article) {
 
   ## Data preprocessing/annotating
 
-  # annotation <- createAnnotations(cleaned.text, article[2], article[3])
+  annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)
 
   ## Extract information from Text
 
-  no.spouses <- wikiproc:::getNoOfSpouses(article[4])
+  no.spouses <- get_no_of_spouses(article[4])
 
   ## Create Results
diff --git a/processing/wikiproc/NAMESPACE b/processing/wikiproc/NAMESPACE
index 6ae926839dd1829f1016a96f766d970ff184ad97..5e2056db2f27b4dfd24924442b85fe72621177c8 100644
--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -1,2 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
+export(create_annotations)
+export(get_data)
+export(get_no_of_spouses)
+export(init_nlp)
+import(rvest)
+importFrom(data.table,"%like%")
+importFrom(xml2,read_html)
+importFrom(xml2,xml_add_sibling)
+importFrom(xml2,xml_find_all)
+importFrom(xml2,xml_remove)
diff --git a/processing/wikiproc/R/CleanHtml.R b/processing/wikiproc/R/CleanHtml.R
index 182e9c839e512b15475b51821304eafec72cf959..0421cd0bd08aa5bd7c7259072a7267dcb0435cd4 100644
--- a/processing/wikiproc/R/CleanHtml.R
+++ b/processing/wikiproc/R/CleanHtml.R
@@ -2,10 +2,6 @@
 
 # Author: Lucas
 
-library(rvest)
-library(stringi)
-library(textclean)
-
 #' Clean a html formatted wikipedia page.
 #' Nodes of interest from the DOM are extracted and then cleaned from all html
 #' tags and annotations.
@@ -24,8 +20,8 @@ cleanHtml <- function(html) {
   # - replace multiple newlines with single newline
   result <- read_html(html) %>%
     html_nodes(css="h3:nth-child(13) , h4 , p+ h3 , p") %>%
-    stri_flatten(collapse = " ") %>%
-    replace_html() %>%
+    stringi::stri_flatten(collapse = " ") %>%
+    textclean::replace_html() %>%
     gsub("\\[\\d*\\]", "", .) %>%
     gsub(" +", " ", .) %>%
     gsub("\n ", "\n", .) %>%
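
The rewritten cleanHtml() above now calls stringi and textclean through :: instead of attaching them. A minimal sketch of exercising the function from the console, assuming an article frame from get_data() with a Text column; cleanHtml() is not exported, hence the ::: access:

articles <- wikiproc::get_data(use.cache = TRUE)

# Strips tags, [n] footnote markers and redundant whitespace from one article
plain <- wikiproc:::cleanHtml(articles$Text[1])
substr(plain, 1, 200)
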
diff --git a/processing/wikiproc/R/GetNoOfSpouses.R b/processing/wikiproc/R/GetNoOfSpouses.R
deleted file mode 100755
index 5190edab023ba3063d2afe1bc4f67f85f7cf4e36..0000000000000000000000000000000000000000
--- a/processing/wikiproc/R/GetNoOfSpouses.R
+++ /dev/null
@@ -1,62 +0,0 @@
-### GetNoOfSpouses.R
-### This extracts the number of spouses from the infobox
-### If no infobox or no information about spouses is found assumes there are none
-### Not for use in production, this does not actually get information from text
-
-# Author: David
-
-## Librarys
-
-library(rvest)
-library(data.table)
-
-### Get number of spouses
-
-getNoOfSpouses <- function(article) {
-
-  # If there is no infobox we assume there were no spouses
-  if(!grepl("vcard", article)) {
-    return(0)
-  }
-
-  infoBox <- getInfoBox(article)
-
-  # Get the spouse field
-  spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
-  # Remove everything in parentheses
-  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
-  # Split the strings by newlines to get one spouse per line
-  spouses <- strsplit(spouses, "\n")
-  spouses <- unlist(spouses)
-  if(length(spouses) > 0) {
-    return(length(spouses))
-  }
-  return(0)
-}
-
-### Converts info box to table
-getInfoBox <- function(article) {
-  # Read page as html
-  page <- read_html(article)
-
-  # Extracting text from the html will erase all <br> tags,
-  # this will replace them with line breaks
-
-  xml_find_all(page, ".//br") %>%
-    xml_add_sibling("p", "\n")
-
-  xml_find_all(page, ".//br") %>%
-    xml_remove()
-
-  # Get the info box
-  # Will throw an error if there isnt any, so that should be checked beforehand
-
-  table <- page %>%
-    html_nodes("table.vcard") %>%
-    html_table(fill = TRUE) %>%
-    .[[1]]
-
-  colnames(table) <- c("Desc", "Content")
-
-  return(table)
-}
diff --git a/processing/wikiproc/R/createAnnotations.R b/processing/wikiproc/R/createAnnotations.R
deleted file mode 100644
index b9ca6ebea7029055fc484efb723beb4605a607c9..0000000000000000000000000000000000000000
--- a/processing/wikiproc/R/createAnnotations.R
+++ /dev/null
@@ -1,41 +0,0 @@
-library(cleanNLP)
-
-createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
-
-  # Generate filename, for some reason there paste0 will pad the article id with leading whitespaces
-  # To prevent this we stip 'em again
-
-  filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
-
-  # Check if there is a cached version of the annotations for this article in this specific revision
-
-  if(use.cache & file.exists(filename)) {
-    res <- tryCatch({
-      data <- readRDS(filename)
-      data
-    }, error = function (e) {
-      cat("Cached data seems to be corrupted, redoing annotation.\n")
-    })
-    return(res)
-  }
-
-  annotation <- cnlp_annotate(text, as_strings = TRUE)
-
-  # Write cache if desired
-
-  if(write.cache) {
-    if (!dir.exists("data")) {
-      dir.create("data")
-    }
-    if (!dir.exists("data/annotations")) {
-      dir.create("data/annotations")
-    }
-    saveRDS(annotation, filename)
-  }
-
-  # Return data
-  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
-  # But then again, it suggests snake case for variables...
-
-  return(annotation)
-}
\ No newline at end of file
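
Both deletions above are moves, not removals: getNoOfSpouses() and createAnnotations() resurface below as the exported get_no_of_spouses() and create_annotations(). Because the NAMESPACE shown earlier is generated, exports and imports are refreshed from the roxygen tags rather than edited by hand; a sketch of that step, assuming devtools is installed:

# run with processing/wikiproc/ as the working directory
devtools::document()   # regenerates NAMESPACE and man/*.Rd from the roxygen comments
devtools::check()      # optionally verify the package still builds
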
diff --git a/processing/wikiproc/R/GetData.R b/processing/wikiproc/R/get_data.R
similarity index 90%
rename from processing/wikiproc/R/GetData.R
rename to processing/wikiproc/R/get_data.R
index ef8713e866678a1313db70927bc63216683f2d79..abcf0094a43ee04b8385e6e978258170777b0f7b 100644
--- a/processing/wikiproc/R/GetData.R
+++ b/processing/wikiproc/R/get_data.R
@@ -2,10 +2,6 @@
 
 # Author: David
 
-library(WikipediR) # For querying wikipedia
-library(rvest) # For getting the list of physicits
-library(xml2)
-
 ## Though we could get the pages within the category 'physicists' with something like this
 ## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
 ## this gives us only about 50 pages.
@@ -18,7 +14,8 @@ library(xml2)
 #' @param write.cache Write downloaded results into cache for use on future calls
 #' @param data.dir Directory the data should be read from and/or written to
 #' @return data.frame containing the title, id, revisionID and html-formatted full text
-getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+#' @export
+get_data <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 
   dest.articlesRDS <- paste(data.dir, "articles.RDS", sep = .Platform$file.sep)
   dest.articlesCSV <- paste(data.dir, "articles.csv", sep = .Platform$file.sep)
@@ -81,9 +78,9 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
 
   # Call the wikipedia api for each entry in our list
 
-  articles <- pblapply(physicists, function(x) {
+  articles <- pbapply::pblapply(physicists, function(x) {
     res <- tryCatch({
-      article <- page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
+      article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
       # Check if the article is a redirect page
       if (grepl(".redirectText", article$parse$text$`*`)) {
         # Get the real article name
@@ -101,7 +98,7 @@ getData <- function(use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
         Encoding(tmp) <- "UTF-8"
         pname <- tmp
 
-        article <- page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
+        article <- WikipediR::page_content("en", "wikipedia", page_name = pname, as_wikitext = FALSE)
       }
       data.frame(Title = article$parse$title,
                  PageID = article$parse$pageid,
diff --git a/processing/wikiproc/R/get_no_of_spouses.R b/processing/wikiproc/R/get_no_of_spouses.R
new file mode 100755
index 0000000000000000000000000000000000000000..c0fb31eeec13d9e4724da79a6ec7a3a35253e40b
--- /dev/null
+++ b/processing/wikiproc/R/get_no_of_spouses.R
@@ -0,0 +1,43 @@
+### get_no_of_spouses.R
+### This extracts the number of spouses from the infobox
+### If no infobox or no information about spouses is found, assumes there are none
+### Not for use in production, this does not actually get information from text
+
+# Author: David
+
+#' Reads the number of spouses from the infobox of a wikipedia article
+#'
+#' @param article Wikipedia article in html format
+#'
+#' @return Integer indicating the number of spouses
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' no.spouses <- get_no_of_spouses(articles$Text[54])
+#'
+#' no.spouses
+#' }
+get_no_of_spouses <- function(article) {
+
+  # If there is no infobox we assume there were no spouses
+  if (!grepl("vcard", article)) {
+    return(0)
+  }
+
+  infoBox <- get_infobox(article)
+
+  # Get the spouse field
+  spouses <- infoBox[infoBox$Desc %like% "Spouse",]$Content
+  # Remove everything in parentheses
+  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
+  # Split the strings by newlines to get one spouse per line
+  spouses <- strsplit(spouses, "\n")
+  spouses <- unlist(spouses)
+  if (length(spouses) > 0) {
+    return(length(spouses))
+  }
+  return(0)
+}
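
A quick sanity check of the fallback behaviour above; both branches that return 0 are easy to hit from the console. The article index is an arbitrary pick:

articles <- get_data(use.cache = TRUE)

get_no_of_spouses(articles$Text[54])     # one count per newline-separated Spouse entry
get_no_of_spouses("<p>no infobox</p>")   # no vcard table at all, so 0
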
diff --git a/processing/wikiproc/R/import_packages.R b/processing/wikiproc/R/import_packages.R
new file mode 100644
index 0000000000000000000000000000000000000000..03025a3e2cbb9f89380516124e74c48f8b4b95c2
--- /dev/null
+++ b/processing/wikiproc/R/import_packages.R
@@ -0,0 +1,11 @@
+### File used to automatically create package imports with roxygen2
+### Note that importing many packages fully is discouraged, to avoid name conflicts
+### If possible reference functions directly e.g. reshape2::melt()
+### There is a (very) minor performance penalty for ::,
+### if some functions are used frequently you may just import them
+### with something like @importFrom reshape2 melt cast
+
+#' @import rvest
+#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove read_html
+#' @importFrom data.table %like%
+NULL
\ No newline at end of file
diff --git a/processing/wikiproc/R/nlp_annotate.R b/processing/wikiproc/R/nlp_annotate.R
new file mode 100644
index 0000000000000000000000000000000000000000..c83921ec93dcc21b20cbebec03c1b458a2bd8609
--- /dev/null
+++ b/processing/wikiproc/R/nlp_annotate.R
@@ -0,0 +1,74 @@
+#' Initialize the nlp backend
+#'
+#' A wrapper used to set the python environment and call cnlp_init
+#'
+#' @param type Type of python env to use, either "conda" or "python"
+#' @param value Connection string: if using a conda environment, its name;
+#'   if using python directly, the path to the python executable
+#'
+#' @return Does not return data
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' init_nlp("conda", "spcy")
+#' }
+init_nlp <- function(type, value) {
+  if (type == "conda") {
+    reticulate::use_condaenv(value, required = TRUE)
+  } else if (type == "python") {
+    reticulate::use_python(value, required = TRUE)
+  }
+  cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
+}
+
+#' Create annotations for the given text
+#'
+#' @param text Text to annotate
+#' @param article.id ArticleID used for caching
+#' @param article.rev.id ArticleRevisionID used for caching
+#' @param use.cache Should cached data be used
+#' @param write.cache Should the generated annotations be cached
+#' @param data.dir Directory the data should be read from and/or written to
+#'
+#' @return Annotation object for use with cleanNLP methods
+#' @export
+create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
+
+  # Generate filename; for some reason paste0 will pad the article id with leading whitespace,
+  # so we strip it again
+
+  filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
+
+  # Check if there is a cached version of the annotations for this article in this specific revision
+
+  if (use.cache && file.exists(filename)) {
+    res <- tryCatch({
+      data <- readRDS(filename)
+      data
+    }, error = function (e) {
+      cat("Cached data seems to be corrupted, redoing annotation.\n")
+    })
+    if (!is.null(res)) {
+      return(res)
+    }
+  }
+
+  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
+
+  # Write cache if desired, creating the annotations directory under data.dir if needed
+
+  if (write.cache) {
+    annotations.dir <- paste(data.dir, "annotations", sep = .Platform$file.sep)
+    if (!dir.exists(annotations.dir)) {
+      dir.create(annotations.dir, recursive = TRUE)
+    }
+    saveRDS(annotation, filename)
+  }
+
+  # Return data
+  # On a side note: Should we do this? The tidyverse style guide discourages explicit returns.
+  # But then again, it suggests snake case for variables...
+
+  return(annotation)
+}
\ No newline at end of file
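
Annotations are keyed on the articleID-revisionID pair, so an already-annotated revision is read from disk instead of being re-annotated. A minimal round trip, assuming the "spcy" conda environment from above, the default data directory and the cleanNLP 2.x accessors used in this diff; the sample sentence is made up:

init_nlp("conda", "spcy")

ann <- create_annotations("Marie Curie was a physicist.",
                          article.id = 1, article.rev.id = 1, write.cache = TRUE)

# Same id and revision: served from data/annotations/1-1.RDS, the text argument is ignored
ann2 <- create_annotations("different text, same revision",
                           article.id = 1, article.rev.id = 1)

cleanNLP::cnlp_get_entity(ann)   # entity table produced by the spacy backend
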
diff --git a/processing/wikiproc/R/utils.R b/processing/wikiproc/R/utils.R
new file mode 100644
index 0000000000000000000000000000000000000000..a518f16f9fa62497250c8d64f87cf9dc2bb012ab
--- /dev/null
+++ b/processing/wikiproc/R/utils.R
@@ -0,0 +1,43 @@
+### Utility functions used internally
+
+
+#' Extract the infobox contents from wikipedia articles
+#'
+#' @param article Character vector containing the contents of a wikipedia
+#'   article as html
+#'
+#' @return Data frame holding the contents of the table
+#'
+#' @examples
+#' \dontrun{
+#' articles <- get_data()
+#'
+#' infobox <- get_infobox(articles$Text[54])
+#'
+#' infobox[3:4,]
+#' }
+get_infobox <- function(article) {
+  # Read page as html
+  page <- read_html(article)
+
+  # Extracting text from the html will erase all <br> tags,
+  # this will replace them with line breaks
+
+  xml_find_all(page, ".//br") %>%
+    xml_add_sibling("p", "\n")
+
+  xml_find_all(page, ".//br") %>%
+    xml_remove()
+
+  # Get the info box
+  # Will throw an error if there isn't any, so that should be checked beforehand
+
+  table <- page %>%
+    html_nodes("table.vcard") %>%
+    html_table(fill = TRUE) %>%
+    .[[1]]
+
+  colnames(table) <- c("Desc", "Content")
+
+  return(table)
+}
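
get_infobox() stays internal; the <br>-to-newline replacement above is what lets get_no_of_spouses() split multiple spouses on "\n". A sketch via ::: since it is not exported, with a plain grepl() filter standing in for the %like% lookup used above:

articles <- get_data(use.cache = TRUE)

ib <- wikiproc:::get_infobox(articles$Text[54])
ib[grepl("Spouse", ib$Desc), ]   # the Desc/Content row the spouse counter parses
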
diff --git a/processing/wikiproc/man/cleanHtml.Rd b/processing/wikiproc/man/cleanHtml.Rd
index 56994f44d9eadcd9f8cb1fee71bd54d91e518629..7247852e83e59f5f6c7ba54df2f90692b7f98a9c 100644
--- a/processing/wikiproc/man/cleanHtml.Rd
+++ b/processing/wikiproc/man/cleanHtml.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/CleanHtml.R
 \name{cleanHtml}
 \alias{cleanHtml}
-\title{Clean a html formatted wikipedia page. 
+\title{Clean a html formatted wikipedia page.
 Nodes of interest from the DOM are extracted and then cleaned from all html
 tags and annotations.}
 \usage{
@@ -15,7 +15,7 @@ cleanHtml(html)
 Plaintext document containing only the maintext of the give wikipedia page.
 }
 \description{
-Clean a html formatted wikipedia page. 
+Clean a html formatted wikipedia page.
 Nodes of interest from the DOM are extracted and then cleaned from all html
 tags and annotations.
 }
diff --git a/processing/wikiproc/man/create_annotations.Rd b/processing/wikiproc/man/create_annotations.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..305b279d56dea9811b9c147f0912a005bb1f808e
--- /dev/null
+++ b/processing/wikiproc/man/create_annotations.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\name{create_annotations}
+\alias{create_annotations}
+\title{Create annotations for the given text}
+\usage{
+create_annotations(text, article.id, article.rev.id, use.cache = TRUE,
+  write.cache = FALSE, data.dir = "data")
+}
+\arguments{
+\item{text}{Text to annotate}
+
+\item{article.id}{ArticleID used for caching}
+
+\item{article.rev.id}{ArticleRevisionID used for caching}
+
+\item{use.cache}{Should cached data be used}
+
+\item{write.cache}{Should the generated annotations be cached}
+
+\item{data.dir}{Directory the data should be read from and/or written to}
+}
+\value{
+Annotation object for use with cleanNLP methods
+}
+\description{
+Create annotations for the given text
+}
diff --git a/processing/wikiproc/man/getData.Rd b/processing/wikiproc/man/get_data.Rd
similarity index 65%
rename from processing/wikiproc/man/getData.Rd
rename to processing/wikiproc/man/get_data.Rd
index 13e362d15d94d684eb53fe4c6a8001bc4b89949e..cec7d173ad8abf28d0811a9f6ed2a00cc3f77b13 100644
--- a/processing/wikiproc/man/getData.Rd
+++ b/processing/wikiproc/man/get_data.Rd
@@ -1,15 +1,17 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/GetData.R
-\name{getData}
-\alias{getData}
+% Please edit documentation in R/get_data.R
+\name{get_data}
+\alias{get_data}
 \title{Retrieve wikipedia articles about physicists}
 \usage{
-getData(use.cache = TRUE, write.cache = FALSE)
+get_data(use.cache = TRUE, write.cache = FALSE, data.dir = "data")
 }
 \arguments{
 \item{use.cache}{Use cached data if it exists over downloading new data}
 
 \item{write.cache}{Write downloaded results into cache for use on future calls}
+
+\item{data.dir}{Directory the data should be read from and/or written to}
 }
 \value{
 data.frame containing the title, id, revisionID and html-formatted full text
diff --git a/processing/wikiproc/man/get_infobox.Rd b/processing/wikiproc/man/get_infobox.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..ef8d03180df7bce6c794ee1f7377e22de96c06af
--- /dev/null
+++ b/processing/wikiproc/man/get_infobox.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{get_infobox}
+\alias{get_infobox}
+\title{Extract the infobox contents from wikipedia articles}
+\usage{
+get_infobox(article)
+}
+\arguments{
+\item{article}{Character vector containing the contents of a wikipedia
+article as html}
+}
+\value{
+Data frame holding the contents of the table
+}
+\description{
+Extract the infobox contents from wikipedia articles
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+infobox <- get_infobox(articles$Text[54])
+
+infobox[3:4,]
+}
+}
diff --git a/processing/wikiproc/man/get_no_of_spouses.Rd b/processing/wikiproc/man/get_no_of_spouses.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..131c526cb56e28994b3295be2ada26343ecfb103
--- /dev/null
+++ b/processing/wikiproc/man/get_no_of_spouses.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_no_of_spouses.R
+\name{get_no_of_spouses}
+\alias{get_no_of_spouses}
+\title{Reads the number of spouses from the infobox of a wikipedia article}
+\usage{
+get_no_of_spouses(article)
+}
+\arguments{
+\item{article}{Wikipedia article in html format}
+}
+\value{
+Integer indicating the number of spouses
+}
+\description{
+Reads the number of spouses from the infobox of a wikipedia article
+}
+\examples{
+\dontrun{
+articles <- get_data()
+
+no.spouses <- get_no_of_spouses(articles$Text[54])
+
+no.spouses
+}
+}
diff --git a/processing/wikiproc/man/init_nlp.Rd b/processing/wikiproc/man/init_nlp.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..47644aaed461189eefcea0cfaf43dfcbcb1f045e
--- /dev/null
+++ b/processing/wikiproc/man/init_nlp.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nlp_annotate.R
+\name{init_nlp}
+\alias{init_nlp}
+\title{Initialize the nlp backend}
+\usage{
+init_nlp(type, value)
+}
+\arguments{
+\item{type}{Type of python env to use, either "conda" or "python"}
+
+\item{value}{Connection string: if using a conda environment, its name;
+if using python directly, the path to the python executable}
+}
+\value{
+Does not return data
+}
+\description{
+A wrapper used to set the python environment and call cnlp_init
+}
+\examples{
+\dontrun{
+init_nlp("conda", "spcy")
+}
+}
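
Taken together, the diff turns the former bin/ scripts into a package API and leaves master.R as glue. An end-to-end sketch of the resulting flow; the cleaning step via the internal cleanHtml() and the shape of the per-article result are assumptions, since those lines sit outside the hunks shown above:

library(pbapply)
library(wikiproc)
library(rprojroot)

init_nlp("conda", "spcy")   # must run before anything else touches python

project_root <- find_root(has_file("README.md"))
data_dir <- paste(project_root, "data", sep = .Platform$file.sep)

articles <- get_data(use.cache = TRUE, data.dir = data_dir)

results <- pbapply(articles, 1, function(article) {
  cleaned.text <- wikiproc:::cleanHtml(article[4])           # assumed cleaning step
  annotation <- create_annotations(cleaned.text, article[2], article[3], data.dir = data_dir)
  no.spouses <- get_no_of_spouses(article[4])
  # hypothetical result frame; the real columns are built further down in master.R
  data.frame(Name = article[1], NoSpouses = no.spouses, stringsAsFactors = FALSE)
})
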