preliminary result of Setup and Intents

91b46b4e · Jonas Wolff · 8cdbe661 · df83ccf1 · 91b46b4e · 91b46b4e
Commit 91b46b4e authored 6 years ago by Jonas Wolff
--- a/docs/abschluss_presentation/Wiki_Chatbot_Architecture.png
+++ b/docs/abschluss_presentation/Wiki_Chatbot_Architecture.png
--- a/docs/abschluss_presentation/aarons.png
+++ b/docs/abschluss_presentation/aarons.png
--- a/docs/abschluss_presentation/chatbot.png
+++ b/docs/abschluss_presentation/chatbot.png
--- a/docs/abschluss_presentation/eval_spouse.R
+++ b/docs/abschluss_presentation/eval_spouse.R
+library(ggplot2)
+
+## First off, collect the spouse entry from the infobox of every article.
+## The result holds NA for every article whose infobox is missing or has no
+## "Spouse(s)" row.
+
+spouses <- sapply(articles$Text, function(x) {
+  info_box <- wikiproc:::get_infobox(x)
+
+  # Presumably get_infobox() signals "no infobox" with a single NA -- TODO
+  # confirm. Calling is.na() on a whole infobox table yields more than one
+  # value, which `&&` rejects (an error since R 4.3), so the NA sentinel is
+  # tested explicitly instead of negating is.na(info_box).
+  no_info_box <- is.atomic(info_box) && length(info_box) == 1L &&
+    is.na(info_box)
+  if (no_info_box || !("Spouse(s)" %in% info_box$Desc)) {
+    return(NA)
+  }
+
+  info_box[info_box$Desc == "Spouse(s)", ]$Content
+})
+
+# Remove names, then strip annotations: first everything in parentheses
+# (together with preceding whitespace, replaced by a single space), then
+# everything in square brackets.
+
+spouses <- gsub(
+  "\\[.*?\\]", "",
+  gsub("\\s*\\([^\\)]+\\)", " ", unname(spouses))
+)
+
+# Bind the infobox spouses onto the results data frame so both can be compared
+
+results_with_infobox <- results[, c("name", "spouse")]
+results_with_infobox$spouses_infobox <- spouses
+
+# Keep only those rows where we got a value in both columns
+
+has_both <- !is.na(results_with_infobox$spouse) &
+  !is.na(results_with_infobox$spouses_infobox)
+results_with_both <- results_with_infobox[has_both, ]
+
+# Approximate (partial) edit distance between the extracted spouse and the
+# infobox spouse. Columns are addressed by name instead of using apply() with
+# positional indices, which first coerces the whole data frame to a character
+# matrix and silently breaks if the column order changes.
+
+results_with_both$distance <- vapply(
+  seq_len(nrow(results_with_both)),
+  function(i) {
+    adist(
+      results_with_both$spouse[i],
+      results_with_both$spouses_infobox[i],
+      partial = TRUE
+    )[1, 1]
+  },
+  numeric(1)
+)
+
+# Share of compared rows whose distance to the infobox value is at most 5
+
+precision <- sum(results_with_both$distance <= 5) / nrow(results_with_both)
+
+# Recall is a bit more difficult.
+# First off, check for every article whether it contains the lemma "marry"
+# at all.
+
+results_with_infobox$has_married <- apply(articles, 1, function(article) {
+  # Fields are taken by position from the article row; presumably 4 is the
+  # HTML text and 2/3 identify the article -- TODO confirm against `articles`.
+  plain_text <- wikiproc::clean_html(article[4])
+  annotated <- wikiproc::create_annotations(
+    plain_text, article[2], article[3],
+    data.dir = data_dir
+  )
+  lemmas <- cleanNLP::cnlp_get_token(annotated)$lemma
+  "marry" %in% lemmas
+})
+
+# Ratio of rows with an extracted spouse to rows whose article mentions
+# "marry", taken over all articles
+
+recall_high <- sum(!is.na(results_with_infobox$spouse)) /
+  sum(results_with_infobox$has_married)
+
+# Another possibility to calculate stuff
+
+# Same ratio, restricted to the rows for which the infobox lists a spouse.
+# Note that results_with_infobox itself is narrowed to that subset here.
+
+results_with_infobox <- results_with_infobox[
+  !is.na(results_with_infobox$spouses_infobox),
+]
+
+recall_low <- sum(!is.na(results_with_infobox$spouse)) /
+  sum(results_with_infobox$has_married)
+
+
+# One row per metric, in the layout the plot below expects
+eval_res <- data.frame(
+  Parameter = c("Precision", "Recall (Low Estimate)", "Recall (High Estimate)"),
+  Value = c(precision, recall_low, recall_high),
+  stringsAsFactors = FALSE
+)
+
+
+# Bar chart of the evaluation metrics; the value, rounded to two decimals, is
+# printed just above each bar and both axis titles are hidden.
+p <- ggplot(eval_res, aes(x = Parameter, y = Value)) +
+  geom_bar(position = "dodge", stat = "identity", fill = "blue", width = 0.5) +
+  geom_text(
+    aes(label = sprintf("%0.2f", round(Value, digits = 2))),
+    vjust = 0, nudge_y = 0.01
+  ) +
+  theme_minimal() +
+  theme(axis.title.x = element_blank(), axis.title.y = element_blank())
+p
+
+# Pass the plot explicitly instead of relying on ggplot2's "last plot" state
+ggsave(
+  "spouse_eval.png",
+  plot = p, path = "plots", width = 5, height = 4, units = "in"
+)
--- a/docs/abschluss_presentation/kurchatov.png
+++ b/docs/abschluss_presentation/kurchatov.png
--- a/docs/abschluss_presentation/plots.R
+++ b/docs/abschluss_presentation/plots.R
+library(ggplot2)
+
+# This assumes that the master script ran through and the result data frame is available
+
+# Number of non-NA values per column of `results`
+feature_count <- vapply(results, function(col) sum(!is.na(col)), integer(1))
+
+# Columns are named with `=`; using `<-` inside data.frame() would create
+# stray global variables and mangled column names that only resolve through
+# partial matching of `$`.
+feature_count <- data.frame(
+  Count = feature_count,
+  Feature = colnames(results),
+  stringsAsFactors = FALSE
+)
+
+# Drop features without any value as well as the "name" column itself
+feature_count <- feature_count[
+  feature_count$Count > 0 & feature_count$Feature != "name",
+]
+
+# Flipped coord barplot: one bar per feature, ordered by count, fill legend
+# hidden. The red reference line is a constant and therefore set outside
+# aes(); 983 is presumably the number of processed articles -- TODO confirm.
+p <- ggplot(
+  feature_count,
+  aes(x = reorder(Feature, Count), y = Count, fill = Feature)
+) +
+  geom_bar(stat = "identity", width = 0.5) +
+  geom_hline(yintercept = 983, colour = "red") +
+  coord_flip() +
+  guides(fill = "none") +
+  theme_minimal() +
+  labs(x = "Feature")
+p
+
+# Pass the plot explicitly instead of relying on ggplot2's "last plot" state
+ggsave(
+  "feature_count_flip.png",
+  plot = p, path = "plots", width = 4, height = 4, units = "in"
+)
+
+# Normal barplot: one bar per feature in descending order of count; the x
+# axis labels are hidden and the fill legend identifies the features. The red
+# reference line is a constant and therefore set outside aes(); 983 is
+# presumably the number of processed articles -- TODO confirm.
+p <- ggplot(
+  feature_count,
+  aes(x = reorder(Feature, -Count), y = Count, fill = Feature)
+) +
+  geom_bar(stat = "identity", width = 0.7) +
+  geom_hline(yintercept = 983, colour = "red") +
+  theme_minimal() +
+  theme(axis.text.x = element_blank()) +
+  labs(x = "Feature")
+p
+
+# Pass the plot explicitly instead of relying on ggplot2's "last plot" state
+ggsave(
+  "feature_count_normal.png",
+  plot = p, path = "plots", width = 5, height = 4, units = "in"
+)
+
--- a/docs/abschluss_presentation/plots/feature_count_flip.png
+++ b/docs/abschluss_presentation/plots/feature_count_flip.png
--- a/docs/abschluss_presentation/plots/feature_count_normal.png
+++ b/docs/abschluss_presentation/plots/feature_count_normal.png
--- a/docs/abschluss_presentation/plots/precision_birthdate.png
+++ b/docs/abschluss_presentation/plots/precision_birthdate.png
--- a/docs/abschluss_presentation/plots/rasa_architecture.png
+++ b/docs/abschluss_presentation/plots/rasa_architecture.png
--- a/docs/abschluss_presentation/plots/spouse_eval.png
+++ b/docs/abschluss_presentation/plots/spouse_eval.png
--- a/docs/abschluss_presentation/presentation.md
+++ b/docs/abschluss_presentation/presentation.md
+---
+title: Rasa Chatbot Demo
+separator: <!--s-->
+verticalSeparator: <!--v-->
+theme: solarized
+revealOptions:
+    transition: 'convex'
+    controls: true
+    slideNumber: true
+    
+---
+
+### Wikipedia Fact Chatbot
+
+David Fuhry, Lukas Gehrke, Leonard Haas, 
+
+Lucas Schons, Jonas Wolff
+
+<img src="chatbot.png" width="40%">
+
+<!--s-->
+
+### Inhalt
+
+* Thema & Forschungsfrage
+* Lösungsansatz
+* Ergebnisse
+
+<!--s-->
+
+### Thema - RASA Chatbot
+
+<section style="text-align: left;">
+RASA ist ein Framework für die Nutzung von **Chatbots** im Bereich _Conversational AI_. Das System muss mit **Facts** befüllt werden, um Antworten generieren zu können.
+
+_"The Rasa Stack is a set of open source machine learning tools for developers to create contextual chatbots and assistants."_
+
+<!--s-->
+
+### Thema - Architektur von RASA
+
+<img src="plots/rasa_architecture.png" width="70%">
+Note: Bild von Rasa NLU und Core Architektur erklären
+
+<!--s-->
+
+### Forschungsfrage
+
+- _Kann man dieses Wissen aus Texten akquirieren?_  
+- _Kann dies automatisch mit Textquellen durchgeführt werden?_  
+
+Dies soll anhand der _Wikipedia-Einträge von Physikern_ erprobt werden
+
+<!--s-->
+
+### Inhalt
+
+* ~~Thema & Forschungsfrage~~
+* Lösungsansatz
+* Ergebnisse
+
+<!--s-->
+
+### Lösungsansatz - RASA Bot konfigurieren
+
+* Für Training eines Bots werden _entities_, _text_ und _intents_ benötigt
+* Definition _intent_ aus [RASA Docs](https://rasa.com/docs/nlu/dataformat/):  
+_"The intent is the intent that should be associated with the text."_
+* Antworten über _Custom Actions_, dafür **Wissensbasis** benötigt
+
+<!--s-->
+
+### Lösungsansatz - RASA Bot konfigurieren
+
+#### Aufstellen von Intents zu Physikern:
+* awards
+* birthdate
+* birthplace
+* spouse
+* university
+* area of research
+* ...
+
+<!--s-->
+
+### Lösungsansatz - Datenquelle
+
+* 982 englischsprachige Wikipedia-Artikel
+* Scraping über jeweiligen Link mit `WikipediR`
+* HTML-Formatierung
+* Ca. 550 Wörter/Artikel
+<img src="kurchatov.png" width="70%">
+
+<!--s-->
+
+### Lösungsansatz - Processing
+
+```
+├── packages.list
+├── script
+│   └── master.R
+└── wikiproc
+    ├── DESCRIPTION
+    ├── NAMESPACE
+    ├── R
+    │   ├── get_awards.R
+    │   ├── get_birthdate.R
+    │   ├── get_birthplace.R
+    │   ├── get_data.R
+    │   ├── get_spouse.R
+    │   └── get_university.R
+    ├── man
+    └── tests
+```
+Note: Zu jedem Intent ein R-Skript. Master lädt die Artikel, speichert sie in einem Dataframe und ruft für jeden Artikel das clean_html-Skript und die Intent-processing-Skripte auf. Die extrahierten Informationen werden in einem neuen data-frame gespeichert. Außerdem wird das .tsv für den Bot generiert
+
+<!--s-->
+
+### Lösungsansatz - Gesamtarchitektur
+
+<img src="Wiki_Chatbot_Architecture.png" width="100%">
+Note: Schaubild der Gesamtarchitektur einfügen, AUF JEDEN FALL mit docker-Wal
+
+<!--s-->
+
+### Lösungsansatz - Processing, Extraktion Intents
+   
+#### `R/get_awards.R`
+
+* Annahme: Alle Auszeichnungen im Text werden von spacy getaggt
+* Matching aller Entities eines Textes gegen Menge an Stichwörtern.
+
+<!--s-->
+
+### Lösungsansatz - Processing, Extraktion Intents
+
+* Beispiel `R/get_spouse.R`
+* Identifiziere Sätze über Schlüsselwort _marry_ (lemma)
+* Nutze _Pattern_ auf _POS-Tags_
+* Verifiziere Ergebnisse über Physikernamen sowie _NER-Entities_
+
+<!--s-->
+
+### Vorführung RASA - Bot
+
+Note: I see what you did there
+
+<!--s-->
+
+### Inhalt
+
+* ~~Thema & Forschungsfrage~~
+* ~~Lösungsansatz~~
+* Ergebnisse
+
+<!--s-->
+
+### Ergebnisse
+
+##### Anzahl gewonnener Ergebnisse zu Intents; Rest ist `NA`
+<img src="plots/feature_count_flip.png" width="50%">
+Note: fancy plots mit precision und recall zu awards, birthdate und spouse
+
+<!--s-->
+
+### Ergebnisse
+
+<img src="plots/precision_birthdate.png" width="70%">
+Note: Die Auswertung erfolgte händisch über die ersten 300 Ergebnisse von get_birthdate.R mit den Ergebnissen der infobox als ground truth (auch wenn hier tlw NA steht). Als partial match wurden solche Ergebnisse gewertet, die sinnvolle Daten sind und bis auf das Fehlen des Tages mit dem Referenzwert aus der Infobox übereinstimmen (BSP: infobox: "3 May 1960"; get_birthdate: "May 1960"). Als full match wurden solche Ergebnisse gewertet, die sinnvolle Daten sind und als Zeitangabe gegenüber der infobox nicht weniger ausführlich sind (kein Fehlen von Tag etc)
+
+
+<!--s-->
+
+##### Auswertung zu 'get_spouse.R'
+
+<img src="plots/spouse_eval.png" width="50%">
+Note: Recall noch ausbaufähig über integration weiterer Pattern.
+
+<!--s-->
+
+### Bewertung Software/Datengrundlage I
+
+#### 1. Rasa-Bot
+* (+) NLU funktioniert in Rasa sehr gut
+* (+) Frei konfigurierbares Skript (actions.py)
+* (-) RASA-Software schwierig aufzusetzen 
+* (-) RASA wurde kurz vor Projektbeginn stark umgestellt
+Note: Tutorial, Dokumentation schlagartig veraltet; keine Dokumentation in Beispielen, keine Beispiele in der Dokumentation
+
+<!--s-->
+
+### Bewertung Software/Datengrundlage II
+
+#### 2. Wikipedia Artikel
+
+* (+) Relativ einheitlicher Aufbau (Einleitung, Werdegang etc.)
+* (-) Unterschiedlich ausführlich, tlw. Fehlen von Daten
+
+<!--s-->
+
+### Beantwortung der Forschungsfrage
+
+* Definieren der Intents für den Bot sollte vorab geschehen  
+* Informationen zu Intents unterschiedlich schwierig aus Text zu extrahieren  
+Note: Wäre möglich, aber nicht sinnvoll, denn Intents sind Grundlage für Funktionieren von Rasa-Architektur; Verfahren teilweise allgemein verwendbar, tlw. auch von der Domäne abhängig
+
+<!--s-->
+
+### Ergebnisse
+
+<section style="text-align: left;">
+_Kann man dieses Wissen aus Texten akquirieren?_  
+_Kann man dies automatisch mit Textquellen durchführen?_  
+
+Es ist möglich, Fakten zu _vordefinierten Intents_ aus Texten zu extrahieren und diese _dem Bot zur Verfügung zu stellen_.
+
+<!--s-->
--- a/docs/abschluss_presentation/presentation.pdf
+++ b/docs/abschluss_presentation/presentation.pdf
--- a/docs/abschluss_presentation/rasa_architecture.png
+++ b/docs/abschluss_presentation/rasa_architecture.png
--- a/docs/abschluss_presentation/speaker_notes.md
+++ b/docs/abschluss_presentation/speaker_notes.md
+# Recap - Überblick über Thema/Aufgabe
+
+Rasa ist ein Python-Framework für die Implementation von Chatbots im Bereich Conversational AI. ([Definition](https://www.iotforall.com/what-is-conversational-ai/) Conversational AI: A set of technologies that enable computers to simulate real conversations). Chatbots sollen dabei konkret die Funktion von ständig verfügbaren "Gesprächspartnern" übernehmen, die auf einer Website Auskünfte oder Anleitungen geben. Daraus ergeben sich verschiedene Vorteile: Für Kunden ist rund um die Uhr ein Gesprächsservice verfügbar. Außerdem verbuchen Unternehmen, die Chatbots nutzen, Einsparungen im Servicepersonalbereich.
+* Funktionsweise Rasa:
+  - NLU understands the user’s message based on your previous training data:
+    - Intent classification: Interpreting meaning based on predefined intents (Example: Please send the confirmation to amy@example.com is a provide_email intent with 93% confidence)
+    - Entity extraction: Recognizing structured data (Example: amy@example.com is an email)
+  - Core decides what happens next in this conversation. Its machine learning-based dialogue management predicts the next best action based on the input from NLU, the conversation history and your training data. (Example: Core has a confidence of 87% that ask_primary_change is the next best action to confirm with the user if they want to change their primary contact information.)
+
+Mit Core können zudem sogenannte [Custom Actions](https://rasa.com/docs/core/customactions/) ausgeführt werden. Dabei kann über einen Endpoint beliebiger Code durch den Bot ausgeführt werden.
+
+
+**Forschungsfrage:** Der Bot braucht _Wissen_ für die Reaktion auf erkannte Intents des Users.
+  * Kann dieses Wissen aus Texten akquiriert werden?
+  * Kann dies automatisch mit Textquellen durchgeführt werden?
+    (Beispielfragen: Wer war Albert Einstein? Was hat Albert Einstein erfunden? Wo hat er gelebt?)
+
+
+# Lösungsansatz
+
+### Prototyp und Intents
+
+Zur Beantwortung der Forschungsfrage haben wir zunächst einen Bot-Prototypen aufgesetzt. Dafür war der Download von Rasa NLU und Rasa Core über pip oder conda nötig. Außerdem ist für Rasa eine Umgebung mit Python 3.6 notwendig. Dafür wurde miniconda genutzt. Beim Konfigurieren des Bots war festzustellen, dass für das Training der NLU-Komponente aus der Kommunikation des Users zu extrahierende Entitäten und Intents festgelegt werden müssen. Aus diesem Grund wurden die ersten Intents festgelegt, mit denen der Prototyp trainiert werden sollte.
+```
+  - birth - "Where and when was $physicist born?"
+  - isAlive - "Is $physicist still alive?"
+  - education - "Where did $physicist go to school?"
+  - researchArea - "What did $physicist discover?"
+  - hasNobelPrize - "Did $physicist win the Nobel Prize?"
+
+```
+Bei der Nutzung des Bots fiel schnell auf, dass für die Beantwortung der Fragen eine Wissensbasis der Form (physicist x, intent y) benötigt würde.
+
+| `data`   | intent1 | intent2 | ... |
+|----------|---------|---------|-----|
+|physicist1| _data_  | _data_  | ... |
+|physicist2| _data_  | _data_  | ... |
+|   ...    |  ...    |  ...    | ... |
+
+### Datengewinn
+
+Daher wurde der Ansatz gewählt, _zu einer vorbestimmten Menge an Intents mittels Text Mining-Verfahren das nötige Wissen zu akquirieren_. Zu der gegebenen Auswahl an Intents wurden, wie in der Aufgabenstellung gefordert, Wikipedia-Artikel beschafft. Dazu wurde auf den Artikel [List of physicists](https://en.wikipedia.org/wiki/List_of_physicists) zurückgegriffen, der zu dem Zeitpunkt Verlinkungen auf 982 Artikel zu berühmten Physikern enthielt. Das Scraping wird mittels eines R-Skriptes durchgeführt, das das R-Paket 'WikipediR' nutzt. Anschließend liegen in einem R-Dataframe alle Artikel als HTML vor.
+
+```
+#!/usr/bin/env Rscript
+...
+page <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_physicists")
+...
+article <- WikipediR::page_content("en", "wikipedia", page_name = x, as_wikitext = FALSE)
+...
+```
+
+### Processing
+
+Um aus den gewonnenen Wikipedia-Artikeln das nötige Wissen für den Bot zu generieren, wurde ein eigenes R-Paket erstellt: `processing/wikiproc`. Das Paket enthält zunächst einmal ein Skript zum Entfernen von HTML-Tags und sonstigen Formatierungsbestandteilen des Textes: `clean_html.R` Mithilfe dieses Skriptes kann entsprechend dem Forschungsfeld 'Text Mining' auf _natürlichsprachigem Text_ gearbeitet werden.
+Weiterhin enthält das Paket _ein Skript pro Intent_, um mithilfe von auf den Intent zugeschnittenen Verfahren das gesuchte Wissen aus den Artikeln zu extrahieren.
+
+```
+wikiproc
+├── DESCRIPTION
+├── NAMESPACE
+├── R
+│   ├── clean_html.R
+│   ├── get_awards.R
+│   ├── get_birthdate.R
+│   ├── get_birthplace.R
+│   ├── get_data.R
+│   ├── get_spouse.R
+│   └── get_university.R
+└── tests
+```
+
+### Processing - Beispiele
+
+`get_birthdate.R`:
+Für die Gewinnung des Geburtsdatums aus einem Text werden zunächst durch Nutzung des Paketes 'cleanNLP' alle DATE-Entities aus dem Text gewonnen. Davon wird die erste erkannte Entität genutzt.
+Mithilfe von Regex wird zudem ein evtl über '-' anhängendes Todesdatum abgeschnitten sowie Klammern und Whitespaces entfernt.
+
+`get_spouse.R`:
+Der Ehepartner wird über NER und Patterns aus dem jeweiligen Text extrahiert. Die Patterns sind Vektoren aus POS-Tags und Wildcards für die Position der gesuchten Entität davor oder dahinter, dem Token 'marry' und Wildcards für die Position der gesuchten Entität davor oder dahinter. Mit diesen Patterns und der Menge aller PERSON Entities aus dem jeweiligen Text wird die Pattern-Matching-Funktion in 'utils.R' aufgerufen.
+
+## Architektur
+
+Die Extraktion der Daten mittels Skripten aus 'wikiproc' wird über ein zentrales Skript - `master.R` -
+ausgeführt, das die Artikel lädt und in einer Iteration einen Dataframe erstellt. Aus diesem Dataframe wird schließlich ein .tsv-File erstellt: `data.tsv`. Dieses File ist die Wissensbasis für den Rasa-Bot.
+Der Rasa Bot kann durch Custom-Actions zu einem _erkannten Intent_ und einer _erkannten physicist-Entität_ `data.tsv` durchsuchen und seinen Treffer als Antwort ausgeben. Damit wird der Bot zum Experten über vorher festgelegte Themen zu den Physikern aus den gewonnenen Artikeln.
+
+![](Wiki_Chatbot_Architecture.png)
+
+# Ergebnisse
+
+### Evaluation der Ergebnisse
+
+![Recall](plots/feature_count_normal.png)
+
+Mit den behandelten Intents ließen sich eine hohe Precision und ein geringer Recall erzielen.
+Vor allem das Pattern-Matching bei 'spouse' liefert immer die richtige Information. Dennoch wird in ca. 90% der Fälle kein Ergebnis geliefert, da die nötige Information entweder _im Text fehlt_ oder über ein _anderes Pattern extrahiert_ werden müsste. In diesem Fall z.B. über `"* husband * *", "* wife * *"`; genutzt wird nur `"* marry * *"`.
+
+Bei anderen Intents, die über NER und reguläre Ausdrücke extrahiert werden, ist die Precision geringer, da diese auf relativ naiven Angaben basieren ("Das erste Datum ist das Geburtsdatum"). Zudem ist die Qualität der gewonnenen Daten nicht sehr gut - oftmals sind sie sehr heterogen
+```
+  [1] "October 3, 1921"                    "23 January 1840"                   
+  [3] "May 1960"                           "August 20, 1918"                   
+  [5] "June 25, 1928"                      "December 4, 1913"                  
+  [7] "November 30, 1939"                  "December 13, 1724"                 
+  [9] "1976"                               "March 28, 1964"                    
+ [11] "15 March 1930"                      "May 1908"           
+```
+
+```
+ [64] "Stalin Prize, National Prize"                                                                  
+ [65] "Nobel Price"
+ [66] NA                                                 
+ [67] NA
+ [68] NA
+ [69] "Nobel Price, Rumford Medal, Helmholtz Medal, Barnard Medal"
+ [70] "Nobel Price"                              
+ [71] NA                                             
+ [72] "Scientific American Trophy for, Volta Prize, The Volta Prize, Albert Medal, Edison Medal, Alexander Graham Bell Medal"        
+ [73] "Nobel Price"
+ [74] "Nobel Price, Special Breakthrough Prize in Fundamental Physics"
+ [75] NA
+ [76] NA
+```
+
+* Bewertung RASA-Software
+  - Setup schwierig
+  - Keine Beispiele in Manuals, keine Dokumentation in Beispielen
+
+* Bewertung Wikipedia-Artikel als unstrukturierte Daten als Grundlage
+  - (+) relativ ähnlicher Aufbau
+  - (-) dennoch im Detail Unterschiede ("persönliche Note" ständig unterschiedlicher Autoren)
+  - (-) unterschiedlich ausführlich
+
+* Beantwortung der Forschungsfrage
+> Kann man dieses Wissen aus Texten generieren?
+  - Ja, nutze Pattern, NER
+> Lässt sich dieses Wissen automatisch generieren?
+  - Nein. Intents sollten vorher generiert werden, dann lässt sich das Wissen mit **auf den Intent und die Datengrundlage zugeschnittenen Verfahren** extrahieren
\ No newline at end of file
--- a/docs/final-report/img/Wiki_Chatbot_Architecture.png
+++ b/docs/final-report/img/Wiki_Chatbot_Architecture.png
--- a/docs/final-report/report.tex
+++ b/docs/final-report/report.tex
@@ -5,6 +5,8 @@
 \usepackage{ngerman}
 \usepackage[]{listings}
 \usepackage{hyperref}
+\usepackage{graphicx}
+\graphicspath{{./img/}}

 \title{Text Mining Lab \\ Training Rasa-Chatbots with Natural Language Texts \\ Project Report}
 \author{David Fuhry \\ Leonard Haas \\ Lukas Gehrke \\ Lucas Schons \\ Jonas Wolff}
@@ -18,59 +20,141 @@

 \pagebreak

-% To be edited - my (Lukas) suggestion so far
 \section{Project Description}
-    \subsection{Converstaional AI and Training}
-    Conversational AI describes computer systems that users can interact with by having a conversation. One important goal is to make the conversation seem as natural as possible. Ideally, an interacting user should assume to be interacting with another human beeing. This can make communication with a computer become very pleasant and easy for humaning beeings as they are simply using the language the always use. Besides there is no need for menu interaction with the system and thus no learning curve required.
-    % TODO add example use case (website information)
-    % TODO add more benefits (24/7 availability)
+
+    \subsection{Conversational AI and Training}
+    Conversational AI describes computer systems that users can interact with by having a
+    conversation. One important goal is to make the conversation seem as natural as possible.
+    Ideally, an interacting user should assume to be interacting with another human. This
+    can make communication with a computer become very pleasant and easy for humans as
+    they are simply using their natural language. Besides there is no need for menu
+    interaction with the system and thus no learning curve.
    \\ Conversational AI can be used in Voice Assistants that communicate through spoken words or
-    through chatbots that imitate a human beeing one is chatting with by text messages.
-    \subsection{Rasa Framwork}
-    Rasa is a collection of frameworks for conversational AI software. The Rasa Stack contains two open source libraries called Rasa NLU and Rasa Core that can be used to create contextual chatbots. Rasa NLU is a library for natural language understanding with intent classification and entity extraction Rasa Core is a Chatbot framework with machine learning based dialogue management. Both can be uses independently but rasa recommends using both.
-    % TODO add description of how a rasa bot must be trained to achieve results
+    through chatbots that imitate a human by sending text messages.
+
+    \subsection{Rasa Framework}
+    Rasa is a collection of tools for conversational AI software. The Rasa Stack contains two
+    open source libraries called Rasa NLU and Rasa Core that can be used to create contextual
+    chatbots. Rasa NLU is a library for natural language understanding with intent classification
+    and entity extraction. Rasa Core is a chatbot framework with machine learning based dialogue
+    management. Both can be used independently but Rasa recommends using both.
+    \\ A Rasa Bot needs training data to work properly. The NLU component must be provided with example questions for each \textit{intent} it will have to deal with. Inside of these questions, \textit{entities} must be marked in order to train rasa where to extract these from.
+    The Core component requires example conversation flows and utterance templates for training. Examples can be seen in Section~\ref{rasa_chatbot}.

    \subsection{Research Question}
-    The objective of this project is to find out, wether chatbots can be trained with natural language texts \textit{automatically}. There are two inital research questions: Given that chatbots need to be trained with knowledge, called facts.
+    The objective of this project is to find out whether chatbots can be trained with natural
+    language texts \textit{automatically}. There are two initial research questions: Given that
+    chatbots need to be trained with knowledge, called facts:
    \begin{itemize}
-        \item Can these facts be extracted from natural language text?
-        \item Can this be done automaitcally? 
+        \item can these facts be extracted from natural language text?
+        \item can this be done automatically? 
    \end{itemize}
    
-\section{Solution Approach}
+\section{Approach}
+
    \subsection{Project Goals}
    
 		\subsection{Rasa Setup and Intents}

-		The Rasa-Stack consists of two components: \textit{Rasa-Core} and \textit{Rasa-NLU}. The \textit{Rasa-NLU} component takes care of getting user input and matching it with the respective intents. It also extracts all possibly provided entities and stores them in variables, called ``slots''. After that, the \textit{Rasa-Core} component executes all actions associated with the determined intent.
+		The Rasa-Stack consists of two components: \textit{Rasa-Core} and \textit{Rasa-NLU}. The \textit{Rasa-NLU} component takes care of getting user input and matching it with the respective intents. It also extracts all possibly provided entities and stores them in variables, called ``slots''. After that, the \textit{Rasa-Core} component executes all actions associated with the determined intent. 

    \subsection{Scrapping of Source Texts}
+    Wikipedia was chosen as resource for texts as it provides texts of relatively long length in a somewhat uniform manner.
+    While Wikipedia does have a \textit{Physicists} category\footnote{\url{https://en.wikipedia.org/wiki/Category:Physicists}}, 
+    it is fragmented into somewhat arbitrary subcategories and thus not optimal to use as a collection.
+    However Wikipedia also has a \textit{List of physicists} which contains 981 physicists and was used to build the collection used. \\
+    Data scraping was done using the R Package \textit{WikipediR}, a wrapper around the Wikipedia API.
+    Articles were downloaded as HTML\footnote{HTML was chosen over wikitext to ease text cleaning} and afterwards stripped of all HTML tags and quotation marks. 
+    
    \subsection{Fact Extraction Approaches}
+    Fact extraction greatly varies depending on the nature of the fact to extract.
+    As all approaches leverage on some form of NER or POS tagging, annotations were created for all text.
+    This was done using the R Package \textit{cleanNLP} with an spaCy backend to create NER and POS tags, as well as lemmatization. \\
+    Fact extraction for physicists spouses was done using pre-defined patterns on word lemmata.\footnote{Functionality to use patterns on POS Tags is also available but did not yield a better outcome.}
+    A pattern consists of word lemmata to be matched (including wildcards) as well as defined places to look for the name of the physicist as well as his/her spouse.
+    When a matching phrase is found, the results are verified by checking that the corresponding physicist is mentioned and that the potential spouse is detected as a person by the NER tagger.

 \section{Software Architecture}
-    \subsection{Rasa Chatbot}
-    The Rasa Chatbot built for this project uses both Rasa Stack components - \textit{Rasa Core} and \textit{Rasa NLU}. Configuration has been organised in reference to examples from the Rasa github repository. \\ Rasa NLU has been trained with example questions in Markdown format that cotain highlighted enities. This ensures the bot to understand intents and extract the entities inside the sentences. One example can be seen in Figure \ref{nlu_example}. \\
-    \lstinputlisting[label={nlu_example}, caption={NLU example}]{nlu_example.md} 
-    Rasa Core has been configured with \textit{stories} that contain example conversation flows as training data \ref{stories_example} and the \textit{domain} of the bot. The domain contains all actions, entities, slots, intents, and templates the bot deals with. \textit{Templates} means template strings for bot utterances. \textit{Slots} are variables that can hold different values. The bot proposed in this project uses a slot to store the name of a recognized physicist entity for instance. According to the Rasa website \footnote{\url{https://rasa.com/docs/get_started_step2/}}, the domain is \textit{the universe the bot is living in}. \\
+
+    \subsection{Rasa Chatbot} \label{rasa_chatbot}
+    The chatbot built for this project uses both Rasa Stack components - \textit{Rasa Core}
+    and \textit{Rasa NLU}. Configuration has been organized in reference to examples from the Rasa
+    github repository. \\ Rasa NLU has been trained with example questions in markdown format that
+    contain highlighted entities. This ensures that the bot is able to understand intents and
+    extract the entities inside the sentences. One example can be seen in listing \ref{nlu_example}. \\
+
+    \lstinputlisting[label={nlu_example}, caption={NLU example}]{nlu_example.md}
+
+    Rasa Core has been configured with \textit{stories} that contain example conversation flows as
+    training data (listing \ref{stories_example}) and the \textit{domain} of the bot. The domain
+    contains all actions, entities, slots, intents, and templates the bot deals with. \textit
+    {Templates} means template strings for bot utterances. \textit{Slots} are variables that can
+    hold different values. The bot proposed in this project uses a slot to store the name of a
+    recognized physicist entity. According to the Rasa website
+    \footnote{\url{https://rasa.com/docs/get_started_step2/}}
+    , the domain is \textit{the universe the bot is living in}. \\
+
    \lstinputlisting[label={stories_example}, caption={Example Story}]{stories_example.md}
-    The bot recognizes the intents shown in Figure \ref{intent_plot}. It can be started through \textit{MAKE}-commands. For further details, please refer to the README \footnote{\url{https://git.informatik.uni-leipzig.de/text-mining-chatbot/wiki-rasa/blob/master/README.md}}. Development of the bot was focused on proof of concept so there is not a lot of natural conversation ability available.
-    % TODO complete table.
+
+    The bot recognizes the intents shown in the table on page \pageref
+    {table:intent_table}. It can be started through
+    \textit{MAKE}-commands. For further details, please refer to the README
+    \footnote{
+    \url{https://git.informatik.uni-leipzig.de/text-mining-chatbot/wiki-rasa/blob/master/README.md}}
+
+    Development of the bot was focused on proof of concept so there is not a lot of natural
+    conversation ability available.

    \begin{center}
-        \begin{tabular}{| l | l | l |}
-            \hline
-            No & Intent & Example \\ \hline
-            1 & birthdate & When was Albert Einstein born \\ \hline
-            1 & birthdate & When was Albert Einstein born \\ \hline
-            1 & birthdate & When was Albert Einstein born \\ \hline
-            1 & birthdate & When was Albert Einstein born \\
-            \hline
-        \end{tabular}
+        \begin{table}
+            \begin{tabular}{| c | l | l |}
+                \hline
+                No & Intent & Example \\ \hline
+                1 & birthdate & When was Albert Einstein born \\ \hline
+                2 & nationality & Where was Albert Einstein born \\ \hline
+                3 & day of death & When did Albert Einstein die \\ \hline
+                4 & place of death & Where did Albert Einstein die \\ \hline
+                5 & is alive & Is Albert Einstein still alive \\ \hline
+                6 & spouse & Who was Albert Einstein married to \\ \hline
+                7 & primary education & Where did Albert Einstein go to school \\ \hline
+                8 & university & Which university did Albert Einstein attend \\ \hline
+                9 & area of research & What was Albert Einstein area of research \\ \hline
+                10 & workplace & Where did Albert Einstein work \\ \hline
+                11 & awards & What awards did Albert Einstein win \\ \hline
+            \end{tabular}
+            \caption{Intents that are recognized by the bot}
+            \label{table:intent_table}
+        \end{table}
    \end{center}

    \subsection{R Package 'wikiproc'}
+    All functionality to extract facts, download data from wikipedia as well as some utility functions 
+    is encapsulated inside the \textit{wikiproc} R Package. 
+    This allows for a better management of dependencies as well as inclusion of unit tests for fact extraction methods.
+    
+
+    \begin{table}
+        \centering
+        \begin{tabular}{| l | l |}
+            \hline
+            Function & Category \\ \hline \hline
+            clean\textunderscore html & Utility \\ \hline
+            create\textunderscore annotations & Utility \\ \hline
+            init\textunderscore nlp & Utility \\ \hline
+            get\textunderscore data & Data scraping \\ \hline
+            get\textunderscore awards & Fact extraction \\ \hline
+            get\textunderscore birthdate & Fact extraction \\ \hline
+            get\textunderscore birthplace & Fact extraction \\ \hline
+            get\textunderscore spouse & Fact extraction \\ \hline
+            get\textunderscore university & Fact extraction \\ \hline
+        \end{tabular}
+        \caption{Exported functions of the wikiproc package}
+        \label{table:wikiproc_functions}
+    \end{table}
+
    \subsection{Interworking of R and Rasa}
-    %TODO add architecture chart made by Lucas
+
+    \includegraphics[width=\textwidth]{Wiki_Chatbot_Architecture}


 \section{Results}

--- a/docs/img/result_data_frame.png
+++ b/docs/img/result_data_frame.png
--- a/docs/img/wiki-rasa-proj-struct.png
+++ b/docs/img/wiki-rasa-proj-struct.png