From 52f20fcf0e2b11997ec10ccd2f3b1e1944ca54dc Mon Sep 17 00:00:00 2001
From: David Fuhry <david@129a-records.de>
Date: Thu, 22 Nov 2018 17:57:31 +0100
Subject: [PATCH 1/2] Added xml extraction script; Fixed xml for use with R;
 Cosmetic Changes; Updated Readme

---
 README.md                         | 13 +++++++++++++
 data/Wikipedia-20181120103842.xml |  2 +-
 r/ExtractFromXML.R                | 16 ++++++++++++++++
 r/PhysicistsList.R                |  2 +-
 4 files changed, 31 insertions(+), 2 deletions(-)
 create mode 100644 r/ExtractFromXML.R

diff --git a/README.md b/README.md
index f56e3bc..07ce035 100644
--- a/README.md
+++ b/README.md
@@ -31,3 +31,16 @@ python -m rasa_core.run -d models/dialogue -u models/current/nlu
 ```
 
 
+### R Scripts
+
+# PhysicistsList.R
+
+Crawls Wikipedia's [List of physicists](https://en.wikipedia.org/wiki/List_of_physicists) for all physicist names and saves them to the file *physicists.txt* in the data directory.
+Use that file to generate an XML dump on Wikipedia's [Export page](https://en.wikipedia.org/wiki/Special:Export).
+
+# ExtractFromXML.Rasa
+
+Will read in the xml file from the data directory and extract the title and text of the pages in the dump. Will then write them to *texte.csv* in the data directory. For convenience will also create a texte.RDS file, load with `texte <- read.RDS("../data/texte.RDS")`.
+**NOTE:** For the script to work, the first line of the xml needs to be replaced with `<mediawiki xml:lang="en">`.
+
+
diff --git a/data/Wikipedia-20181120103842.xml b/data/Wikipedia-20181120103842.xml
index 5659353..b0ed542 100644
--- a/data/Wikipedia-20181120103842.xml
+++ b/data/Wikipedia-20181120103842.xml
@@ -1,4 +1,4 @@
-<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">
+<mediawiki xml:lang="en">
   <siteinfo>
     <sitename>Wikipedia</sitename>
     <dbname>enwiki</dbname>
diff --git a/r/ExtractFromXML.R b/r/ExtractFromXML.R
new file mode 100644
index 0000000..ac13f0b
--- /dev/null
+++ b/r/ExtractFromXML.R
@@ -0,0 +1,16 @@
+library(xml2)
+
+data <- read_xml("../data/Wikipedia-20181120103842.xml")
+
+title.nodes <- xml_find_all(data, ".//title")
+
+titles <- sapply(title.nodes, xml_text)
+
+text.nodes <- xml_find_all(data, ".//text")
+
+texts <- sapply(text.nodes, xml_text)
+
+df.out <- data.frame(Title = titles,
+                     Text = texts)
+
+write.csv2(df.out, "../data/texte.csv")
diff --git a/r/PhysicistsList.R b/r/PhysicistsList.R
index 474e565..2998f9b 100644
--- a/r/PhysicistsList.R
+++ b/r/PhysicistsList.R
@@ -17,5 +17,5 @@ physicists <- physicists[nchar(physicists) > 5]
 length(physicists) <- length(physicists) - 3
 
 # Done
-write(physicists, "physicists.txt")
+write(physicists, "../data/physicists.txt")
 
-- 
GitLab


From 783a1ab8b74d427f3791930b68491c3b0b1e2179 Mon Sep 17 00:00:00 2001
From: David Fuhry <david@129a-records.de>
Date: Thu, 22 Nov 2018 18:06:30 +0100
Subject: [PATCH 2/2] Minor fixes

---
 README.md          | 2 +-
 r/ExtractFromXML.R | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 07ce035..df8de47 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ Use that file to generate xml dump at wikipedias [Export page](https://en.wikipe
 
 # ExtractFromXML.Rasa
 
-Will read in the xml file from the data directory and extract the title and text of the pages in the dump. Will then write them to *texte.csv* in the data directory. For convenience will also create a texte.RDS file, load with `texte <- read.RDS("../data/texte.RDS")`.
+Reads the XML file from the data directory and extracts the title and text of each page in the dump, then writes them to *texte.csv* in the data directory (use `read.table` to import it). For convenience, it also creates a *texte.RDS* file; load it with `texte <- readRDS("../data/texte.RDS")`.
 **NOTE:** For the script to work, the first line of the xml needs to be replaced with `<mediawiki xml:lang="en">`.
 
 
diff --git a/r/ExtractFromXML.R b/r/ExtractFromXML.R
index ac13f0b..2af3bc4 100644
--- a/r/ExtractFromXML.R
+++ b/r/ExtractFromXML.R
@@ -13,4 +13,6 @@ texts <- sapply(text.nodes, xml_text)
 df.out <- data.frame(Title = titles,
                      Text = texts)
 
-write.csv2(df.out, "../data/texte.csv")
+saveRDS(df.out, "../data/texte.RDS")
+
+write.table(df.out, "../data/texte.csv")
\ No newline at end of file
-- 
GitLab