diff --git a/README.md b/README.md
index f56e3bce991f5279c69975bff8f6dd06611993fd..df8de47b0334e8bc08072361a6c289d649834d37 100644
--- a/README.md
+++ b/README.md
@@ -31,3 +31,16 @@
 python -m rasa_core.run -d models/dialogue -u models/current/nlu
 ```
 
+### R Scripts
+
+#### PhysicistsList.R
+
+Crawls Wikipedia's [List of Physicists](https://en.wikipedia.org/wiki/List_of_physicists) for all physicist names and saves them to the file *physicists.txt* in the data directory.
+Use that file to generate an XML dump on Wikipedia's [Export page](https://en.wikipedia.org/wiki/Special:Export).
+
+#### ExtractFromXML.R
+
+Reads the XML file from the data directory and extracts the title and text of each page in the dump, then writes them to *texte.csv* in the data directory (use `read.table` to import it). For convenience, it also creates a *texte.RDS* file; load it with `texte <- readRDS("../data/texte.RDS")`.
+**NOTE:** For the script to work, the first line of the XML file needs to be replaced with `<mediawiki xml:lang="en">`.
+
+
diff --git a/data/Wikipedia-20181120103842.xml b/data/Wikipedia-20181120103842.xml
index 5659353397b37325e0eba64f724a618314e55b93..b0ed542e6437c3378490bf6f2ba9d68ff976a807 100644
--- a/data/Wikipedia-20181120103842.xml
+++ b/data/Wikipedia-20181120103842.xml
@@ -1,4 +1,4 @@
-<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">
+<mediawiki xml:lang="en">
 <siteinfo>
 <sitename>Wikipedia</sitename>
 <dbname>enwiki</dbname>
diff --git a/r/ExtractFromXML.R b/r/ExtractFromXML.R
new file mode 100644
index 0000000000000000000000000000000000000000..2af3bc4230664f514f71a2e68189792a0bdcb4cb
--- /dev/null
+++ b/r/ExtractFromXML.R
@@ -0,0 +1,29 @@
+# Extract page titles and article texts from a MediaWiki XML export and
+# store them as an RDS data frame and as a text table for read.table().
+library(xml2)
+
+dump <- read_xml("../data/Wikipedia-20181120103842.xml")
+
+# A raw Special:Export dump declares a default namespace on <mediawiki>,
+# which stops un-prefixed XPath queries from matching. Strip it so the
+# script works on unmodified dumps as well as hand-edited ones.
+xml_ns_strip(dump)
+
+# Query relative to each <page> so every title stays paired with the text
+# of the same page, even if a page carries several (or no) revisions.
+pages <- xml_find_all(dump, ".//page")
+
+# xml_text() is vectorised over node sets and yields NA for missing nodes.
+titles <- xml_text(xml_find_first(pages, "./title"))
+
+# Use the last <revision> listed for the page.
+texts <- xml_text(xml_find_first(pages, "./revision[last()]/text"))
+
+# Keep both columns as character vectors regardless of the R version.
+page_texts <- data.frame(Title = titles,
+                         Text = texts,
+                         stringsAsFactors = FALSE)
+
+saveRDS(page_texts, "../data/texte.RDS")
+
+write.table(page_texts, "../data/texte.csv")
diff --git a/r/PhysicistsList.R b/r/PhysicistsList.R
index 474e56548e9a95812be25054144ceba51da099e9..2998f9bd78b136a58775634211f58dba9052f553 100644
--- a/r/PhysicistsList.R
+++ b/r/PhysicistsList.R
@@ -17,5 +17,5 @@
 physicists <- physicists[nchar(physicists) > 5]
 length(physicists) <- length(physicists) - 3
 # Done
-write(physicists, "physicists.txt")
+write(physicists, "../data/physicists.txt")