Resolve "(Dummy) feature extraktion"

543a7c8e · David Fuhry · Lucas Schons · 7004a81f · 543a7c8e · 543a7c8e
Commit 543a7c8e authored 6 years ago by David Fuhry Committed by Lucas Schons 6 years ago
--- a/r/GetNoOfSpouses.R
+++ b/r/GetNoOfSpouses.R
+### GetNoOfSpouses.R
+### This extracts the number of spouses from the infobox
+### If no infobox or no information about spouses is found assumes there are none
+### Not for use in production, this does not actually get information from text
+# Author: David
+## Libraries
+library(rvest)
+library(data.table)
+### Get number of spouses
+# Counts the entries of the "Spouse" row(s) in an article's infobox.
+# article: HTML of a single article as a character string
+# Returns the number of spouse entries found; 0 if there is no infobox, the
+# infobox cannot be read, or it has no (non-blank) spouse entry
+getNoOfSpouses <- function(article) {
+  # If there is no infobox we assume there were no spouses
+  if (!grepl("vcard", article, fixed = TRUE)) {
+    return(0)
+  }
+  # "vcard" can occur in an article that has no table.vcard, in which case
+  # getInfoBox throws; treat that like a missing infobox (with a warning)
+  # instead of aborting the processing of all articles
+  infoBox <- tryCatch(getInfoBox(article), error = function(e) {
+    warning("Could not read infobox: ", conditionMessage(e), call. = FALSE)
+    NULL
+  })
+  if (is.null(infoBox)) {
+    return(0)
+  }
+  # Get the spouse field
+  spouses <- infoBox[infoBox$Desc %like% "Spouse", ]$Content
+  # Remove everything in parentheses
+  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)
+  # Split the strings by newlines to get one spouse per line
+  spouses <- unlist(strsplit(spouses, "\n"))
+  # Drop missing and blank lines so they are not counted as spouses
+  spouses <- spouses[!is.na(spouses) & nzchar(trimws(spouses))]
+  if (length(spouses) > 0) {
+    return(length(spouses))
+  }
+  return(0)
+}
+### Converts info box to table
+# article: HTML of a single article as a character string
+# Returns the first table.vcard of the page as a data frame whose first two
+# columns are named Desc and Content
+# Throws an error if the page has no table.vcard or the table has fewer than
+# two columns, so the presence of an infobox should be checked beforehand
+getInfoBox <- function(article) {
+  # Read page as html
+  page <- read_html(article)
+  # Extracting text from the html will erase all <br> tags,
+  # this will replace them with line breaks
+  # The <br> nodes are looked up once and reused for both steps
+  breaks <- xml_find_all(page, ".//br")
+  xml_add_sibling(breaks, "p", "\n")
+  xml_remove(breaks)
+  # Get the info box
+  infoBoxes <- html_nodes(page, "table.vcard")
+  if (length(infoBoxes) == 0) {
+    stop("No infobox (table.vcard) found in article", call. = FALSE)
+  }
+  # Only the first info box is used, so only that one is converted
+  table <- html_table(infoBoxes[[1]], fill = TRUE)
+  if (ncol(table) < 2) {
+    stop("Infobox table has fewer than two columns", call. = FALSE)
+  }
+  # Name only the first two columns; any further columns keep their names
+  colnames(table)[1:2] <- c("Desc", "Content")
+  return(table)
+}
--- a/r/Master.R
+++ b/r/Master.R
@@ -12,6 +12,7 @@
 cat("Sourcing R scripts... ")
 source("r/GetData.R")
+source("r/GetNoOfSpouses.R")
 #source("r/getBirthday.R")
 #source("r/getSomethingElse.R")
@@ -27,7 +28,10 @@ articles <- getData(use.cache = TRUE)
 cat("Processing data...\n")
-results <- lapply(articles, function(data) {
+results <- apply(articles, 1, function(article) {
+  # Within this function article is a vector representing a single row of our original data frame
+  # This means article[1] represents the Title, article[2] the PageID etc.
  ## Data cleaning
  # cleaned.text <- someCleanFunctioN(data$Text)
@@ -38,22 +42,29 @@ results <- lapply(articles, function(data) {
  ## Extract information from Text
+  no.spouses <- getNoOfSpouses(article[4])
  # someFact <- getFactFromTextFunctioN(annotated.text)
  # someOtherFact <- getOtherFactFromText(data$Text)
  ## Create Results
-  # data.frame(Name = x$Name,
+  data.frame(Name = article[1],
-  #            FactOne = someFact,
+             NoSpouses = no.spouses)
-  #            FactTwo = someOtherFact)
 })
 results <- do.call(rbind, results)
+cat("Data processing finished.\n")
 ## Results are now in results
 ## Format for rasa 
+# Announce the rasa export step (the export call itself is still a placeholder)
+cat("Writing rasa files to 'rasa/'...\n")
 # someFormatFunction(results)
+# Report the end of the export step; "Data processing finished." was already
+# printed once processing completed, so it is not repeated here
+cat("Finished writing rasa files.\n")