Skip to content
Snippets Groups Projects
Commit 543a7c8e authored by David Fuhry's avatar David Fuhry :fist: Committed by Lucas Schons
Browse files

Resolve "(Dummy) feature extraktion"

parent 7004a81f
No related branches found
No related tags found
No related merge requests found
### GetNoOfSpouses.R
### This extracts the number of spouses from the infobox
### If no infobox or no information about spouses is found, it assumes there are none
### Not for use in production, this does not actually get information from text
# Author: David
## Libraries
library(rvest)
library(data.table)
### Get number of spouses
# Counts the spouses listed in the info box of an article.
#
# article: HTML of the article as a single string.
# Returns the number of non-empty entries in the info box rows whose
# description contains "Spouse"; 0 if the article has no info box ("vcard"
# does not occur in the HTML) or no such entry is present.
getNoOfSpouses <- function(article) {
  # If there is no infobox we assume there were no spouses
  if (!grepl("vcard", article)) {
    return(0)
  }

  infoBox <- getInfoBox(article)

  # Get the spouse field (base grepl, so no attached package is needed here)
  spouses <- infoBox$Content[grepl("Spouse", infoBox$Desc)]

  # Remove everything in parentheses
  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)

  # Split the strings by newlines to get one spouse per line
  spouses <- unlist(strsplit(spouses, "\n"))

  # Missing cells and blank lines (e.g. from consecutive line breaks) are not
  # spouses, so they must not be counted
  spouses <- spouses[!is.na(spouses) & nzchar(trimws(spouses))]

  if (length(spouses) == 0) {
    return(0)
  }
  length(spouses)
}
### Converts info box to table
# Converts the first info box (table.vcard) of an article into a table.
#
# article: HTML of the article as a single string.
# Returns the parsed table with its first two columns named "Desc" and
# "Content"; any further columns are dropped.
# Stops with an error if the page has no info box or the info box has fewer
# than two columns, so callers should check for an info box beforehand.
getInfoBox <- function(article) {
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html will erase all <br> tags, so a paragraph
  # holding a line break is inserted after each of them before they are removed.
  # The xml2 functions are namespace-qualified because library(rvest) does not
  # attach xml2 in rvest >= 1.0.0.
  breaks <- xml2::xml_find_all(page, ".//br")
  xml2::xml_add_sibling(breaks, "p", "\n")
  xml2::xml_remove(breaks)

  # Get the info box
  boxes <- rvest::html_nodes(page, "table.vcard")
  if (length(boxes) == 0) {
    stop("No info box (table.vcard) found in article", call. = FALSE)
  }

  # Only the first info box is used, so only that one is parsed
  infoBox <- rvest::html_table(boxes[[1]], fill = TRUE)

  if (ncol(infoBox) < 2) {
    stop("Info box has fewer than two columns", call. = FALSE)
  }
  # Irregular info boxes can yield extra columns; keep description and content
  infoBox <- infoBox[, seq_len(2), drop = FALSE]
  colnames(infoBox) <- c("Desc", "Content")

  infoBox
}
r/Master.R 100644 → 100755
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
cat("Sourcing R scripts... ") cat("Sourcing R scripts... ")
source("r/GetData.R") source("r/GetData.R")
source("r/GetNoOfSpouses.R")
#source("r/getBirthday.R") #source("r/getBirthday.R")
#source("r/getSomethingElse.R") #source("r/getSomethingElse.R")
...@@ -27,7 +28,10 @@ articles <- getData(use.cache = TRUE) ...@@ -27,7 +28,10 @@ articles <- getData(use.cache = TRUE)
cat("Processing data...\n") cat("Processing data...\n")
results <- lapply(articles, function(data) { results <- apply(articles, 1, function(article) {
# Within this function article is a vector representing a single row of our original data frame
# This means article[1] represents the Title, article[2] the PageID etc.
## Data cleaning ## Data cleaning
# cleaned.text <- someCleanFunctioN(data$Text) # cleaned.text <- someCleanFunctioN(data$Text)
...@@ -38,22 +42,29 @@ results <- lapply(articles, function(data) { ...@@ -38,22 +42,29 @@ results <- lapply(articles, function(data) {
## Extract information from Text ## Extract information from Text
no.spouses <- getNoOfSpouses(article[4])
# someFact <- getFactFromTextFunctioN(annotated.text) # someFact <- getFactFromTextFunctioN(annotated.text)
# someOtherFact <- getOtherFactFromText(data$Text) # someOtherFact <- getOtherFactFromText(data$Text)
## Create Results ## Create Results
# data.frame(Name = x$Name, data.frame(Name = article[1],
# FactOne = someFact, NoSpouses = no.spouses)
# FactTwo = someOtherFact)
}) })
results <- do.call(rbind, results) results <- do.call(rbind, results)
cat("Data processing finished.\n")
## Results are now in results ## Results are now in results
## Format for rasa ## Format for rasa
cat("Writing rasa files to 'rasa/'...\n")
# someFormatFunction(results) # someFormatFunction(results)
cat("Data processing finished.\n")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment