Skip to content
Snippets Groups Projects
Commit 881f4eac authored by David Fuhry's avatar David Fuhry :fist: Committed by Leonard Haas
Browse files

Resolve "Implement NER"

parent 79e45a76
No related branches found
No related tags found
No related merge requests found
......@@ -16,6 +16,7 @@ cat("Sourcing R scripts... ")
# Load the project's helper scripts. Each source() call evaluates the given
# file, so any top-level code in those files runs here as a side effect.
# NOTE(review): r/ProcessNER.R presumably initializes the spaCy backend when
# sourced and therefore needs the Python environment to be available - confirm.
source("r/GetData.R")
source("r/GetNoOfSpouses.R")
source("r/CleanHtml.R")
source("r/ProcessNER.R")
#source("r/getSomethingElse.R")
cat("Done.\n")
......@@ -40,7 +41,7 @@ results <- pbapply(articles, 1, function(article) {
## Data preprocessing/annotating
# annotated.text <- annotationFunction(data$Text)
annotation <- createAnnotations(cleaned.text, article[2], article[3])
## Extract information from Text
......
#!/usr/bin/env Rscript
### Provides functionality to use NER, POS and dependency grammars
## Author: David
# Sourcing this script has side effects: it selects a conda environment for
# reticulate, attaches cleanNLP and initializes its spaCy backend.
cat("Initializing spacy backend...\n")
# It's important to do this prior to loading any Python-related stuff:
# the conda environment named "spcy" has to be selected before Python starts.
# NOTE(review): required = TRUE should make this fail loudly if the
# environment is missing - confirm against the reticulate documentation.
reticulate::use_condaenv("spcy", required = TRUE)
# Load libraries
library(cleanNLP)
# Init NLP models (entity_flag = TRUE presumably enables named entity
# recognition - verify against the installed cleanNLP version)
cnlp_init_spacy(entity_flag = TRUE)
cat("Done.\n")
#' Annotate an article's text, optionally using an on-disk cache
#'
#' Annotations are cached per article and revision as
#' `data/annotations/<article.id>-<article.rev.id>.RDS` (relative to the
#' current working directory).
#'
#' @param text Text to annotate; passed on to `cnlp_annotate()`.
#' @param article.id Article id, used to build the cache file name.
#' @param article.rev.id Revision id, used to build the cache file name.
#' @param use.cache If `TRUE`, return a cached annotation when a readable one
#'   exists for this article and revision.
#' @param write.cache If `TRUE`, save a freshly computed annotation to the
#'   cache (creating the cache directory if needed).
#' @return The annotation, either read from the cache or as returned by
#'   `cnlp_annotate(text, as_strings = TRUE)`.
createAnnotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE) {
  # Generate the file name. The ids may arrive padded with whitespace
  # (presumably because the caller's apply() coerces the row to a formatted
  # character vector), so all blanks are stripped again.
  filename <- gsub(" ", "", paste0("data/annotations/", article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)

  # Check if there is a cached version of the annotations for this article
  # in this specific revision. `&&` is the scalar, short-circuiting operator
  # intended for `if` conditions.
  if (use.cache && file.exists(filename)) {
    # Wrap the result so a failed read can be told apart from any cached value.
    cached <- tryCatch(
      list(ok = TRUE, data = readRDS(filename)),
      error = function(e) {
        cat("Cached data seems to be corrupted, redoing annotation.\n")
        list(ok = FALSE, data = NULL)
      }
    )
    if (cached[["ok"]]) {
      return(cached[["data"]])
    }
    # Unreadable cache: fall through and actually redo the annotation
    # instead of returning NULL to the caller.
  }

  annotation <- cnlp_annotate(text, as_strings = TRUE)

  # Write cache if desired
  if (write.cache) {
    cache.dir <- dirname(filename)
    if (!dir.exists(cache.dir)) {
      # recursive = TRUE creates "data" and "data/annotations" in one go
      dir.create(cache.dir, recursive = TRUE)
    }
    saveRDS(annotation, filename)
  }

  annotation
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment