Commit bfd535a8 authored by David Fuhry

Merge branch '74-r-make-stuff-pretty' into 'master'

Resolve "R: Make stuff pretty"

Closes #74

See merge request !77
parents 003b3aee 1bfa4b74
@@ -15,10 +15,13 @@ init_nlp("conda", "spcy")
cat("Starting data import...\n")
# Define paths
project_root <- find_root(has_file("README.md"))
data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
rasa_dir <- paste(project_root, "rasa", sep = .Platform$file.sep)
# Download data or use cache if we have it
articles <- get_data(use.cache = TRUE, data.dir = data_dir)
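## The paste() calls above are equivalent to base R's file.path(), which
## joins path components with the platform separator, e.g.:
# data_dir <- file.path(project_root, "data")
# rasa_dir <- file.path(project_root, "rasa")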
## Data processing
@@ -29,15 +32,15 @@ results <- pbapply(articles, 1, function(article) {
# Within this function, article is a vector representing a single row of our original data frame
# This means article[1] represents the Title, article[2] the PageID, etc.
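## Note that apply() coerces each row to character, so numeric columns such
## as the PageID arrive as strings. A minimal sketch of the access pattern,
## assuming columns Title, PageID, RevID and Text:
# demo <- data.frame(Title = "Example", PageID = 42, RevID = 7, Text = "<p>...</p>")
# apply(demo, 1, function(article) article[2])  # "42" (character, not numeric)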
## Data cleaning
# Data cleaning
cleaned_text <- clean_html(article[4])
## Data preprocessing/annotating
# Data preprocessing/annotating
annotation <- create_annotations(cleaned_text, article[2], article[3], data.dir = data_dir)
## Extract information from Text
# Extract information from Text
spouse_found <- get_spouse(article[4], annotation)
awards_found <- get_awards(annotation)
@@ -45,7 +48,7 @@ results <- pbapply(articles, 1, function(article) {
birthdate_found <- get_birthdate(cleaned_text, annotation)
nationality_found <- get_nationality(cleaned_text, annotation)
## Create Results
# Create Results
data.frame(name = article[1],
spouse = spouse_found,
@@ -67,8 +70,6 @@ results <- do.call(rbind, results)
cat("Data processing finished.\n")
## Results are now in results
## Format for rasa
cat("Writing rasa files to 'rasa/'...\n")
@@ -2,7 +2,7 @@
# Author: David
## Though we could get the pages within the category 'physicists' with something like this
## Though we could get the pages within the category 'physicists' with something like
## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
## this gives us only about 50 pages.
## Instead we crawl the names on the article 'List of Physicists' and query those names
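## A short sketch of the category query mentioned above, assuming
## WikipediR's list-of-lists response shape:
# library(WikipediR)
# members <- pages_in_category("en", "wikipedia",
#                              categories = "physicists")$query$categorymembers
# titles <- vapply(members, function(m) m$title, character(1))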
#!/usr/bin/env Rscript
### Function to extract academic affiliation from text
## TODO: check if the ORG is in relation with another person
## TODO: clean parentheses
## Authors: Leonard
#' Try to extract all places where the person worked
#' To-do: 1) check if the ORG is in relation with another person
#' 2) clean parentheses
#' Function to extract academic affiliation from text
#'
#' @param annotation annotations object from cleanNLP's createAnnotations() function.
#'
#' @return string with all found Academia organizations
#' @export
#'
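#' @examples
#' \dontrun{
#' # Hypothetical usage; assumes an annotation built by create_annotations()
#' annotation <- create_annotations(cleaned_text, article.id, article.rev.id)
#' get_university(annotation)
#' }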
get_university <- function(annotation) {
# Extracts the entities with cleanNLP
# Extracts the entities
entities <- cleanNLP::cnlp_get_entity(annotation)
# Extracts only the organization entities' strings, with sid, and only the char vectors
# Extracts only the organization entities
entities_org <- entities[entities$entity_type == "ORG", ]
char_org <- entities_org$entity
## Matching Keywords
# strings we want to match
## Find relevant entities
# Define keywords
to_match <- c("Academy", "University", "Institute", "Department", "Research Centre")
# takes in to_match and extracts them from Organization Entities
# Find matching entities
string_matched <- grep(paste(to_match, collapse = "|"), char_org) %>%
entities_org$entity[.] %>%
unique(.)
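## The pipe relies on magrittr's dot placeholder; without it the same
## lookup reads:
# string_matched <- unique(entities_org$entity[grep(paste(to_match, collapse = "|"), char_org)])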
# catch if no match
# Return NA if we found nothing
if (length(string_matched) == 0) {
return(NA)
}
# Return result if we only found one
if (length(string_matched) == 1) {
return(string_matched)
}
## Duplicates
# Create matrix by Levenshtein distance to eliminate duplicates
# Create matrix of Levenshtein distances to eliminate duplicates
string_dup <- sapply(string_matched, function(x) {
sapply(string_matched, function(y) {
@@ -56,7 +55,7 @@ get_university <- function(annotation) {
r <- as.data.frame(string_dup)
# Return a table of logical vectors where the distance is to low -> duplicate
# Return a table of logical vectors where the distance is too low -> duplicate
res <- sapply(r, function(x) {
x <= 5 & x != 0
@@ -67,21 +66,20 @@ get_university <- function(annotation) {
str <- sapply(seq_along(r), function(x) {
colnames(r)[res[x, ]]
})
# Compare these lists and delete every pair which could be a duplicate
# Eliminate possible duplicates
dup <- unlist(str, use.names = FALSE)
result <- string_matched
result <- result[!result %in% dup]
# check if string is empty and return NA
# If we got nothing now (for some reason) return NA
if (length(result) == 0) {
return(NA)
}
# Bind the results together and return them
result <- paste(result, collapse = ", ")
}
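## The nested sapply() above builds the pairwise distance matrix by hand;
## base R's adist() computes the same Levenshtein matrix directly (a sketch,
## assuming the default unit edit costs):
# string_dup <- adist(string_matched)
# dimnames(string_dup) <- list(string_matched, string_matched)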
# Functionality to use cleanNLP
# Author: David
#' Initialize the nlp backend
#'
#' A wrapper used to set the python environment and call cnlp_init
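## A rough sketch of what such a wrapper might look like; the actual body
## sits outside this hunk:
# init_nlp <- function(type, value) {
#   if (type == "conda") reticulate::use_condaenv(value, required = TRUE)
#   cleanNLP::cnlp_init_spacy()
# }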
@@ -33,19 +37,16 @@ init_nlp <- function(type, value) {
#'
#' @return Annotation object for use with cleanNLP methods
#' @export
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
if (use.cache || write.cache) {
if (!missing(article.id) && !missing(article.rev.id)) {
# Generate filename. The article id can arrive padded with leading whitespace
# (apply() coerces rows to character via format()), so we strip the spaces again
filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
# Check if there is a cached version of the annotations for this article in this specific revision
if (use.cache && file.exists(filename)) {
res <- tryCatch({
data <- readRDS(filename)
@@ -81,7 +82,6 @@ create_annotations <- function(text, article.id, article.rev.id, use.cache = TRU
# Return data
# Still need this
return(annotation)
}
\ No newline at end of file
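## In miniature, the caching pattern create_annotations() implements,
## assuming cleanNLP 2.x's cnlp_annotate():
# if (use.cache && file.exists(filename)) {
#   annotation <- readRDS(filename)
# } else {
#   annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
#   if (write.cache) saveRDS(annotation, filename)
# }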
@@ -43,19 +43,10 @@ get_infobox <- function(article) {
info.table <<- NA
})
return(info.table)
}
### TODO:
### * performance?
### * should be a bit better now as regex is lazy
### * make the method be usable with only text or pos tags
### * add matching of 'up to 3 arbitrary words'*{1:3} or something like this
### * maybe expose some functionality to use with custom regex?
### * Testing (!!)
# Author: David
#' Matches the given pattern to the provided annotations object
#'
@@ -114,7 +105,6 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
}
# Let's build the vector we're going to be searching through
# TODO: This way of doing this is probably inefficient
combined <- NA
annot <- cleanNLP::cnlp_get_token(annotations)
@@ -137,7 +127,6 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
combined <- paste(annot$word, annot$upos, sep = "\u001F")
}
# We could also collapse with something else; the query would need slight changes,
# but we could then skip removing spaces. Whether that is worth anything is unclear.
# Also appending an extra space so we can match the last element
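## What the searchable stream plausibly looks like for a three-token
## sentence ("\u001F" is the unit-separator collapse character used above):
# tokens <- c("He", "won", "prizes")
# tags <- c("PRON", "VERB", "NOUN")
# paste(paste(tokens, tags, sep = "\u001F"), collapse = " ")
# yields "He\u001FPRON won\u001FVERB prizes\u001FNOUN"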
@@ -178,7 +167,6 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
# Stitch it all together
expr <- paste(expr, collapse = "")
matches <- gregexpr(expr, combined, ignore.case = ignore.case)
# If there was no match we may exit here
@@ -201,6 +189,5 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
})
}
unname(matches)
}
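## Match text (rather than positions) can be recovered from the stream
## with regmatches(), e.g.:
# hits <- regmatches(combined, matches)[[1]]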