Skip to content
Snippets Groups Projects
Commit 537298d4 authored by David Fuhry's avatar David Fuhry :fist: Committed by Lucas Schons
Browse files

Resolve "Make get_universtiy mergable"

parent 6d48bdb1
No related branches found
No related tags found
No related merge requests found
......@@ -41,6 +41,7 @@ results <- pbapply(articles, 1, function(article) {
spouse_found <- get_spouse(article[4], annotation)
awards <- get_awards(annotation)
university_found <- get_university(annotation)
## Create Results
......@@ -52,7 +53,7 @@ results <- pbapply(articles, 1, function(article) {
place_of_death = NA,
is_alive = NA,
primary_education = NA,
university = NA,
university = university_found,
area_of_research = NA,
workplace = NA,
awards = awards,
......
......@@ -6,6 +6,8 @@ export(get_awards)
export(get_birthdate)
export(get_birthplace)
export(get_data)
export(get_no_of_spouses)
export(get_university)
export(get_spouse)
export(init_nlp)
importFrom(data.table,"%like%")
......
#!/usr/bin/env Rscript
### Function to extract academic affiliation from text
## Authors: Leonard
#' Extract academic affiliations from an annotated text
#'
#' Takes the organisation (`ORG`) entities found by cleanNLP, keeps those whose
#' name contains one of a fixed set of academic keywords, and drops every name
#' that is a close (but not exact) approximate match of another kept name.
#'
#' To-do: 1) check if the ORG is in relation with another person
#' 2) clean parentheses
#'
#' @param annotation annotations object from cleanNLP's createAnnotations() function.
#'
#' @return A single string with the remaining organisation names separated by
#'   `", "`, or `NA` if no organisation matched or none is left after the
#'   near-duplicate filter.
#' @export
#'
get_university <- function(annotation) {
  # Entities recognised by cleanNLP; only organisations are of interest here.
  entities <- cleanNLP::cnlp_get_entity(annotation)
  entities_org <- entities[entities$entity_type == "ORG", ]
  char_org <- entities_org$entity

  ## Matching keywords
  # An organisation counts as academic if its name contains any of these.
  to_match <- c(
    "Academy", "University", "Institute", "Department", "Research Centre"
  )
  pattern <- paste(to_match, collapse = "|")
  string_matched <- unique(char_org[grep(pattern, char_org)])

  # Nothing matched: signal "unknown" to the caller.
  if (length(string_matched) == 0) {
    return(NA)
  }
  # A single match cannot have a duplicate, so skip the distance computation.
  if (length(string_matched) == 1) {
    return(string_matched)
  }

  ## Duplicates
  # distances[i, j] is the approximate distance of name i against name j.
  # partial = TRUE is not symmetric, hence the full matrix is needed.
  distances <- utils::adist(string_matched, string_matched, partial = TRUE)

  # A distance of 0 (the name itself, or a name fully contained in another) is
  # not treated as a duplicate; anything from 1 to 5 edits is.
  is_near <- distances > 0 & distances <= 5
  has_near_match <- rowSums(is_near, na.rm = TRUE) > 0

  # NOTE(review): every name that has a near match is dropped, so no
  # representative of a near-duplicate group is kept. This mirrors the
  # original behaviour -- confirm that it is intended.
  result <- string_matched[!has_near_match]

  # All candidates were flagged as duplicates of each other.
  if (length(result) == 0) {
    return(NA)
  }

  # Return visibly (the value used to be the result of an assignment, which R
  # returns invisibly).
  paste(result, collapse = ", ")
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_university.R
\name{get_university}
\alias{get_university}
\title{Try to extract all places where the person worked
To-do: 1) check if the ORG is in relation with another person
2) clean parentheses}
\usage{
get_university(annotation)
}
\arguments{
\item{annotation}{annotations object from cleanNLP's createAnnotations() function.}
}
\value{
string with all found Academia organizations
}
\description{
Try to extract all places where the person worked
To-do: 1) check if the ORG is in relation with another person
2) clean parentheses
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment