Resolve "Make get_university mergeable"

537298d4 · David Fuhry · Lucas Schons · 6d48bdb1 · 537298d4 · 537298d4
Commit 537298d4 authored 6 years ago by David Fuhry Committed by Lucas Schons 6 years ago
--- a/processing/script/master.R
+++ b/processing/script/master.R
@@ -41,6 +41,7 @@ results <- pbapply(articles, 1, function(article) {

  spouse_found <- get_spouse(article[4], annotation)
  awards <- get_awards(annotation)
+  university_found <- get_university(annotation)

  ## Create Results

@@ -52,7 +53,7 @@ results <- pbapply(articles, 1, function(article) {
             place_of_death = NA,
             is_alive = NA,
             primary_education = NA,
-             university = NA,
+             university = university_found,
             area_of_research = NA,
             workplace = NA,
             awards = awards,

--- a/processing/wikiproc/NAMESPACE
+++ b/processing/wikiproc/NAMESPACE
@@ -6,6 +6,8 @@ export(get_awards)
 export(get_birthdate)
 export(get_birthplace)
 export(get_data)
+export(get_no_of_spouses)
+export(get_university)
 export(get_spouse)
 export(init_nlp)
 importFrom(data.table,"%like%")

--- a/processing/wikiproc/R/get_university.R
+++ b/processing/wikiproc/R/get_university.R
+#!/usr/bin/env Rscript
+
+
+### Function to extract academic affiliation from text
+
+## Authors: Leonard
+
+
+#' Extract academic organisations mentioned in an annotated text
+#'
+#' Collects the named entities of type `"ORG"` from the annotation, keeps
+#' those whose text contains one of the keywords "Academy", "University",
+#' "Institute", "Department" or "Research Centre", and drops names that
+#' look like near-duplicates of another match.
+#'
+#' A name counts as a near-duplicate when it approximately occurs inside
+#' another matched name with a partial edit distance between 1 and 5.
+#' Names that are contained verbatim in another name (distance 0) are kept.
+#'
+#' To-do: 1) check if the ORG is in relation with another person
+#'        2) clean parentheses
+#'
+#' @param annotation annotations object from cleanNLP's
+#'   createAnnotations() function.
+#'
+#' @return A single string with the remaining organisation names joined
+#'   by `", "`, or `NA_character_` if no academic organisation is found or
+#'   every match was discarded as a near-duplicate.
+#' @export
+#'
+get_university <- function(annotation) {
+  # Named entities found by cleanNLP for this annotation.
+  entities <- cleanNLP::cnlp_get_entity(annotation)
+
+  # Keep organisation names only. which() skips rows whose entity type is
+  # NA instead of turning them into all-NA rows.
+  org_names <- entities$entity[which(entities$entity_type == "ORG")]
+
+  # Keywords that mark an organisation as academic.
+  to_match <- c(
+    "Academy", "University", "Institute", "Department", "Research Centre"
+  )
+  keyword_pattern <- paste(to_match, collapse = "|")
+
+  # value = TRUE yields the matching names themselves; NA entries never
+  # match and are therefore dropped here.
+  string_matched <- unique(grep(keyword_pattern, org_names, value = TRUE))
+
+  if (length(string_matched) == 0) {
+    return(NA_character_)
+  }
+
+  # distances[i, j] is the partial edit distance of name i matched inside
+  # name j. For a single match this is a 1 x 1 matrix holding 0.
+  distances <- utils::adist(string_matched, partial = TRUE)
+
+  # A distance of 0 is ignored on purpose: it covers the diagonal (a name
+  # compared with itself) and names contained verbatim in a longer one.
+  # Anything from 1 to 5 edits marks name i as a near-duplicate.
+  is_near_duplicate <- rowSums(distances > 0 & distances <= 5) > 0
+  result <- string_matched[!is_near_duplicate]
+
+  if (length(result) == 0) {
+    return(NA_character_)
+  }
+
+  paste(result, collapse = ", ")
+}
--- a/processing/wikiproc/man/get_university.Rd
+++ b/processing/wikiproc/man/get_university.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_university.R
+\name{get_university}
+\alias{get_university}
+\title{Try to extract all places where the person worked 
+To-do: 1) check if the ORG is in relation with another person
+       2) clean Parentices}
+\usage{
+get_university(annotation)
+}
+\arguments{
+\item{annotation}{annotations object from cleanNLP's createAnnotations() function.}
+}
+\value{
+string with all found Academia organizations
+}
+\description{
+Try to extract all places where the person worked 
+To-do: 1) check if the ORG is in relation with another person
+       2) clean Parentices
+}