Commit bfd535a8 authored by David Fuhry

Merge branch '74-r-make-stuff-pretty' into 'master'

Resolve "R: Make stuff pretty"

Closes #74

See merge request !77
parents 003b3aee 1bfa4b74
@@ -15,10 +15,13 @@ init_nlp("conda", "spcy")
cat("Starting data import...\n")
# Define paths
project_root <- find_root(has_file("README.md"))
data_dir <- paste(project_root, "data", sep = .Platform$file.sep)
rasa_dir <- paste(project_root, "rasa", sep = .Platform$file.sep)
# Download data or use cache if we have it
articles <- get_data(use.cache = TRUE, data.dir = data_dir)
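## The paste() calls above are equivalent to base R's file.path(), which
## joins path components with the platform separator, e.g.:
# data_dir <- file.path(project_root, "data")
# rasa_dir <- file.path(project_root, "rasa")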
## Data processing
@@ -29,15 +32,15 @@ results <- pbapply(articles, 1, function(article) {
# Within this function, article is a vector representing a single row of our original data frame
# This means article[1] represents the Title, article[2] the PageID, etc.
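## Note that apply() coerces each row to character, so numeric columns such
## as the PageID arrive as strings. A minimal sketch of the access pattern,
## assuming columns Title, PageID, RevID and Text:
# demo <- data.frame(Title = "Example", PageID = 42, RevID = 7, Text = "<p>...</p>")
# apply(demo, 1, function(article) article[2])  # "42" (character, not numeric)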
## Data cleaning
# Data cleaning
cleaned_text <- clean_html(article[4])
## Data preprocessing/annotating
# Data preprocessing/annotating
annotation <- create_annotations(cleaned_text, article[2], article[3], data.dir = data_dir)
## Extract information from Text
# Extract information from Text
spouse_found <- get_spouse(article[4], annotation)
awards_found <- get_awards(annotation)
@@ -45,7 +48,7 @@ results <- pbapply(articles, 1, function(article) {
birthdate_found <- get_birthdate(cleaned_text, annotation)
nationality_found <- get_nationality(cleaned_text, annotation)
## Create Results
# Create Results
data.frame(name = article[1],
spouse = spouse_found,
@@ -67,8 +70,6 @@ results <- do.call(rbind, results)
cat("Data processing finished.\n")
## Results are now in results
## Format for rasa
cat("Writing rasa files to 'rasa/'...\n")
@@ -2,7 +2,7 @@
# Author: David
## Though we could get the pages within the category 'physicists' with something like this
## Though we could get the pages within the category 'physicists' with something like
## pages_in_category("en", "wikipedia", categories = "physicists")$query$categorymembers
## this gives us only about 50 pages.
## Instead we crawl the names on the article 'List of Physicists' and query those names
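## A short sketch of the category query mentioned above, assuming
## WikipediR's list-of-lists response shape:
# library(WikipediR)
# members <- pages_in_category("en", "wikipedia",
#                              categories = "physicists")$query$categorymembers
# titles <- vapply(members, function(m) m$title, character(1))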
#!/usr/bin/env Rscript
### Function to extract academic affiliation from text
## TODO: check if the ORG is in relation with another person
## TODO: clean parentheses
## Authors: Leonard
#' Try to extract all places where the person worked
#' To-do: 1) check if the ORG is in relation with another person
#' 2) clean parentheses
#' Function to extract academic affiliation from text
#'
#' @param annotation annotations object from cleanNLP's createAnnotations() function.
#'
#' @return string with all found Academia organizations
#' @export
#'
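#' @examples
#' \dontrun{
#' # Hypothetical usage; assumes an annotation built by create_annotations()
#' annotation <- create_annotations(cleaned_text, article.id, article.rev.id)
#' get_university(annotation)
#' }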
get_university <- function(annotation) {
# Extracts the entities with cleanNLP
# Extracts the entities
entities <- cleanNLP::cnlp_get_entity(annotation)
# Extracts only the organization entities' strings, with sid, and only the char vectors
# Extracts only the organization entities
entities_org <- entities[entities$entity_type == "ORG", ]
char_org <- entities_org$entity
## Matching Keywords
# strings we want to match
## Find relevant entities
# Define keywords
to_match <- c("Academy", "University", "Institute", "Department", "Research Centre")
# takes in to_match and extracts them from Organization Entities
# Find matching entities
string_matched <- grep(paste(to_match, collapse = "|"), char_org) %>%
entities_org$entity[.] %>%
unique(.)
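## The pipe relies on magrittr's dot placeholder; without it the same
## lookup reads:
# string_matched <- unique(entities_org$entity[grep(paste(to_match, collapse = "|"), char_org)])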
# catch if no match
# Return NA if we found nothing
if (length(string_matched) == 0) {
return(NA)
}
# Return result if we only found one
if (length(string_matched) == 1) {
return(string_matched)
}
## Duplicates
# Create matrix by Levenshtein distance to eliminate duplicates
# Create matrix of Levenshtein distances to eliminate duplicates
string_dup <- sapply(string_matched, function(x) {
sapply(string_matched, function(y) {
@@ -56,7 +55,7 @@ get_university <- function(annotation) {
r <- as.data.frame(string_dup)
# Return a table of logical vectors where the distance is to low -> duplicate
# Return a table of logical vectors where the distance is too low -> duplicate
res <- sapply(r, function(x) {
x <= 5 & x != 0
@@ -67,21 +66,20 @@ get_university <- function(annotation) {
str <- sapply(seq_along(r), function(x) {
colnames(r)[res[x, ]]
})
# Compare these lists and delete every pair which could be a duplicate
# Eliminate possible duplicates
dup <- unlist(str, use.names = FALSE)
result <- string_matched
result <- result[!result %in% dup]
# check if string is empty and return NA
# If we got nothing now (for some reason) return NA
if (length(result) == 0) {
return(NA)
}
# Bind the results together and return them
result <- paste(result, collapse = ", ")
}
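## The nested sapply() above builds the pairwise distance matrix by hand;
## base R's adist() computes the same Levenshtein matrix directly (a sketch,
## assuming the default unit edit costs):
# string_dup <- adist(string_matched)
# dimnames(string_dup) <- list(string_matched, string_matched)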
# Functionality to use cleanNLP
# Author: David
#' Initialize the nlp backend
#'
#' A wrapper used to set the python environment and call cnlp_init
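## A rough sketch of what such a wrapper might look like; the actual body
## sits outside this hunk:
# init_nlp <- function(type, value) {
#   if (type == "conda") reticulate::use_condaenv(value, required = TRUE)
#   cleanNLP::cnlp_init_spacy()
# }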
@@ -33,19 +37,16 @@ init_nlp <- function(type, value) {
#'
#' @return Annotation object for use with cleanNLP methods
#' @export
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {
if (use.cache || write.cache) {
if (!missing(article.id) && !missing(article.rev.id)) {
# Generate filename. The article id can arrive padded with leading whitespace
# (apply() coerces rows to character via format()), so we strip the spaces again
filename <- gsub(" ", "", paste(data.dir, "annotations", paste0(article.id, "-", article.rev.id, ".RDS"), sep = .Platform$file.sep), fixed = TRUE)
# Check if there is a cached version of the annotations for this article in this specific revision
if (use.cache && file.exists(filename)) {
res <- tryCatch({
data <- readRDS(filename)
@@ -81,7 +82,6 @@ create_annotations <- function(text, article.id, article.rev.id, use.cache = TRU
# Return data
# Still need this
return(annotation)
}
\ No newline at end of file
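## In miniature, the caching pattern create_annotations() implements,
## assuming cleanNLP 2.x's cnlp_annotate():
# if (use.cache && file.exists(filename)) {
#   annotation <- readRDS(filename)
# } else {
#   annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)
#   if (write.cache) saveRDS(annotation, filename)
# }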
@@ -43,19 +43,10 @@ get_infobox <- function(article) {
info.table <<- NA
})
return(info.table)
}
### TODO:
### * performance?
### * should be a bit better now as regex is lazy
### * make the method be usable with only text or pos tags
### * add matching of 'up to 3 arbitrary words'*{1:3} or something like this
### * maybe expose some functionality to use with custom regex?
### * Testing (!!)
# Author: David
#' Matches the given pattern to the provided annotations object
#'
@@ -114,7 +105,6 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
}
# Let's build the vector we're going to be searching through
# TODO: This way of doing this is probably inefficient
combined <- NA
annot <- cleanNLP::cnlp_get_token(annotations)
@@ -137,7 +127,6 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
combined <- paste(annot$word, annot$upos, sep = "\u001F")
}
# We could also collapse with something else; the query would need slight changes,
# but we could then skip removing spaces. Whether that is worth anything is unclear.
# Also appending an extra space so we can match the last element
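## What the searchable stream plausibly looks like for a three-token
## sentence ("\u001F" is the unit-separator collapse character used above):
# tokens <- c("He", "won", "prizes")
# tags <- c("PRON", "VERB", "NOUN")
# paste(paste(tokens, tags, sep = "\u001F"), collapse = " ")
# yields "He\u001FPRON won\u001FVERB prizes\u001FNOUN"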
@@ -178,7 +167,6 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
# Stitch it all together
expr <- paste(expr, collapse = "")
matches <- gregexpr(expr, combined, ignore.case = ignore.case)
# If there was no match we may exit here
@@ -201,6 +189,5 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
})
}
unname(matches)
}
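## Match text (rather than positions) can be recovered from the stream
## with regmatches(), e.g.:
# hits <- regmatches(combined, matches)[[1]]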