Skip to content
Snippets Groups Projects
nlp_annotate.R 2.33 KiB
Newer Older
David Fuhry's avatar
David Fuhry committed
#' Initialize the nlp backend
#'
#' A wrapper used to select the python environment through reticulate and
#' then initialize the spaCy backend of cleanNLP with entity recognition
#' switched on.
#'
#' @param type Type of python env to use, either "conda" or "python"
#' @param value Connection string, if using a conda environment the name of it
#' if using python directly the path to the python executable
#'
#' @details If type is neither "conda" nor "python" no environment is
#' selected; a warning is raised and the backend is initialized with whatever
#' python installation reticulate picks on its own.
#'
#' @return Does not return data
#' @export
#'
#' @examples
#' \dontrun{
#' init_nlp("conda", "spcy")
#' }
init_nlp <- function(type, value) {
  # isTRUE() keeps NA or multi-element input from crashing the condition; such
  # values simply fall through to the warning branch below.
  if (isTRUE(type == "conda")) {
    reticulate::use_condaenv(value, required = TRUE)
  } else if (isTRUE(type == "python")) {
    reticulate::use_python(value, required = TRUE)
  } else {
    # Previously an unknown type was skipped without any feedback, which made
    # a misspelled type hard to notice. Initialization still proceeds.
    warning(
      "Unknown python env type '", toString(type),
      "', expected \"conda\" or \"python\"; using reticulate's default python.",
      call. = FALSE
    )
  }
  cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
}

#' Create annotations for the given text
#'
#' Returns the cached annotation of the article revision when use.cache is
#' TRUE and a readable cache file exists. Otherwise the text is annotated with
#' cleanNLP::cnlp_annotate and, if write.cache is TRUE, the result is stored
#' as an RDS file. A cache file that cannot be read is reported and the text
#' is annotated again.
#'
#' The cache file is located at
#' data.dir/annotations/article.id-article.rev.id.RDS
#'
#' @param text Text to annotate
#' @param article.id ArticleID used for caching
#' @param article.rev.id ArticleRevisionID used for caching
#' @param use.cache Should cached data be used
#' @param write.cache Should the generated annotations be cached
#' @param data.dir Directory the data should be read from and/or written to
#'
#' @return Annotation object for use with cleanNLP methods
#' @export
create_annotations <- function(text, article.id, article.rev.id,
                               use.cache = TRUE, write.cache = FALSE,
                               data.dir = "data") {

  # Build the cache file name. The ids can arrive padded with whitespace
  # (observed by the original author), so spaces are stripped from the ids.
  # Only the ids are cleaned; data.dir is used exactly as given.

  id.part <- gsub(" ", "", article.id, fixed = TRUE)
  rev.part <- gsub(" ", "", article.rev.id, fixed = TRUE)
  filename <- file.path(
    data.dir, "annotations", paste0(id.part, "-", rev.part, ".RDS")
  )

  # Check if there is a cached version of the annotations for this article in
  # this specific revision. An unreadable cache yields NULL here, so the
  # function falls through and annotates again instead of returning NULL.

  if (use.cache && file.exists(filename)) {
    cached <- tryCatch(
      readRDS(filename),
      error = function(e) {
        message("Cached data seems to be corrupted, redoing annotation.")
        NULL
      }
    )
    if (!is.null(cached)) {
      return(cached)
    }
  }

  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)

  # Write cache if desired. The directory is derived from the file name so
  # that a custom data.dir is honoured; recursive = TRUE creates data.dir and
  # its annotations sub directory in one go.

  if (write.cache) {
    dir.create(dirname(filename), recursive = TRUE, showWarnings = FALSE)
    saveRDS(annotation, filename)
  }

  # Return data (implicitly, as the last evaluated expression)

  annotation
}