# nlp_annotate.R
#' Initialize the nlp backend
#'
#' Thin wrapper that first points reticulate at the requested python
#' installation and afterwards initialises the spaCy backend of cleanNLP
#' with entity recognition switched on.
#'
#' @param type Kind of python environment to bind to, either "conda" or
#'   "python". For any other value no environment is selected before the
#'   backend is initialised.
#' @param value Identifies the environment: the name of the conda environment
#'   when `type` is "conda", the path to the python executable when `type` is
#'   "python".
#'
#' @return Does not return data
#' @export
#'
#' @examples
#' \dontrun{
#' init_nlp("conda", "spcy")
#' }
init_nlp <- function(type, value) {
  # The two branches are mutually exclusive, so their order does not matter.
  wants_python <- type == "python"
  if (wants_python) {
    reticulate::use_python(value, required = TRUE)
  } else {
    wants_conda <- type == "conda"
    if (wants_conda) {
      reticulate::use_condaenv(value, required = TRUE)
    }
  }

  # Bring up spaCy with entity recognition enabled.
  cleanNLP::cnlp_init_spacy(entity_flag = TRUE)
}

#' Create annotations for the given text
#'
#' Annotates `text` with `cleanNLP::cnlp_annotate()`. Optionally an existing
#' annotation is read from, and/or the fresh annotation is written to, an RDS
#' file named `<article.id>-<article.rev.id>.RDS` inside the `annotations`
#' sub-directory of `data.dir`.
#'
#' @param text Text to annotate
#' @param article.id ArticleID used for caching. **Optional** as long as cache is not to be used.
#' @param article.rev.id ArticleRevisionID used for caching. **Optional** as long as cache is not to be used.
#' @param use.cache Should cached data be used
#' @param write.cache Should the generated annotations be cached
#' @param data.dir Directory the data should be read from and/or written to
#'
#' @return Annotation object for use with cleanNLP methods. When a readable
#'   cache file exists and `use.cache` is `TRUE`, the cached object is returned
#'   and no new annotation is computed.
#' @export
create_annotations <- function(text, article.id, article.rev.id, use.cache = TRUE, write.cache = FALSE, data.dir = "data") {

  # missing() has to be evaluated here, in the frame of the function itself.
  has.ids <- !missing(article.id) && !missing(article.rev.id)
  cache.dir <- file.path(data.dir, "annotations")
  filename <- NULL

  if ((use.cache || write.cache) && has.ids) {
    # The ids have been observed to arrive padded with blanks. Strip blanks
    # from the file name only, so a data.dir that contains spaces stays intact.
    cache.file <- gsub(" ", "", paste0(article.id, "-", article.rev.id, ".RDS"), fixed = TRUE)
    filename <- file.path(cache.dir, cache.file)
  }

  if (use.cache) {
    if (!has.ids) {
      warning("use.cache was set to true without providing article id and revision id, cache will not be used.")
    } else if (file.exists(filename)) {
      # Check if there is a cached version of the annotations for this article
      # in this specific revision. An unreadable file yields NULL so that we
      # fall through and annotate again instead of handing NULL to the caller.
      cached <- tryCatch(
        readRDS(filename),
        error = function(e) {
          warning("Cached data seems to be corrupted, redoing annotation.", call. = FALSE)
          NULL
        }
      )
      if (!is.null(cached)) {
        return(cached)
      }
    }
  }

  annotation <- cleanNLP::cnlp_annotate(text, as_strings = TRUE)

  # Write cache if desired
  if (write.cache) {
    if (has.ids) {
      if (!dir.exists(cache.dir)) {
        # recursive = TRUE also creates data.dir itself when it is missing.
        dir.create(cache.dir, recursive = TRUE)
      }
      saveRDS(annotation, filename)
    } else {
      warning("write.cache was set to true without providing article id and revision id, cache will not be written")
    }
  }

  annotation
}