Skip to content
Snippets Groups Projects
get_no_of_spouses.R 1.13 KiB
Newer Older
David Fuhry's avatar
David Fuhry committed
### get_no_of_spouses.R
### This extracts the number of spouses from the infobox.
### If no infobox or no information about spouses is found, it assumes there are none.
### Not for use in production: this does not actually get information from the text.

# Author: David

#' Reads the number of spouses from the infobox of a Wikipedia article
#'
#' Looks up the infobox row(s) whose description contains "Spouse", removes
#' everything in parentheses, splits the remaining content on newlines and
#' counts the non-empty lines. If the article has no infobox, or the infobox
#' has no usable spouse entry, the result is 0.
#'
#' @param article Wikipedia article in html format
#'
#' @return Integer indicating the number of spouses
#' @export
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' no.spouses <- get_no_of_spouses(articles$Text[54])
#'
#' no.spouses
#' }
get_no_of_spouses <- function(article) {
  # If there is no infobox we assume there were no spouses
  if (!grepl("vcard", article, fixed = TRUE)) {
    return(0L)
  }

  # NOTE(review): assumes get_infobox() yields a table with `Desc` and
  # `Content` columns, as the lookups below require -- confirm against its
  # definition.
  info_box <- get_infobox(article)

  # Get the spouse field(s); missing content carries no information
  is_spouse <- grepl("Spouse", info_box$Desc)
  spouses <- as.character(info_box$Content[is_spouse])
  spouses <- spouses[!is.na(spouses)]

  # Remove everything in parentheses (including the whitespace before it)
  spouses <- gsub("\\s*\\([^)]*\\)", "", spouses)

  # Split the strings by newlines to get one spouse per line
  spouses <- unlist(strsplit(spouses, "\n", fixed = TRUE))

  # Blank or whitespace-only lines are layout residue, not spouses
  spouses <- trimws(spouses)
  sum(nzchar(spouses))
}