### GetNoOfSpouses.R
### This extracts the number of spouses from the infobox of an article.
### If no infobox or no information about spouses is found, it is assumed
### that there are none.
### Not for use in production: only the infobox is inspected, no information
### is extracted from the article text itself.
# Author: David

## Libraries
library(rvest)
library(data.table)

### Get number of spouses
#
# Counts the entries in the infobox rows whose label contains "Spouse".
# Parenthesised remarks are stripped first, then every remaining non-blank
# line of the field counts as one spouse.
#
# Args:
#   article: HTML source of the article as a single character string (it is
#            searched with grepl() and then handed to read_html()).
#
# Returns:
#   The number of spouse entries found, or 0 if the article has no infobox
#   table or the infobox has no spouse field.
getNoOfSpouses <- function(article) {
  # Cheap pre-check: without the "vcard" marker there cannot be an infobox,
  # so the HTML does not have to be parsed at all
  if (!grepl("vcard", article)) {
    return(0)
  }

  # The marker can also appear outside of an infobox table (plain text,
  # other elements); in that case there is still nothing to read
  infoBox <- .readInfoBox(article)
  if (is.null(infoBox)) {
    return(0)
  }

  # Get the spouse field
  spouses <- infoBox[infoBox$Desc %like% "Spouse", ]$Content

  # Remove everything in parentheses
  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)

  # Split the strings by newlines to get one spouse per line
  spouses <- as.character(unlist(strsplit(spouses, "\n")))

  # Stripping a parenthesised remark that sat on its own line, or doubled
  # <br> tags, leaves blank lines behind, and an empty cell can arrive as NA;
  # none of these is a spouse
  spouses <- spouses[!is.na(spouses) & nzchar(trimws(spouses))]

  if (length(spouses) > 0) {
    return(length(spouses))
  }

  0
}

### Converts info box to table
#
# Args:
#   article: HTML source of the article, see getNoOfSpouses().
#
# Returns:
#   A table with the columns "Desc" (row label) and "Content" (row value)
#   built from the first "table.vcard" element of the article.
#
# Throws an error if the article has no usable infobox, so that should be
# checked beforehand.
getInfoBox <- function(article) {
  table <- .readInfoBox(article)

  if (is.null(table)) {
    stop("No usable info box (table.vcard) found in the article",
         call. = FALSE)
  }

  table
}

### Internal helper: parses the article and extracts the info box
#
# Returns the same Desc/Content table as getInfoBox(), or NULL when the
# article has no "table.vcard" element or that table has fewer than two
# columns.
.readInfoBox <- function(article) {
  # Read page as html
  page <- read_html(article)

  # Extracting text from the html will erase all <br> tags,
  # this will replace them with line breaks.
  # The xml2 functions are called via their namespace because they are only
  # on the search path when rvest attaches xml2
  lineBreaks <- xml2::xml_find_all(page, ".//br")
  xml2::xml_add_sibling(lineBreaks, "p", "\n")
  xml2::xml_remove(lineBreaks)

  # Get the info box
  infoBoxNodes <- html_nodes(page, "table.vcard")
  if (length(infoBoxNodes) == 0) {
    return(NULL)
  }

  # Only the first info box is used, so only that one is converted
  table <- html_table(infoBoxNodes[[1]], fill = TRUE)

  # Ragged rows can widen the filled table beyond two columns; the label and
  # the first value column are the ones of interest
  if (ncol(table) < 2) {
    return(NULL)
  }
  table <- table[seq_len(2)]
  colnames(table) <- c("Desc", "Content")

  table
}