Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
GetNoOfSpouses.R 1.48 KiB
### GetNoOfSpouses.R
### This extracts the number of spouses from the infobox
### If no infobox or no information about spouses is found assumes there are none
### Not for use in production, this does not actually get information from text

# Author: David

## Libraries

library(rvest)
library(data.table)

### Get number of spouses
#
# Counts the entries listed in the infobox rows whose description contains
# "Spouse".
#
# article: HTML of the article as a character string; it is searched for
#          "vcard" and then handed to getInfoBox().
#
# Returns the number of non-blank lines found in the matching rows, or 0 when
# the article has no infobox or no spouse information.
getNoOfSpouses <- function(article) {

  # If there is no infobox we assume there were no spouses
  if (!grepl("vcard", article)) {
    return(0)
  }

  infoBox <- getInfoBox(article)

  # Get the spouse field(s); the pattern also matches labels such as
  # "Spouse(s)"
  spouses <- infoBox$Content[grepl("Spouse", infoBox$Desc)]

  # Remove everything in parentheses. This has to happen before the split so
  # that a parenthesised note containing a line break is removed as a whole.
  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)

  # Split the strings by newlines to get one spouse per line
  spouses <- unlist(strsplit(spouses, "\n"))

  # Blank lines (e.g. from consecutive line breaks or from a line that only
  # held a parenthesised note) and missing values are not spouses.
  # grepl() returns FALSE for NA, so those are dropped here as well.
  spouses <- spouses[grepl("[^[:space:]]", spouses)]

  nSpouses <- length(spouses)
  if (nSpouses > 0) {
    return(nSpouses)
  }
  0
}

### Converts info box to table
#
# Parses the article HTML and converts its first infobox (the first <table>
# with class "vcard") into a table.
#
# article: HTML of the article; passed unchanged to read_html().
#
# Returns the result of html_table() for that infobox, with its first two
# columns named "Desc" and "Content".
#
# Throws an error if the article contains no infobox, so that should be
# checked beforehand.
getInfoBox <- function(article) {
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html will erase all <br> tags, so each one gets
  # a sibling node holding a line break and is then removed.
  # xml2 modifies the document in place, which is why nothing is assigned.
  # The calls are namespace-qualified because library(rvest) no longer
  # attaches xml2 as of rvest 1.0.0.
  lineBreaks <- xml2::xml_find_all(page, ".//br")
  xml2::xml_add_sibling(lineBreaks, "p", "\n")
  xml2::xml_remove(lineBreaks)

  # Get the info box
  infoBoxNodes <- html_nodes(page, "table.vcard")
  if (length(infoBoxNodes) == 0) {
    stop("No infobox (table.vcard) found in article", call. = FALSE)
  }

  # Only the first infobox is used, so only that one is converted
  infoBox <- html_table(infoBoxNodes[[1]], fill = TRUE)

  # Name only the first two columns so a wider table does not break the
  # assignment; any further columns keep the names html_table() gave them
  colnames(infoBox)[1:2] <- c("Desc", "Content")

  infoBox
}