-
Lucas Schons authored
* Create R package containing function definitions * Create directory processing containing scripts * fix some typos * general cleanup
Lucas Schons authored* Create R package containing function definitions * Create directory processing containing scripts * fix some typos * general cleanup
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
GetNoOfSpouses.R 1.48 KiB
### GetNoOfSpouses.R
### This extracts the number of spouses from the infobox
### If no infobox or no information about spouses is found, assumes there are none
### Not for use in production, this does not actually get information from text
# Author: David
## Libraries
library(rvest)
library(data.table)
### Get number of spouses
# Counts the entries listed in the "Spouse" field(s) of an article's infobox.
#
# article: the article content; it is searched for "vcard" with grepl() and
#          then handed to getInfoBox().
# Returns: 0 if the article contains no "vcard" marker or no non-empty spouse
#          entry is found, otherwise the number of spouse entries.
getNoOfSpouses <- function(article) {
  # If there is no infobox we assume there were no spouses
  if (!grepl("vcard", article)) {
    return(0)
  }

  infoBox <- getInfoBox(article)

  # Get the spouse field(s). grepl() is the base function that data.table's
  # %like% wraps, so this selects the same rows without needing data.table.
  spouses <- infoBox$Content[grepl("Spouse", infoBox$Desc)]

  # Remove everything in parentheses (e.g. marriage dates)
  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)

  # Split the strings by newlines to get one spouse per line
  spouses <- unlist(strsplit(spouses, "\n"))

  # Blank lines (leading newline, doubled line breaks) and NA cells are not
  # spouses, so drop them before counting
  spouses <- trimws(spouses)
  spouses <- spouses[!is.na(spouses) & nzchar(spouses)]

  if (length(spouses) > 0) {
    return(length(spouses))
  }
  0
}
### Converts info box to table
# Parses the article and returns its first "table.vcard" element as a table
# whose columns are named "Desc" and "Content".
#
# article: the article content, passed straight to read_html().
# Returns: the table produced by html_table() for the first infobox.
# Errors:  stops if the article contains no table with class "vcard", so
#          callers should check for an infobox beforehand.
#
# All xml2/rvest functions are namespace-qualified: newer rvest releases no
# longer attach xml2, so the bare xml_*() calls are not found when only
# library(rvest) has been loaded.
getInfoBox <- function(article) {
  # Read page as html
  page <- xml2::read_html(article)

  # Extracting text from the html will erase all <br> tags,
  # so put a paragraph holding a line break after each one ...
  xml2::xml_add_sibling(xml2::xml_find_all(page, ".//br"), "p", "\n")
  # ... and then drop the <br> tags themselves
  xml2::xml_remove(xml2::xml_find_all(page, ".//br"))

  # Get the info box
  infoBoxes <- rvest::html_table(
    rvest::html_nodes(page, "table.vcard"),
    fill = TRUE
  )
  if (length(infoBoxes) == 0) {
    stop("No infobox (table.vcard) found in article", call. = FALSE)
  }

  infoBox <- infoBoxes[[1]]
  colnames(infoBox) <- c("Desc", "Content")
  infoBox
}