Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
### GetNoOfSpouses.R
### This extracts the number of spouses from the infobox
### If no infobox or no information about spouses is found, it assumes there are none
### Not for use in production: this does not actually get information from text
# Author: David
#' Reads the number of spouses from the infobox of a Wikipedia article
#'
#' Counts the non-blank lines of the infobox field(s) whose description
#' matches "Spouse", after stripping parenthesised remarks. Articles without
#' an infobox (no "vcard" marker in the html) or without a spouse field
#' yield zero.
#'
#' @param article Wikipedia article in html format
#'
#' @return Integer indicating the number of spouses
#' @export
#'
#' @examples
#' \dontrun{
#' articles <- get_data()
#'
#' no.spouses <- get_no_of_spouses(articles$Text[54])
#'
#' no.spouses
#' }
get_no_of_spouses <- function(article) {
  # If there is no infobox we assume there were no spouses
  if (!grepl("vcard", article)) {
    return(0L)
  }

  infoBox <- get_infobox(article)

  # Get the spouse field
  spouses <- infoBox[infoBox$Desc %like% "Spouse", ]$Content

  # Remove everything in parentheses
  spouses <- gsub("\\s*\\([^\\)]+\\)", "", spouses)

  # Split the strings by newlines to get one spouse per line
  spouses <- as.character(unlist(strsplit(spouses, "\n")))

  # Missing values and blank / whitespace-only lines are not spouses; without
  # this filter consecutive or trailing newlines would inflate the count
  spouses <- spouses[!is.na(spouses) & nzchar(trimws(spouses))]

  # length() is already an integer, matching the early return above
  length(spouses)
}