diff --git a/r/GetBirthdate.R b/r/GetBirthdate.R new file mode 100644 index 0000000000000000000000000000000000000000..a925372dabe219beb4ccf7593f00990c1e9e159c --- /dev/null +++ b/r/GetBirthdate.R @@ -0,0 +1,96 @@ +#!/usr/bin/env Rscript +### Extracts birthdate fronm the infobox +### If there is no infobox the Introduction text can be checked + +# Author: Lukas + +## Librarys + +library(rvest) +library(stringr) +library(data.table) + +### Try to extract birthdate from infobox +### If there is no infobox, try to extract from introduction text +getBirthdate <- function(article) { + # check + if(!grepl("vcard", article)) { + # check first paragraph + introduction <- getIntroduction(article) + if(!introduction == "") { + # get birthdate inside of parentheses + birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]] + # remove parentheses + birthdate <- substring(birthdate, 2, nchar(birthdate)-1) + } else { + # retrun Null if there is no birthdate + return(0) + } + } + + # try to get birthdate via infobox + infoBox <- getInfoBox(article) + + # get the born field + birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content + + # remove everything except the birthdate + # remove everything in round brackets + birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate) + # remove everything starting with newline + birthdate <- gsub("\\n.*$", "", birthdate) + return(birthdate) + +} + +### Uses Davids function to get infobox +### Converts info box to table +getInfoBox <- function(article) { + # Read page as html + page <- read_html(article) + + # Extracting text from the html will erase all <br> tags, + # this will replace them with line breaks + + xml_find_all(page, ".//br") %>% + xml_add_sibling("p", "\n") + + xml_find_all(page, ".//br") %>% + xml_remove() + + # Get the info box + # Will throw an error if there isnt any, so that should be checked beforehand + + table <- page %>% + html_nodes("table.vcard") %>% + html_table(fill = TRUE) %>% + .[[1]] + + colnames(table) <- c("Desc", "Content") + + return(table) +} + +### Get Introduction Text from Wikipedia page that contains birthdate +getIntroduction <- function(article) { + # Read page as html + page <- read_html(article) + + # Extracting text from the html will erase all <br> tags, + # This will replace them with line breaks + + xml_find_all(page, ".//br") %>% + xml_add_sibling("p", "\n") + + xml_find_all(page, ".//br") %>% + xml_remove + + # Get first paragraph + introduction <- page %>% + html_nodes("p") %>% + html_text() %>% + .[[1]] + + # Return first paragraph + return(introduction) +}