From 168ac46afa1e6b0b7d96e956e3cc3504b80988e0 Mon Sep 17 00:00:00 2001 From: Lukas Gehrke <lukasgehrke@Lukass-MacBook-Pro.local> Date: Tue, 1 Jan 2019 14:27:20 +0100 Subject: [PATCH] =?UTF-8?q?F=C3=BCgt=20Skript=20f=C3=BCr=20Birthdate-Extra?= =?UTF-8?q?ktion=20hinzu.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- r/GetBirthdate.R | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 r/GetBirthdate.R diff --git a/r/GetBirthdate.R b/r/GetBirthdate.R new file mode 100644 index 0000000..a925372 --- /dev/null +++ b/r/GetBirthdate.R @@ -0,0 +1,96 @@ +#!/usr/bin/env Rscript +### Extracts birthdate fronm the infobox +### If there is no infobox the Introduction text can be checked + +# Author: Lukas + +## Librarys + +library(rvest) +library(stringr) +library(data.table) + +### Try to extract birthdate from infobox +### If there is no infobox, try to extract from introduction text +getBirthdate <- function(article) { + # check + if(!grepl("vcard", article)) { + # check first paragraph + introduction <- getIntroduction(article) + if(!introduction == "") { + # get birthdate inside of parentheses + birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]] + # remove parentheses + birthdate <- substring(birthdate, 2, nchar(birthdate)-1) + } else { + # retrun Null if there is no birthdate + return(0) + } + } + + # try to get birthdate via infobox + infoBox <- getInfoBox(article) + + # get the born field + birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content + + # remove everything except the birthdate + # remove everything in round brackets + birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate) + # remove everything starting with newline + birthdate <- gsub("\\n.*$", "", birthdate) + return(birthdate) + +} + +### Uses Davids function to get infobox +### Converts info box to table +getInfoBox <- function(article) { + # Read page as html + page <- read_html(article) + + # Extracting text from the html will erase all <br> tags, + # this will replace them with line breaks + + xml_find_all(page, ".//br") %>% + xml_add_sibling("p", "\n") + + xml_find_all(page, ".//br") %>% + xml_remove() + + # Get the info box + # Will throw an error if there isnt any, so that should be checked beforehand + + table <- page %>% + html_nodes("table.vcard") %>% + html_table(fill = TRUE) %>% + .[[1]] + + colnames(table) <- c("Desc", "Content") + + return(table) +} + +### Get Introduction Text from Wikipedia page that contains birthdate +getIntroduction <- function(article) { + # Read page as html + page <- read_html(article) + + # Extracting text from the html will erase all <br> tags, + # This will replace them with line breaks + + xml_find_all(page, ".//br") %>% + xml_add_sibling("p", "\n") + + xml_find_all(page, ".//br") %>% + xml_remove + + # Get first paragraph + introduction <- page %>% + html_nodes("p") %>% + html_text() %>% + .[[1]] + + # Return first paragraph + return(introduction) +} -- GitLab