Skip to content
Snippets Groups Projects
Commit 168ac46a authored by Lukas Gehrke's avatar Lukas Gehrke
Browse files

Fügt Skript für Birthdate-Extraktion hinzu.

parent 3207c37f
No related branches found
No related tags found
1 merge request!21Resolve "R-Skript für Birthdate erstellen"
#!/usr/bin/env Rscript
### Extracts birthdate fronm the infobox
### If there is no infobox the Introduction text can be checked
# Author: Lukas
## Librarys
library(rvest)
library(stringr)
library(data.table)
### Try to extract birthdate from infobox
### If there is no infobox, try to extract from introduction text
getBirthdate <- function(article) {
# check
if(!grepl("vcard", article)) {
# check first paragraph
introduction <- getIntroduction(article)
if(!introduction == "") {
# get birthdate inside of parentheses
birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]]
# remove parentheses
birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
} else {
# retrun Null if there is no birthdate
return(0)
}
}
# try to get birthdate via infobox
infoBox <- getInfoBox(article)
# get the born field
birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content
# remove everything except the birthdate
# remove everything in round brackets
birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate)
# remove everything starting with newline
birthdate <- gsub("\\n.*$", "", birthdate)
return(birthdate)
}
### Uses Davids function to get infobox
### Converts info box to table
getInfoBox <- function(article) {
# Read page as html
page <- read_html(article)
# Extracting text from the html will erase all <br> tags,
# this will replace them with line breaks
xml_find_all(page, ".//br") %>%
xml_add_sibling("p", "\n")
xml_find_all(page, ".//br") %>%
xml_remove()
# Get the info box
# Will throw an error if there isnt any, so that should be checked beforehand
table <- page %>%
html_nodes("table.vcard") %>%
html_table(fill = TRUE) %>%
.[[1]]
colnames(table) <- c("Desc", "Content")
return(table)
}
### Get Introduction Text from Wikipedia page that contains birthdate
getIntroduction <- function(article) {
# Read page as html
page <- read_html(article)
# Extracting text from the html will erase all <br> tags,
# This will replace them with line breaks
xml_find_all(page, ".//br") %>%
xml_add_sibling("p", "\n")
xml_find_all(page, ".//br") %>%
xml_remove
# Get first paragraph
introduction <- page %>%
html_nodes("p") %>%
html_text() %>%
.[[1]]
# Return first paragraph
return(introduction)
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment