Skip to content
Snippets Groups Projects
Commit ba8af03b authored by Lukas Gehrke's avatar Lukas Gehrke
Browse files

Fixt getter für ersten paragraph.

parent 6e1d902d
No related branches found
No related tags found
1 merge request: !21 Resolve "R-Skript für Birthdate erstellen"
......@@ -13,7 +13,7 @@ library(data.table)
### Try to extract birthdate from infobox
### If there is no infobox, try to extract from introduction text
getBirthdate <- function(article) {
# check if vcard exists
# check
if(!grepl("vcard", article)) {
# check first paragraph
introduction <- getIntroduction(article)
......@@ -22,8 +22,6 @@ getBirthdate <- function(article) {
birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]]
# remove parentheses
birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
return(birthdate)
} else {
# return 0 if there is no birthdate
return(0)
......@@ -87,11 +85,16 @@ getIntroduction <- function(article) {
xml_find_all(page, ".//br") %>%
xml_remove
# Get first paragraph
introduction <- page %>%
# Get all paragraphs
paragraphs <- page %>%
html_nodes("p") %>%
html_text() %>%
.[[1]]
html_text()
# there will be some leading paragraphs containing only "\n"
# remove those leading paragraphs
remove <- c("\n")
cleaned <- setdiff(paragraphs, remove)
introduction <- cleaned[1]
# Return first paragraph
return(introduction)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment