Skip to content
Snippets Groups Projects
Commit ba8af03b authored by Lukas Gehrke's avatar Lukas Gehrke
Browse files

Fixt getter für ersten paragraph.

parent 6e1d902d
No related branches found
No related tags found
1 merge request: !21 Resolve "R-Skript für Birthdate erstellen"
...@@ -13,7 +13,7 @@ library(data.table) ...@@ -13,7 +13,7 @@ library(data.table)
### Try to extract birthdate from infobox ### Try to extract birthdate from infobox
### If there is no infobox, try to extract from introduction text ### If there is no infobox, try to extract from introduction text
getBirthdate <- function(article) { getBirthdate <- function(article) {
# check if vcard exists # check
if(!grepl("vcard", article)) { if(!grepl("vcard", article)) {
# check first paragraph # check first paragraph
introduction <- getIntroduction(article) introduction <- getIntroduction(article)
...@@ -22,8 +22,6 @@ getBirthdate <- function(article) { ...@@ -22,8 +22,6 @@ getBirthdate <- function(article) {
birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]] birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]]
# remove parentheses # remove parentheses
birthdate <- substring(birthdate, 2, nchar(birthdate)-1) birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
return(birthdate)
} else { } else {
# return 0 if there is no birthdate # return 0 if there is no birthdate
return(0) return(0)
...@@ -87,11 +85,16 @@ getIntroduction <- function(article) { ...@@ -87,11 +85,16 @@ getIntroduction <- function(article) {
xml_find_all(page, ".//br") %>% xml_find_all(page, ".//br") %>%
xml_remove xml_remove
# Get first paragraph # Get all paragraphs
introduction <- page %>% paragraphs <- page %>%
html_nodes("p") %>% html_nodes("p") %>%
html_text() %>% html_text()
.[[1]]
# there will be some leading paragraphs containing only "\n"
# remove those leading paragraphs
remove <- c("\n")
cleaned <- setdiff(paragraphs, remove)
introduction <- cleaned[1]
# Return first paragraph # Return first paragraph
return(introduction) return(introduction)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment