From ba8af03b0fec489ea6ad589155159fce6c99f8e9 Mon Sep 17 00:00:00 2001 From: Lukas Gehrke <lukasgehrke@Lukass-MacBook-Pro.local> Date: Wed, 2 Jan 2019 09:04:29 +0100 Subject: [PATCH] =?UTF-8?q?Fixt=20getter=20f=C3=BCr=20ersten=20paragraph.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- r/GetBirthdate.R | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/r/GetBirthdate.R b/r/GetBirthdate.R index a00060c..dbe0cdd 100644 --- a/r/GetBirthdate.R +++ b/r/GetBirthdate.R @@ -13,7 +13,7 @@ library(data.table) ### Try to extract birthdate from infobox ### If there is no infobox, try to extract from introduction text getBirthdate <- function(article) { - # check if vcard exists + # check if(!grepl("vcard", article)) { # check first paragraph introduction <- getIntroduction(article) @@ -22,8 +22,6 @@ getBirthdate <- function(article) { birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]] # remove parentheses birthdate <- substring(birthdate, 2, nchar(birthdate)-1) - - return(birthdate) } else { # retrun Null if there is no birthdate return(0) @@ -87,11 +85,16 @@ getIntroduction <- function(article) { xml_find_all(page, ".//br") %>% xml_remove - # Get first paragraph - introduction <- page %>% + # Get all paragraphs + paragraphs <- page %>% html_nodes("p") %>% - html_text() %>% - .[[1]] + html_text() + + # there will be some leading paragraphs containing only "\n" + # remove those leading paragraphs + remove <- c("\n") + cleaned <- setdiff(paragraphs, remove) + introduction <- cleaned[1] # Return first paragraph return(introduction) -- GitLab