From ba8af03b0fec489ea6ad589155159fce6c99f8e9 Mon Sep 17 00:00:00 2001
From: Lukas Gehrke <lukasgehrke@Lukass-MacBook-Pro.local>
Date: Wed, 2 Jan 2019 09:04:29 +0100
Subject: [PATCH] =?UTF-8?q?Fixt=20getter=20f=C3=BCr=20ersten=20paragraph.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 r/GetBirthdate.R | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/r/GetBirthdate.R b/r/GetBirthdate.R
index a00060c..dbe0cdd 100644
--- a/r/GetBirthdate.R
+++ b/r/GetBirthdate.R
@@ -13,7 +13,7 @@ library(data.table)
 ### Try to extract birthdate from infobox
 ### If there is no infobox, try to extract from introduction text
 getBirthdate <- function(article) {
-  # check if vcard exists
+  # check whether the article has an infobox (vcard)
   if(!grepl("vcard", article)) {
     # check first paragraph
     introduction <- getIntroduction(article)
@@ -22,8 +22,6 @@ getBirthdate <- function(article) {
       birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]]
       # remove parentheses
       birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
-
-      return(birthdate)
     } else {
       # retrun Null if there is no birthdate
       return(0)
@@ -87,11 +85,16 @@ getIntroduction <- function(article) {
   xml_find_all(page, ".//br") %>%
     xml_remove
   
-  # Get first paragraph
-  introduction <- page %>%
+  # Get all paragraphs
+  paragraphs <- page %>%
     html_nodes("p") %>%
-    html_text() %>%
-    .[[1]]
+    html_text()
+  
+  # some paragraphs (typically the leading ones) contain only "\n";
+  # drop them all via setdiff() so the first remaining one is the introduction
+  remove <- c("\n")
+  cleaned <- setdiff(paragraphs, remove)
+  introduction <- cleaned[1]
   
   # Return first paragraph
   return(introduction)
-- 
GitLab