Fixt getter für ersten paragraph.

ba8af03b · Lukas Gehrke · 6e1d902d · ba8af03b
Commit ba8af03b authored 6 years ago by Lukas Gehrke
--- a/r/GetBirthdate.R
+++ b/r/GetBirthdate.R
@@ -13,7 +13,7 @@ library(data.table)
 ### Try to extract birthdate from infobox
 ### If there is no infobox, try to extract from introduction text
 getBirthdate <- function(article) {
-  # check if vcard exists
+  # check
  if(!grepl("vcard", article)) {
    # check first paragraph
    introduction <- getIntroduction(article)
@@ -22,8 +22,6 @@ getBirthdate <- function(article) {
      birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]]
      # remove parentheses
      birthdate <- substring(birthdate, 2, nchar(birthdate)-1)
-
-      return(birthdate)
    } else {
      # return 0 if there is no birthdate
      return(0)
@@ -87,11 +85,16 @@ getIntroduction <- function(article) {
  xml_find_all(page, ".//br") %>%
    xml_remove
  
-  # Get first paragraph
-  introduction <- page %>%
+  # Get all paragraphs
+  paragraphs <- page %>%
    html_nodes("p") %>%
-    html_text() %>%
-    .[[1]]
+    html_text()
+  
+  # there will be some leading paragraphs containing only "\n"
+  # remove those leading paragraphs
+  remove <- c("\n")
+  cleaned <- setdiff(paragraphs, remove)
+  introduction <- cleaned[1]
  
  # Return first paragraph
  return(introduction)