diff --git a/r/GetBirthdate.R b/r/GetBirthdate.R index a00060c918e11e1e5dd3c67ab554f3d8878c0726..dbe0cdd89419f8db43e2f169b14f0e00026fe2b8 100644 --- a/r/GetBirthdate.R +++ b/r/GetBirthdate.R @@ -13,7 +13,7 @@ library(data.table) ### Try to extract birthdate from infobox ### If there is no infobox, try to extract from introduction text getBirthdate <- function(article) { - # check if vcard exists + # check if(!grepl("vcard", article)) { # check first paragraph introduction <- getIntroduction(article) @@ -22,8 +22,6 @@ getBirthdate <- function(article) { birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]] # remove parentheses birthdate <- substring(birthdate, 2, nchar(birthdate)-1) - - return(birthdate) } else { # retrun Null if there is no birthdate return(0) @@ -87,11 +85,16 @@ getIntroduction <- function(article) { xml_find_all(page, ".//br") %>% xml_remove - # Get first paragraph - introduction <- page %>% + # Get all paragraphs + paragraphs <- page %>% html_nodes("p") %>% - html_text() %>% - .[[1]] + html_text() + + # there will be some leading paragraphs containing only "\n" + # remove those leading paragraphs + remove <- c("\n") + cleaned <- setdiff(paragraphs, remove) + introduction <- cleaned[1] # Return first paragraph return(introduction)