From 39ae3a9edcf31ee9af0357290a71733835d74c0a Mon Sep 17 00:00:00 2001 From: Lukas Gehrke <lukasgehrke@Lukass-MacBook-Pro.local> Date: Thu, 3 Jan 2019 14:23:10 +0100 Subject: [PATCH] Aendert Documentation. --- r/GetBirthdate.R | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/r/GetBirthdate.R b/r/GetBirthdate.R index dbe0cdd..efe63b1 100644 --- a/r/GetBirthdate.R +++ b/r/GetBirthdate.R @@ -1,56 +1,57 @@ #!/usr/bin/env Rscript -### Extracts birthdate fronm the infobox -### If there is no infobox the Introduction text can be checked # Author: Lukas -## Librarys - library(rvest) library(stringr) library(data.table) -### Try to extract birthdate from infobox -### If there is no infobox, try to extract from introduction text +#' Extract birthdate from infobox +#' Will try to get infobox as table and extract birthdate +#' from 'Born'-entry +#' If there is no infobox, first paragraph of the article +#' will be checked for birthdate +#' +#' @param article Article in HTML-format +#' @return String birthdate as string getBirthdate <- function(article) { - # check + # Check if there is an infobox if(!grepl("vcard", article)) { - # check first paragraph + # Check first paragraph introduction <- getIntroduction(article) if(!introduction == "") { - # get birthdate inside of parentheses + # Get birthdate inside of parentheses birthdate <- str_extract_all(introduction, "\\([^()]+\\)")[[1]] - # remove parentheses + # Remove parentheses birthdate <- substring(birthdate, 2, nchar(birthdate)-1) } else { - # retrun Null if there is no birthdate + # Return 0 if there is no birthdate return(0) } } - # try to get birthdate via infobox + # Try to get birthdate via infobox infoBox <- getInfoBox(article) - # get the born field + # Get the born field birthdate <- infoBox[infoBox$Desc %like% "Born",]$Content - # remove everything except the birthdate - # remove everything in round brackets + # Remove everything except the
birthdate + # Remove everything in round brackets birthdate <- gsub("\\s*\\([^\\)]+\\)", "", birthdate) - # remove everything starting with newline + # Remove everything starting with newline birthdate <- gsub("\\n.*$", "", birthdate) return(birthdate) } -### Uses Davids function to get infobox ### Converts info box to table getInfoBox <- function(article) { # Read page as html page <- read_html(article) # Extracting text from the html will erase all <br> tags, - # this will replace them with line breaks + # This will replace them with line breaks xml_find_all(page, ".//br") %>% xml_add_sibling("p", "\n") @@ -71,7 +72,10 @@ getInfoBox <- function(article) { return(table) } -### Get Introduction Text from Wikipedia page that contains birthdate +#' Get Introduction Text from Wikipedia page that contains birthdate +#' +#' @param article article in HTML-format +#' @return string introduction text from wikipedia article getIntroduction <- function(article) { # Read page as html page <- read_html(article) @@ -90,8 +94,8 @@ getIntroduction <- function(article) { html_nodes("p") %>% html_text() - # there will be some leading paragraphs containing only "\n" - # remove those leading paragraphs + # There will be some leading paragraphs containing only "\n" + # Remove those leading paragraphs remove <- c("\n") cleaned <- setdiff(paragraphs, remove) introduction <- cleaned[1] -- GitLab