#!/usr/bin/env Rscript

# Author: Lukas

## Libraries

library(rvest)
library(stringr)
library(data.table)

#' Extract the birthplace of a physicist from an article
#'
#' Looks for an infobox (an HTML table with class "vcard") in the article and
#' reads its "Born" field. Everything up to and including the last line break
#' of that field is removed; the remainder is taken to be the birthplace.
#' When no birthplace can be read from an infobox, 0 is returned, because the
#' birthplace is hard to extract from free text.
#'
#' @param article Article in HTML-format
#' @return Character vector with the birthplace of the physicist, or the
#'   number 0 when the article contains no "vcard" marker or the infobox has
#'   no "Born" field. An error is raised when "vcard" occurs in the article
#'   but no `table.vcard` element exists (see `getInfoBox`).
getBirthplace <- function(article) {
  # Cheap textual pre-check: without "vcard" there cannot be an infobox.
  if (!grepl("vcard", article)) {
    return(0)
  }

  # Use the infobox to get the birthplace.
  infoBox <- getInfoBox(article)

  # Content of every row whose description mentions "Born".
  birthplace <- infoBox$Content[grepl("Born", infoBox$Desc)]

  # An infobox without a "Born" field gets the same fallback as a missing
  # infobox, instead of leaking a zero-length vector to the caller.
  if (length(birthplace) == 0L) {
    return(0)
  }

  # Remove everything in front of the last "\n"; the rest is the birthplace.
  gsub(".*\\\n", "", birthplace)
}

#' Convert the infobox of an article into a table
#'
#' Parses the article, turns every `<br>` into a line break that survives
#' text extraction, and converts the first `table.vcard` element into a table.
#'
#' @param article Article in HTML-format
#' @return The first `table.vcard` of the article as returned by
#'   `html_table()`, with its two columns renamed to `Desc` and `Content`.
#'   Raises an error when the article has no `table.vcard` element, so callers
#'   should check for an infobox beforehand.
getInfoBox <- function(article) {
  # Read page as html.
  page <- read_html(article)

  # Extracting text from the html erases all <br> tags, so each one is
  # replaced by a <p> node holding a line break. The xml2 functions are
  # namespace-qualified because attaching rvest (>= 1.0.0) no longer attaches
  # xml2. The nodes are modified in place inside `page`.
  lineBreaks <- xml2::xml_find_all(page, ".//br")
  xml2::xml_add_sibling(lineBreaks, "p", "\n")
  xml2::xml_remove(lineBreaks)

  # Convert every infobox table; only the first one is used.
  infoBoxes <- html_table(html_nodes(page, "table.vcard"), fill = TRUE)
  if (length(infoBoxes) == 0L) {
    stop("No infobox (table.vcard) found in article", call. = FALSE)
  }

  infoBox <- infoBoxes[[1]]
  colnames(infoBox) <- c("Desc", "Content")

  infoBox
}