#!/usr/bin/env Rscript
# Author: Lukas

## Libraries
library(rvest)
# rvest >= 1.0.0 only imports xml2 instead of attaching it, so the xml_*()
# helpers used below need xml2 attached explicitly.
library(xml2)
library(stringr)
library(data.table)

### Extracts the birthplace from the HTML text of a physicist article.
###
### article: character string containing the article HTML.
###
### Returns:
###   - 0 if the article has no infobox (no <table> with class "vcard"),
###   - otherwise the part of the infobox "Born" entry that follows its last
###     line break. This is an empty character vector when the infobox has no
###     row matching "Born", and has one element per matching row otherwise.
getBirthplace <- function(article) {
  # Cheap pre-check: without the text "vcard" there cannot be an infobox,
  # so the HTML does not need to be parsed at all.
  if (!grepl("vcard", article)) {
    return(0)
  }

  infoBox <- extractInfoBox(read_html(article))

  # "vcard" may occur in the page without any table.vcard (for example on a
  # <div>); treat that exactly like a missing infobox instead of crashing.
  if (is.null(infoBox)) {
    return(0)
  }

  # Content of the row(s) whose description matches "Born"
  bornContent <- infoBox[infoBox$Desc %like% "Born", ]$Content

  # The pattern is ".*" followed by an escaped newline character: everything
  # up to and including the last line break is dropped, the rest is the
  # birthplace.
  gsub(".*\\\n", "", bornContent)
}

### Uses Davids function to get the infobox and converts it to a table.
###
### article: character string containing the article HTML.
###
### Returns the first table with class "vcard" as a data frame whose first
### two columns are named "Desc" and "Content".
### Throws an error if the article has no such table, so that should be
### checked beforehand.
getInfoBox <- function(article) {
  infoBox <- extractInfoBox(read_html(article))
  if (is.null(infoBox)) {
    stop("No table with class 'vcard' found in article", call. = FALSE)
  }
  infoBox
}

### Internal helper shared by getBirthplace() and getInfoBox().
###
### page: parsed HTML document as returned by read_html(); it is modified in
###       place (all <br> tags are replaced).
###
### Returns the first table.vcard converted to a data frame with the first
### two columns named "Desc" and "Content", or NULL if there is no such table.
extractInfoBox <- function(page) {
  # Extracting text from the html would erase all <br> tags, so each one is
  # replaced by a <p> that contains a line break.
  lineBreaks <- xml_find_all(page, ".//br")
  xml_add_sibling(lineBreaks, "p", "\n")
  xml_remove(lineBreaks)

  infoBoxNodes <- html_nodes(page, "table.vcard")
  if (length(infoBoxNodes) == 0) {
    return(NULL)
  }

  # Only the first infobox is used, so only that one is converted.
  # fill = TRUE is kept for older rvest versions; newer ones always fill.
  infoBox <- html_table(infoBoxNodes[[1]], fill = TRUE)

  if (ncol(infoBox) < 2) {
    stop("Infobox table has fewer than two columns", call. = FALSE)
  }
  # Rename only the first two columns so that wider infoboxes are accepted.
  colnames(infoBox)[1:2] <- c("Desc", "Content")

  infoBox
}