#!/usr/bin/env Rscript

# Author: Lukas

## libraries

library(rvest)
library(stringr)
library(data.table)

### Extracts the birthplace from the HTML text of a physicist article.
###
### article: the article's HTML as a single string; it is searched with
###          grepl() and then handed to getInfoBox().
###
### Returns 0 when the text contains no infobox marker ("vcard") or when the
### infobox has no "Born" row. Otherwise returns, for every row whose
### description contains "Born", the text after the last line break of that
### row's content.
getBirthplace <- function(article) {

  # No infobox marker in the text: return the sentinel 0
  if (!grepl("vcard", article, fixed = TRUE)) {
    return(0)
  }

  # Parse the infobox into a table with Desc / Content columns
  infoBox <- getInfoBox(article)

  # Content of the rows whose description contains "Born"
  isBornRow <- grepl("Born", infoBox$Desc)
  birthplace <- infoBox$Content[isBornRow]

  # An infobox without a "Born" row would otherwise produce a zero-length
  # vector; answer with the same sentinel as the "no infobox" case instead
  if (length(birthplace) == 0) {
    return(0)
  }

  # Drop everything up to and including the last line break;
  # what remains is the birthplace
  gsub(".*\\\n", "", birthplace)
}

### Parses an article and returns its infobox as a table.
### (Per the original note, the infobox extraction is based on David's
### function.)
###
### article: passed straight to read_html().
###
### Returns the first "table.vcard" on the page, converted with html_table(),
### with its first two columns named "Desc" and "Content". Any further
### columns keep the names html_table() gave them.
###
### Stops with an error when the page has no "table.vcard" or when that table
### has fewer than two columns, so callers should check for an infobox
### beforehand.
getInfoBox <- function(article) {
  # Read page as html
  page <- read_html(article)

  # Extracting text from the html would erase all <br> tags, so a <p> holding
  # a line break is inserted after each one before the <br> itself is removed.
  # The xml_* helpers are xml2 functions, so they are namespace-qualified
  # instead of relying on rvest attaching xml2.
  lineBreaks <- xml2::xml_find_all(page, ".//br")
  xml2::xml_add_sibling(lineBreaks, "p", "\n")
  xml2::xml_remove(lineBreaks)

  # Locate the info box and fail with a clear message if there isn't any
  infoBoxNodes <- html_nodes(page, "table.vcard")
  if (length(infoBoxNodes) == 0) {
    stop("No infobox (table.vcard) found in article", call. = FALSE)
  }

  # Only the first info box is used
  infoBox <- html_table(infoBoxNodes[[1]], fill = TRUE)

  # Name only the first two columns: assigning exactly two names to a table
  # with a different number of columns would mangle the names or fail
  if (ncol(infoBox) < 2) {
    stop("Infobox has fewer than two columns", call. = FALSE)
  }
  colnames(infoBox)[1:2] <- c("Desc", "Content")

  infoBox
}