From dfd51e056eb875960567bc8a6a678123b61ec324 Mon Sep 17 00:00:00 2001
From: Lukas Gehrke <lukasgehrke@Lukass-MacBook-Pro.local>
Date: Wed, 2 Jan 2019 09:09:15 +0100
Subject: [PATCH] Fuegt birthplace R-Skript hinzu.

---
 r/GetBirthplace.R | 59 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 r/GetBirthplace.R

diff --git a/r/GetBirthplace.R b/r/GetBirthplace.R
new file mode 100644
index 0000000..c0dd8b3
--- /dev/null
+++ b/r/GetBirthplace.R
@@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript
+
+# Author: Lukas
+
+## librarys
+
+library(rvest)
+library(stringr)
+library(data.table)
+
+### This script extracts Birthplace from physicist texts
+getBirthplace <- function(article) {
+  
+  # If there is no infobox we return 0
+  if(!grepl("vcard", article)) {
+      return(0)
+  }
+  
+  # Use infobox to get Birthplace
+  infoBox <- getInfoBox(article)
+  
+  # get Born field
+  birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content
+  
+  # remove everything before the round brackets
+  # rest is birthplace
+  birthplace <- gsub(".*\\\n", "", birthplace)
+  
+  # return birthplace
+  return(birthplace)
+}
+
+### Uses Davids function to get infobox
+### Converts info box to table
+getInfoBox <- function(article) {
+  # Read page as html
+  page <- read_html(article)
+  
+  # Extracting text from the html will erase all <br> tags,
+  # this will replace them with line breaks
+  
+  xml_find_all(page, ".//br") %>%
+    xml_add_sibling("p", "\n")
+  
+  xml_find_all(page, ".//br") %>%
+    xml_remove()
+  
+  # Get the info box
+  # Will throw an error if there isnt any, so that should be checked beforehand
+  
+  table <- page %>%
+    html_nodes("table.vcard") %>%
+    html_table(fill = TRUE) %>%
+    .[[1]]
+  
+  colnames(table) <- c("Desc", "Content")
+  
+  return(table)
+}
-- 
GitLab