From dfd51e056eb875960567bc8a6a678123b61ec324 Mon Sep 17 00:00:00 2001 From: Lukas Gehrke <lukasgehrke@Lukass-MacBook-Pro.local> Date: Wed, 2 Jan 2019 09:09:15 +0100 Subject: [PATCH] Fuegt birthplace R-Skript hinzu. --- r/GetBirthplace.R | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 r/GetBirthplace.R diff --git a/r/GetBirthplace.R b/r/GetBirthplace.R new file mode 100644 index 0000000..c0dd8b3 --- /dev/null +++ b/r/GetBirthplace.R @@ -0,0 +1,59 @@ +#!/usr/bin/env Rscript + +# Author: Lukas + +## librarys + +library(rvest) +library(stringr) +library(data.table) + +### This script extracts Birthplace from physicist texts +getBirthplace <- function(article) { + + # If there is no infobox we return 0 + if(!grepl("vcard", article)) { + return(0) + } + + # Use infobox to get Birthplace + infoBox <- getInfoBox(article) + + # get Born field + birthplace <- infoBox[infoBox$Desc %like% "Born",]$Content + + # remove everything before the round brackets + # rest is birthplace + birthplace <- gsub(".*\\\n", "", birthplace) + + # return birthplace + return(birthplace) +} + +### Uses Davids function to get infobox +### Converts info box to table +getInfoBox <- function(article) { + # Read page as html + page <- read_html(article) + + # Extracting text from the html will erase all <br> tags, + # this will replace them with line breaks + + xml_find_all(page, ".//br") %>% + xml_add_sibling("p", "\n") + + xml_find_all(page, ".//br") %>% + xml_remove() + + # Get the info box + # Will throw an error if there isnt any, so that should be checked beforehand + + table <- page %>% + html_nodes("table.vcard") %>% + html_table(fill = TRUE) %>% + .[[1]] + + colnames(table) <- c("Desc", "Content") + + return(table) +} -- GitLab