Skip to content
Snippets Groups Projects
Commit 79e45a76 authored by Jonas Wolff's avatar Jonas Wolff
Browse files

Merge branch '22-r-skript-fur-birthplace-erstellen' into 'master'

Resolve "R-Skript für Birthplace erstellen"

Closes #22

See merge request !22
parents 09f9606d d296ae8e
No related branches found
No related tags found
1 merge request!22Resolve "R-Skript für Birthplace erstellen"
#!/usr/bin/env Rscript
# Author: Lukas
## Libraries
library(rvest)
library(stringr)
library(data.table)
#' Extract a physicist's birthplace from an article's infobox
#'
#' Looks up the "Born" row of the article's infobox (see `getInfoBox()`) and
#' keeps only the text after the last line break of that cell, which is taken
#' to be the birthplace. The running text is never searched: if the article
#' has no infobox, or the infobox has no usable "Born" entry, 0 is returned.
#'
#' @param article Article in HTML format, as a single string.
#' @return String with the birthplace of the physicist, or 0 if none was found.
getBirthplace <- function(article) {
  # Cheap pre-check: without the "vcard" marker there is no infobox to parse
  if (!grepl("vcard", article, fixed = TRUE)) {
    return(0)
  }

  infoBox <- getInfoBox(article)

  # Pick the content of the "Born" row(s); grepl() treats NA descriptions as
  # non-matches, and `[[` avoids the partial name matching of `$`
  isBorn <- grepl("Born", infoBox[["Desc"]], fixed = TRUE)
  born <- infoBox[["Content"]][isBorn]
  born <- born[!is.na(born)]

  # An infobox without a "Born" entry must yield 0, not an empty vector
  if (length(born) == 0L) {
    return(0)
  }

  # Drop everything up to and including the last line break; what remains
  # is the birthplace. Only the first matching row is used so that a single
  # string is returned.
  birthplace <- trimws(gsub(".*\n", "", born[1L]))
  if (!nzchar(birthplace)) {
    return(0)
  }
  birthplace
}
#' Convert an article's infobox into a two-column table
#'
#' Parses the HTML, turns every `<br>` into an explicit line break (they
#' would otherwise vanish when the cell text is extracted) and reads the
#' first `table.vcard` element of the page.
#'
#' @param article Article in HTML format.
#' @return The infobox as returned by `html_table()`, with its first two
#'   columns named `Desc` and `Content`. Stops with an error if the page has
#'   no `table.vcard` or the table has fewer than two columns, so callers
#'   should check for an infobox beforehand.
getInfoBox <- function(article) {
  # Read page as html
  page <- read_html(article)

  # Replace each <br> with a node holding a line break, then drop the <br>.
  # xml2 is called through its namespace because rvest >= 1.0.0 imports xml2
  # instead of attaching it, so the bare function names may not be visible.
  lineBreaks <- xml2::xml_find_all(page, ".//br")
  xml2::xml_add_sibling(lineBreaks, "p", "\n")
  xml2::xml_remove(lineBreaks)

  # Locate the info box and fail with a readable message if there is none
  infoBoxes <- html_nodes(page, "table.vcard")
  if (length(infoBoxes) == 0L) {
    stop("No infobox (table.vcard) found in article", call. = FALSE)
  }

  # Only the first info box is used, so only that one is converted
  infoBox <- html_table(infoBoxes[[1L]], fill = TRUE)
  if (ncol(infoBox) < 2L) {
    stop("Infobox does not have the expected two columns", call. = FALSE)
  }

  # Name just the first two columns; any extra columns keep their names
  names(infoBox)[1:2] <- c("Desc", "Content")
  infoBox
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment