#!/bin/bash

# Help text describing the single extraction-group argument taken by
# marvin_extraction_run.sh and databus-release.sh. The \$ escapes keep
# "$GROUP" literal in the stored text instead of expanding it here.
HELP="description:
marvin_extraction_run.sh and databus-release.sh take one argument, which is the extraction group
selects download.\$GROUP.properties and extraction.\$GROUP.properties from extractionConfig dir and uses \$GROUP as a path.

usage: 
./marvin_extraction_run.sh {test|generic|mappings|wikidata|text}
"

##############
# setup paths
##############

# Absolute directory of this file, resolved independently of the caller's cwd.
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
# Directory holding the extraction.<group>.properties files.
CONFIGDIR="$ROOT/extractionConfiguration"
# Checkout of the extraction framework; its "run" launcher is invoked as ../run.
DIEFDIR="$ROOT/marvin-extraction/extraction-framework"
# One log directory per calendar day; created as soon as this file is loaded.
# Quoted so that a ROOT containing whitespace does not split into several paths.
LOGDIR="$ROOT/marvin-extraction/logs/$(date +%Y-%m-%d)" && mkdir -p "$LOGDIR"
# Base directory passed to the post-processing tools; created on load as well.
EXTRACTIONBASEDIR="$ROOT/marvin-extraction/wikidumps" && mkdir -p "$EXTRACTIONBASEDIR"
# Root of the databus artifact tree that mapAndLink links files into.
DATABUSDIR="$ROOT/databus-poms"

##############
# functions
##############

# extract data
#######################################
# Runs the extraction for the group named in $GROUP.
# Globals:   GROUP, DIEFDIR, CONFIGDIR (read)
# Outputs:   stdout of ../run is redirected to stderr
# Returns:   1 if $DIEFDIR/dump cannot be entered, otherwise the status of
#            the last ../run invocation
# Note:      changes the working directory of the calling shell.
#######################################
extractDumps() {
    cd "$DIEFDIR/dump" || return 1

    if [ "$GROUP" = "generic" ]; then
        # exception for generic, 1. spark, 2. as English is big and has to be run separately
        >&2 ../run sparkextraction "$CONFIGDIR/extraction.generic.properties"
        >&2 ../run sparkextraction "$CONFIGDIR/extraction.generic.en.properties"
    elif [ "$GROUP" = "text" ]; then
        # The original test read ["$GROUP" (no space after "["), which made bash
        # look for a command called "[text" instead of evaluating the condition.
        #>&2 ../run extraction $CONFIGDIR/extraction.$GROUP.en.properties;
        >&2 ../run extraction "$CONFIGDIR/extraction.$GROUP.properties"
    else
        # run for all
        >&2 ../run extraction "$CONFIGDIR/extraction.$GROUP.properties"
    fi
}

# post-processing
#######################################
# Runs the post-processing tools that belong to the group named in $GROUP.
# Groups without a branch below are only announced; nothing is run for them.
# Globals:   GROUP, DIEFDIR, EXTRACTIONBASEDIR, LOGDIR (read)
# Outputs:   a progress line on stdout; stdout of ../run goes to stderr
# Returns:   1 if $DIEFDIR/scripts cannot be entered, otherwise the status of
#            the last command executed
# Note:      changes the working directory of the calling shell.
#######################################
postProcessing() {
    local redirected

    cd "$DIEFDIR/scripts" || return 1
    echo "post-processing of $GROUP"

    case "$GROUP" in
        mappings)
            >&2 ../run ResolveTransitiveLinks "$EXTRACTIONBASEDIR" redirects redirects_transitive .ttl.bz2 @downloaded
            >&2 ../run MapObjectUris "$EXTRACTIONBASEDIR" redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded
            >&2 ../run TypeConsistencyCheck type.consistency.check.properties
            ;;
        wikidata)
            >&2 ../run ResolveTransitiveLinks "$EXTRACTIONBASEDIR" redirects transitive-redirects .ttl.bz2 wikidata
            >&2 ../run MapObjectUris "$EXTRACTIONBASEDIR" transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata
            >&2 ../run TypeConsistencyCheck type.consistency.check.properties
            ;;
        generic)
            >&2 ../run ResolveTransitiveLinks "$EXTRACTIONBASEDIR" redirects redirects_transitive .ttl.bz2 @downloaded
            >&2 ../run MapObjectUris "$EXTRACTIONBASEDIR" redirects_transitive .ttl.bz2 disambiguations,infobox-properties,page-links,persondata,topical-concepts _redirected .ttl.bz2 @downloaded
            # todo untested line
            # NOTE(review): the pattern below ("*._redirects.ttl.bz2") does not
            # match the "_redirected" suffix that MapObjectUris is asked to
            # produce above, so this loop presumably never iterates. The pattern
            # is kept unchanged on purpose: correcting it would enable a rename
            # that has not been tested - confirm the intent first.
            # NUL-delimited reading keeps paths with whitespace intact.
            while IFS= read -r -d '' redirected; do
                cp "$redirected" "$LOGDIR"
                rename -f 's/_redirected//' "$redirected"
            done < <(find "$EXTRACTIONBASEDIR" -name "*._redirects.ttl.bz2" -print0)
            ;;
        text)
            echo "check whether text has post-processing"
            ;;
        test)
            >&2 ../run ResolveTransitiveLinks "$EXTRACTIONBASEDIR" redirects redirects_transitive .ttl.bz2 @downloaded
            >&2 ../run MapObjectUris "$EXTRACTIONBASEDIR" redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded
            >&2 ../run TypeConsistencyCheckManual mappingbased-objects instance-types ro
            ;;
    esac
}

# compress log files
# log files from the same day are overwritten; only the latest is kept
#######################################
# Compresses every regular file below $LOGDIR with "lbzip2 -f", one
# invocation per file.
# Globals:   LOGDIR (read)
# Returns:   status of the last lbzip2 invocation (0 if there were no files)
#######################################
archiveLogFiles() {
    local logfile

    # NUL-delimited reading keeps file names with whitespace in one piece,
    # which the previous word-splitting "for f in $(find ...)" did not.
    while IFS= read -r -d '' logfile; do
        lbzip2 -f "$logfile"
    done < <(find "$LOGDIR" -type f -print0)
}

##########################
# Databus Mapping
##########################

# switch case for some language exceptions
#######################################
# Translates a wiki name such as "enwiki" into the language part of a
# databus file name.
# Arguments: $1 - wiki name; every occurrence of "wiki" is removed first, so
#                 "wikidatawiki" becomes "data" and "commonswiki" "commons"
# Outputs:   "_lang=<code>" on stdout, "_commons" for commons, or an empty
#            line for wikidata
#######################################
mapLangToContVar() {
    local lang="${1//wiki/}"

    case "$lang" in
        "bat_smg")    echo "_lang=batsmg" ;;
        "zh_min_nan") echo "_lang=nan" ;;
        "zh_yue")     echo "_lang=yue" ;;
        "data")       echo "" ;;
        "commons")    echo "_commons" ;;
        *)            echo "_lang=$lang" ;;
    esac
}


#######################################
# Maps an extraction dataset name to its databus name. In mapAndLink the part
# before the first "_" becomes the artifact and the remainder is appended to
# the content variants.
# Arguments: $1 - dataset name as it appears in the dump file name
# Outputs:   the mapped name on stdout; names without a rule are echoed
#            unchanged
#######################################
mapNamesToDatabus() {
    case "$1" in
        # generic
        "article-templates-nested") echo "article-templates_nested" ;;
        "citation-data")            echo "citations_data" ;;
        "citation-links")           echo "citations_links" ;;
        "commons-page-links")       echo "commons-sameas-links" ;;
        "page-ids")                 echo "page_ids" ;;
        "page-length")              echo "page_length" ;;
        "page-links")               echo "wikilinks" ;;
        "article-categories")       echo "categories_articles" ;;
        "category-labels")          echo "categories_labels" ;;
        "skos-categories")          echo "categories_skos" ;;
        # also used by wikidata; listed once because a second, identical arm
        # further down could never be reached
        "revision-ids")             echo "revisions_ids" ;;
        "revision-uris")            echo "revisions_uris" ;;

        # mappings
        "mappingbased-objects-disjoint-domain") echo "mappingbased-objects_disjointDomain" ;;
        "mappingbased-objects-disjoint-range")  echo "mappingbased-objects_disjointRange" ;;

        # wikidata
        "alias-nmw")       echo "alias_nmw" ;;
        "description-nmw") echo "description_nmw" ;;
        "labels-nmw")      echo "labels_nmw" ;;
        "mappingbased-properties-reified-qualifiers") echo "mappingbased-properties-reified_qualifiers" ;;
        "mappingbased-objects-uncleaned-redirected")  echo "mappingbased-objects" ;;
        "wikidata-duplicate-iri-split")  echo "debug_duplicateirisplit" ;;
        "wikidata-r2r-mapping-errors")   echo "debug_r2rmappingerrors" ;;
        "wikidata-type-like-statements") echo "debug_typelikestatements" ;;
        "transitive-redirects")          echo "redirects_transitive" ;;

        # both mappings and wikidata
        "instance-types")            echo "instance-types_specific" ;;
        "instance-types-transitive") echo "instance-types_transitive" ;;

        *) echo "$1" ;;
    esac
}

# creates links in databus dir
#######################################
# Derives the databus location for one extracted dump file and symlinks the
# file there. File names are expected to look like
# <wiki>-<yyyymmdd>-<dataset>.<extensions>, e.g.
# enwiki-20191001-page-links.ttl.bz2.
# Globals:   path (read) - the file to link; set by the caller, no argument
#            GROUP, DATABUSDIR (read)
#            file, version, lang, extraction, extensions, mapped, contVars,
#            artifact, targetFolder, targetFile (written; deliberately left
#            global, as before, in case callers read them afterwards)
# Outputs:   the source/target pair on stdout, a [DEBUG] line on stderr when
#            the artifact directory does not exist
# Returns:   status of the final output command
#######################################
mapAndLink() {
	# each individual file
	# split filename with ${string##pattern} style substring removal, see
	# https://www.tldp.org/LDP/abs/html/string-manipulation.html
	file="${path##*/}"
	version="${file#*-}"
	version="${version%%-*}"
	version="${version:0:4}.${version:4:2}.${version:6:2}"
	lang="${file%%-*}"
	extraction="${file#*-*-}"
	extraction="${extraction%%.*}"
	# generic exception: the target language of interlanguage links becomes a
	# content variant (first occurrence only)
	extraction="${extraction/interlanguage-links-/interlanguage-links_lang=}"

	extensions="${file#*.}"

	# map names and languages
	mapped="$(mapNamesToDatabus "$extraction")"
	contVars="$(mapLangToContVar "$lang")"
	if [[ "$mapped" == *"_"* ]]; then
		contVars="${contVars}_${mapped#*_}"
	fi
	artifact="${mapped%%_*}"
	targetFolder="$DATABUSDIR/dbpedia/$GROUP/$artifact/$version"
	targetFile="$artifact$contVars.$extensions"

	if [ -d "$DATABUSDIR/dbpedia/$GROUP/$artifact" ]; then
		mkdir -p "$targetFolder"
	else
		# NOTE(review): execution continues after this message, so the ln below
		# is still attempted although the target folder was not created.
		echo "[DEBUG]\"$artifact\" (artifact not found, might not be in group $GROUP) $path" >&2
	fi

	# TODO proper handling of "_redirected"
	# TODO see above, redirected are moved to logdir and overwrite the unredirected
	# concerns only generic:
	# < enwiki/20191001/enwiki-20191001-disambiguations_redirected.ttl.bz2
	# < enwiki/20191001/enwiki-20191001-infobox-properties_redirected.ttl.bz2
	# < enwiki/20191001/enwiki-20191001-page-links_redirected.ttl.bz2
	# < enwiki/20191001/enwiki-20191001-persondata_redirected.ttl.bz2
	# < enwiki/20191001/enwiki-20191001-topical-concepts_redirected.ttl.bz2

	# copy
	# TODO enable after testing
	#cp -n "$path" "$targetFolder/$targetFile"
	ln -s "$path" "$targetFolder/$targetFile"
	printf '< %s\n> %s\n----------------------\n' "$path" "$targetFolder/$targetFile"
}

#######################################
# Prints the GitHub URL of the commit currently checked out in $DIEFDIR.
# Globals:   DIEFDIR (read)
# Outputs:   the commit URL on stdout
# Returns:   1 if $DIEFDIR cannot be entered or git cannot resolve the commit;
#            nothing is printed in that case, so no URL for an unrelated
#            repository or with an empty hash is produced
# Note:      changes the working directory of the calling shell.
#######################################
diefCommitLink() {
	local commit

	cd "$DIEFDIR" || return 1
	commit="$(git rev-parse @)" || return 1
	echo "https://github.com/dbpedia/extraction-framework/commit/$commit"
}