#!/bin/bash
#
# Shared helpers for the marvin extraction and databus release scripts.
# The functions below read the global $GROUP (the extraction group), which
# this file does not set itself; it must be provided by the calling script.

HELP="description:
marvin_extraction_run.sh and databus-release.sh take one argument, which is the extraction group
selects download.\$GROUP.properties and extraction.\$GROUP.properties from extractionConfig dir and uses \$GROUP as a path.
usage:
./marvin_extraction_run.sh {test|generic|mappings|wikidata|text}
"

##############
# setup paths
##############

# Directory containing this file; all other paths are derived from it.
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
CONFIGDIR="$ROOT/extractionConfiguration"
DIEFDIR="$ROOT/marvin-extraction/extraction-framework"
# One log directory per calendar day.
LOGDIR="$ROOT/marvin-extraction/logs/$(date +%Y-%m-%d)" && mkdir -p "$LOGDIR"
EXTRACTIONBASEDIR="$ROOT/marvin-extraction/wikidumps" && mkdir -p "$EXTRACTIONBASEDIR"
DATABUSDIR="$ROOT/databus-poms"

##############
# functions
##############

#######################################
# Extract data for the current group.
# Globals:   DIEFDIR, CONFIGDIR, GROUP (read)
# Outputs:   stdout of ../run is redirected to stderr
# Returns:   1 if $DIEFDIR/dump cannot be entered, otherwise the status of
#            the last ../run invocation
#######################################
extractDumps() {
  # ../run is resolved relative to the dump directory, so a failed cd must
  # not be ignored.
  cd "$DIEFDIR/dump" || return 1

  case "$GROUP" in
    generic)
      # exception for generic, 1. spark, 2. as English is big and has to be
      # run separately
      ../run sparkextraction "$CONFIGDIR/extraction.generic.properties" >&2
      ../run sparkextraction "$CONFIGDIR/extraction.generic.en.properties" >&2
      ;;
    text)
      #../run extraction "$CONFIGDIR/extraction.$GROUP.en.properties" >&2
      ../run extraction "$CONFIGDIR/extraction.$GROUP.properties" >&2
      ;;
    *)
      # run for all
      ../run extraction "$CONFIGDIR/extraction.$GROUP.properties" >&2
      ;;
  esac
}

#######################################
# Post-processing for the current group.
# Globals:   DIEFDIR, EXTRACTIONBASEDIR, LOGDIR, GROUP (read)
# Outputs:   a status line on stdout; stdout of ../run goes to stderr
# Returns:   1 if $DIEFDIR/scripts cannot be entered
#######################################
postProcessing() {
  local redirected=() i

  cd "$DIEFDIR/scripts" || return 1
  echo "post-processing of $GROUP"

  case "$GROUP" in
    mappings)
      ../run ResolveTransitiveLinks "$EXTRACTIONBASEDIR" redirects redirects_transitive .ttl.bz2 @downloaded >&2
      ../run MapObjectUris "$EXTRACTIONBASEDIR" redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded >&2
      ../run TypeConsistencyCheck type.consistency.check.properties >&2
      ;;
    wikidata)
      ../run ResolveTransitiveLinks "$EXTRACTIONBASEDIR" redirects transitive-redirects .ttl.bz2 wikidata >&2
      ../run MapObjectUris "$EXTRACTIONBASEDIR" transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata >&2
      ../run TypeConsistencyCheck type.consistency.check.properties >&2
      ;;
    generic)
      ../run ResolveTransitiveLinks "$EXTRACTIONBASEDIR" redirects redirects_transitive .ttl.bz2 @downloaded >&2
      ../run MapObjectUris "$EXTRACTIONBASEDIR" redirects_transitive .ttl.bz2 disambiguations,infobox-properties,page-links,persondata,topical-concepts _redirected .ttl.bz2 @downloaded >&2

      # todo untested line
      # NOTE(review): the pattern "*._redirects.ttl.bz2" does not look like it
      # can match the "..._redirected.ttl.bz2" files named by the suffix
      # above, so this loop is probably a no-op. The pattern is left
      # untouched because correcting it would enable a rename that
      # overwrites the unredirected files - confirm intent first.
      # Collect matches before touching the tree so the renames cannot
      # interfere with the directory walk.
      while IFS= read -r -d '' i; do
        redirected+=("$i")
      done < <(find "$EXTRACTIONBASEDIR" -name "*._redirects.ttl.bz2" -print0)
      for i in "${redirected[@]}"; do
        cp "$i" "$LOGDIR"
        rename -f 's/_redirected//' "$i"
      done
      ;;
    text)
      echo "check whether text has post-processing"
      ;;
    test)
      ../run ResolveTransitiveLinks "$EXTRACTIONBASEDIR" redirects redirects_transitive .ttl.bz2 @downloaded >&2
      ../run MapObjectUris "$EXTRACTIONBASEDIR" redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded >&2
      ../run TypeConsistencyCheckManual mappingbased-objects instance-types ro >&2
      ;;
  esac
}

#######################################
# Compress log files.
# Log files from same day get overwritten, only latest is kept.
# Globals:   LOGDIR (read)
#######################################
archiveLogFiles() {
  local logs=() f

  # Snapshot the file list first: lbzip2 creates new files in the same
  # directory while the loop is running.
  while IFS= read -r -d '' f; do
    logs+=("$f")
  done < <(find "$LOGDIR" -type f -print0)

  for f in "${logs[@]}"; do
    lbzip2 -f "$f"
  done
}

##########################
# Databus Mapping
##########################

#######################################
# Switch case for some language exceptions.
# Arguments: $1 - wiki name such as "enwiki"; every occurrence of "wiki" is
#            removed before the lookup
# Outputs:   the content-variant fragment for that language on stdout
#######################################
mapLangToContVar() {
  local lang="${1//wiki/}"

  case "$lang" in
    "bat_smg")    echo "_lang=batsmg" ;;
    "zh_min_nan") echo "_lang=nan" ;;
    "zh_yue")     echo "_lang=yue" ;;
    "data")       echo "" ;;
    "commons")    echo "_commons" ;;
    *)            echo "_lang=$lang" ;;
  esac
}

#######################################
# Translate an extraction dataset name into its databus name.
# In the result, the part before the first "_" is used by mapAndLink as the
# artifact and the part after it as an additional content variant.
# Arguments: $1 - dataset name as it appears in the dump file name
# Outputs:   the mapped name on stdout; unknown names are echoed unchanged
#######################################
mapNamesToDatabus() {
  case "$1" in
    # generic
    "article-templates-nested") echo "article-templates_nested" ;;
    "citation-data")            echo "citations_data" ;;
    "citation-links")           echo "citations_links" ;;
    "commons-page-links")       echo "commons-sameas-links" ;;
    "page-ids")                 echo "page_ids" ;;
    "page-length")              echo "page_length" ;;
    "page-links")               echo "wikilinks" ;;
    "article-categories")       echo "categories_articles" ;;
    "category-labels")          echo "categories_labels" ;;
    "skos-categories")          echo "categories_skos" ;;
    "revision-ids")             echo "revisions_ids" ;;
    "revision-uris")            echo "revisions_uris" ;;

    # mappings
    "mappingbased-objects-disjoint-domain") echo "mappingbased-objects_disjointDomain" ;;
    "mappingbased-objects-disjoint-range")  echo "mappingbased-objects_disjointRange" ;;

    # wikidata (revision-ids / revision-uris are handled by the generic arms)
    "alias-nmw")                                  echo "alias_nmw" ;;
    "description-nmw")                            echo "description_nmw" ;;
    "labels-nmw")                                 echo "labels_nmw" ;;
    "mappingbased-properties-reified-qualifiers") echo "mappingbased-properties-reified_qualifiers" ;;
    "mappingbased-objects-uncleaned-redirected")  echo "mappingbased-objects" ;;
    "wikidata-duplicate-iri-split")               echo "debug_duplicateirisplit" ;;
    "wikidata-r2r-mapping-errors")                echo "debug_r2rmappingerrors" ;;
    "wikidata-type-like-statements")              echo "debug_typelikestatements" ;;
    "transitive-redirects")                       echo "redirects_transitive" ;;

    # both mappings and wikidata
    "instance-types")            echo "instance-types_specific" ;;
    "instance-types-transitive") echo "instance-types_transitive" ;;

    *) echo "$1" ;;
  esac
}

#######################################
# Creates links in databus dir.
# The file name is expected to look like <lang>-<yyyymmdd>-<dataset>.<ext>,
# e.g. enwiki-20191001-page-links.ttl.bz2.
# Globals:   DATABUSDIR, GROUP (read). The working variables below are left
#            global on purpose: this file is sourced, and callers may read
#            them (cannot be verified from this file alone).
# Arguments: $1 - path of one extracted file
# Outputs:   "< source / > target" report on stdout; a [DEBUG] line on
#            stderr when the artifact directory does not exist
#######################################
mapAndLink() {
  # each individual file
  path=$1

  # split filename
  # how to use ${string##/*}
  # https://www.tldp.org/LDP/abs/html/string-manipulation.html#Substring%20Removal#Substring Removal
  file="${path##*/}"
  version="${file#*-}"
  version="${version%%-*}"
  version="${version:0:4}.${version:4:2}.${version:6:2}"
  lang="${file%%-*}"
  extraction="${file#*-*-}"
  extraction="${extraction%%.*}"
  # generic exception: the target language of interlanguage links becomes a
  # content variant (first occurrence only)
  extraction="${extraction/interlanguage-links-/interlanguage-links_lang=}"
  extensions="${file#*.}"

  # map names and languages
  mapped="$(mapNamesToDatabus "$extraction")"
  contVars="$(mapLangToContVar "$lang")"
  if [[ "$mapped" == *"_"* ]]; then
    contVars="${contVars}_${mapped#*_}"
  fi
  artifact="${mapped%%_*}"
  targetFolder="$DATABUSDIR/dbpedia/$GROUP/$artifact/$version"
  targetFile="$artifact$contVars.$extensions"

  # The version folder is only created below an already existing artifact
  # folder.
  if [ -d "$DATABUSDIR/dbpedia/$GROUP/$artifact" ]; then
    mkdir -p "$targetFolder"
  else
    echo "[DEBUG]\"$artifact\" (artifact not found, might not be in group $GROUP) $path" >&2
  fi

  # TODO proper handling of "_redirected"
  # TODO see above, redirected are moved to logdir and overwrite the unredirected
  # concerns only generic:
  # < enwiki/20191001/enwiki-20191001-disambiguations_redirected.ttl.bz2
  # < enwiki/20191001/enwiki-20191001-infobox-properties_redirected.ttl.bz2
  # < enwiki/20191001/enwiki-20191001-page-links_redirected.ttl.bz2
  # < enwiki/20191001/enwiki-20191001-persondata_redirected.ttl.bz2
  # < enwiki/20191001/enwiki-20191001-topical-concepts_redirected.ttl.bz2

  # copy
  # TODO enable after testing
  #cp -n "$path" "$targetFolder/$targetFile"
  ln -s "$path" "$targetFolder/$targetFile"
  printf '< %s\n> %s\n----------------------\n' "$path" "$targetFolder/$targetFile"
}

#######################################
# Print the GitHub URL of the commit currently checked out in $DIEFDIR.
# Globals:   DIEFDIR (read)
# Outputs:   the commit URL on stdout
# Returns:   1 if $DIEFDIR cannot be entered
#######################################
diefCommitLink() {
  cd "$DIEFDIR" || return 1
  echo "https://github.com/dbpedia/extraction-framework/commit/$(git rev-parse @)"
}