Newer
Older
# Usage text. The escaped \$GROUP keeps the literal text "$GROUP" in the message.
HELP="description:
marvin_extraction_run.sh and databus-release.sh take one argument, which is the extraction group
selects download.\$GROUP.properties and extraction.\$GROUP.properties from extractionConfig dir and uses \$GROUP as a path.
usage:
./marvin_extraction_run.sh {test|generic|mappings|wikidata|text}
"
##############
# setup paths
##############
# ROOT is the absolute directory containing this script; every other path is
# derived from it, so the script works independently of the caller's cwd.
ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
CONFIGDIR="$ROOT/extractionConfiguration"
DIEFDIR="$ROOT/marvin-extraction/extraction-framework"
# One log directory per calendar day. The path is quoted for mkdir so that a
# ROOT containing whitespace does not get split into several directories.
LOGDIR="$ROOT/marvin-extraction/logs/$(date +%Y-%m-%d)" && mkdir -p -- "$LOGDIR"
EXTRACTIONBASEDIR="$ROOT/marvin-extraction/wikidumps" && mkdir -p -- "$EXTRACTIONBASEDIR"
##############
# functions
##############
# NOTE(review): this section is only partially visible in this view. The `if`
# that the `fi` below closes, the `if` that opens the `elif` chain further down,
# and the `fi` that ends that chain are not shown -- confirm against the full file.
# Every `../run` call is prefixed with `>&2`, i.e. its stdout is sent to stderr.
# exception for generic, 1. spark, 2. as English is big and has to be run separately
>&2 ../run sparkextraction $CONFIGDIR/extraction.generic.properties;
>&2 ../run sparkextraction $CONFIGDIR/extraction.generic.en.properties;
#>&2 ../run extraction $CONFIGDIR/extraction.$GROUP.en.properties;
>&2 ../run extraction $CONFIGDIR/extraction.$GROUP.properties;
# run for all
>&2 ../run extraction $CONFIGDIR/extraction.$GROUP.properties;
fi
# Post-processing runs from $DIEFDIR/scripts, so `../run` resolves to $DIEFDIR/run.
cd $DIEFDIR/scripts;
echo "post-processing of $GROUP"
# NOTE(review): the three commands below precede the first visible `elif`;
# the condition guarding them is not visible here.
>&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects redirects_transitive .ttl.bz2 @downloaded;
>&2 ../run MapObjectUris $EXTRACTIONBASEDIR redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded;
>&2 ../run TypeConsistencyCheck type.consistency.check.properties;
elif [ "$GROUP" = "wikidata" ]
then
# wikidata uses hyphenated names (transitive-redirects, -redirected) where the
# other groups use underscores, and passes "wikidata" instead of @downloaded.
>&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects transitive-redirects .ttl.bz2 wikidata
>&2 ../run MapObjectUris $EXTRACTIONBASEDIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata
>&2 ../run TypeConsistencyCheck type.consistency.check.properties;
elif [ "$GROUP" = "generic" ]
then
>&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects redirects_transitive .ttl.bz2 @downloaded;
>&2 ../run MapObjectUris $EXTRACTIONBASEDIR redirects_transitive .ttl.bz2 disambiguations,infobox-properties,page-links,persondata,topical-concepts _redirected .ttl.bz2 @downloaded;
# todo untested line
# For each file found: copy it to $LOGDIR, then rename it by stripping "_redirected".
# NOTE(review): the find pattern is "*._redirects.ttl.bz2", while the suffix given
# to MapObjectUris above and removed by the rename expression is "_redirected";
# as written the pattern looks like it cannot match those files -- confirm intent.
# NOTE(review): `for i in $(find ...)` word-splits paths containing whitespace.
for i in $(find $EXTRACTIONBASEDIR -name "*._redirects.ttl.bz2") ; do cp $i $LOGDIR ; rename -f 's/_redirected//' $i ; done
elif [ "$GROUP" = "test" ]
then
>&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects redirects_transitive .ttl.bz2 @downloaded;
>&2 ../run MapObjectUris $EXTRACTIONBASEDIR redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded;
>&2 ../run TypeConsistencyCheckManual mappingbased-objects instance-types ro;
# NOTE(review): no `fi` closing the chain is visible before the next statement.
# Compress every regular file under $LOGDIR with lbzip2 (-f is passed to each call).
# log files from same day get overwritten, only latest is kept
for f in $(find $LOGDIR -type f ); do lbzip2 -f $f; done;
##########################
# Databus Mapping
##########################
#######################################
# Maps a wiki name (e.g. "enwiki") to the content-variant string used in
# Databus file names.
# Arguments: $1 - wiki name; every occurrence of "wiki" is removed from it
# Outputs:   "_lang=<code>" on stdout, with bat_smg, zh_min_nan and zh_yue
#            rewritten to batsmg, nan and yue; "commons" yields "_commons"
#######################################
mapLangToContVar() {
  # local: a direct (non-subshell) call must not overwrite the caller's $lang.
  local lang="${1//wiki/}"
  case "$lang" in
    bat_smg) echo "_lang=batsmg" ;;
    zh_min_nan) echo "_lang=nan" ;;
    zh_yue) echo "_lang=yue" ;;
    commons) echo "_commons" ;;
    *) echo "_lang=$lang" ;;
  esac
}
# Translates an extraction dataset name ($1) into the name echoed for the Databus.
# mapAndLink splits the echoed value at the first "_": the part before it becomes
# the artifact name, the part after it is appended to the content-variant string.
# NOTE(review): the end of this function (any default `*)` arm, `esac` and the
# closing brace) is not visible in this view -- confirm against the full file.
mapNamesToDatabus() {
case "$1" in
# generic
"article-templates-nested") echo "article-templates_nested";;
"citation-data") echo "citations_data";;
"citation-links") echo "citations_links";;
"commons-page-links") echo "commons-sameas-links";;
"page-ids") echo "page_ids";;
"page-length") echo "page_length";;
"page-links") echo "wikilinks";;
"article-categories") echo "categories_articles";;
"category-labels") echo "categories_labels";;
"skos-categories") echo "categories_skos";;
"revision-ids") echo "revisions_ids";;
"revision-uris") echo "revisions_uris";;
# mappings
"mappingbased-objects-disjoint-domain") echo "mappingbased-objects_disjointDomain";;
"mappingbased-objects-disjoint-range") echo "mappingbased-objects_disjointRange";;
# wikidata
"alias-nmw") echo "alias_nmw";;
"description-nmw") echo "description_nmw";;
"labels-nmw") echo "labels_nmw";;
"mappingbased-properties-reified-qualifiers") echo "mappingbased-properties-reified_qualifiers";;
"mappingbased-objects-uncleaned-redirected") echo "mappingbased-objects";;
# NOTE: the next two arms repeat the "revision-ids"/"revision-uris" patterns from
# the generic section; `case` runs the first matching arm, so these are never
# reached (they echo the same values, so the result is unaffected).
"revision-ids") echo "revisions_ids";;
"revision-uris") echo "revisions_uris";;
"wikidata-duplicate-iri-split") echo "debug_duplicateirisplit";;
"wikidata-r2r-mapping-errors") echo "debug_r2rmappingerrors";;
"wikidata-type-like-statements") echo "debug_typelikestatements";;
"transitive-redirects") echo "redirects_transitive";;
# both mappings and wikidata
"instance-types") echo "instance-types_specific";;
"instance-types-transitive") echo "instance-types_transitive";;
# creates links in databus dir
#######################################
# Symlinks one extracted dump file into the Databus directory layout
#   $DATABUSDIR/dbpedia/$GROUP/<artifact>/<yyyy.mm.dd>/<artifact><contVars>.<ext>
# Globals read:
#   path  - full path of the extracted file
#   lang  - wiki name handed to mapLangToContVar
#   DATABUSDIR, GROUP, DIEFDIR
#   NOTE(review): neither path nor lang is assigned in the visible body, so
#   they are presumably set by the caller -- confirm against the full file.
# Globals written (none are declared local, callers may read them):
#   file, version, extraction, extensions, mapped, contVars, artifact,
#   targetFolder, targetFile
# Outputs:
#   stdout - "< source" / "> target" pair and the extraction-framework commit URL
#   stderr - a [DEBUG] line when the artifact directory does not exist
# Side effects: creates the version folder and the symlink; cwd becomes $DIEFDIR.
#######################################
mapAndLink() {
  # each individual file
  # split filename
  # how to use ${string##/*}
  # https://www.tldp.org/LDP/abs/html/string-manipulation.html#Substring%20Removal#Substring Removal
  file="${path##*/}"

  # Second dash-separated field is the dump date (yyyymmdd) -> yyyy.mm.dd
  version="${file#*-}"
  version="${version%%-*}"
  version="${version:0:4}.${version:4:2}.${version:6:2}"

  # Everything after the second dash, up to the first dot, names the extraction.
  extraction="${file#*-*-}"
  extraction="${extraction%%.*}"
  # generic exception: the language of interlanguage links becomes a content variant
  extraction="${extraction/interlanguage-links-/interlanguage-links_lang=}"
  extensions="${file#*.}"

  # map names and languages
  mapped="$(mapNamesToDatabus "$extraction")"
  contVars="$(mapLangToContVar "$lang")"
  # Text after the first "_" in the mapped name is an additional content variant.
  if [[ "$mapped" == *"_"* ]]; then
    contVars="${contVars}_${mapped#*_}"
  fi
  artifact="${mapped%%_*}"

  targetFolder="$DATABUSDIR/dbpedia/$GROUP/$artifact/$version"
  targetFile="$artifact$contVars.$extensions"

  # Only artifacts that already have a directory in this group get a version folder.
  # NOTE(review): when the directory is missing the function still falls through
  # to `ln -s` below, which then targets a folder that was not created -- confirm
  # whether an early return was intended.
  if [ -d "$DATABUSDIR/dbpedia/$GROUP/$artifact" ]; then
    mkdir -p -- "$targetFolder"
  else
    echo "[DEBUG]\"$artifact\" (artifact not found, might not be in group $GROUP) $path" >&2;
  fi

  # TODO proper handling of "_redirected"
  # TODO see above, redirected are moved to logdir and overwrite the unredirected
  # < enwiki/20191001/enwiki-20191001-disambiguations_redirected.ttl.bz2
  # < enwiki/20191001/enwiki-20191001-infobox-properties_redirected.ttl.bz2
  # < enwiki/20191001/enwiki-20191001-page-links_redirected.ttl.bz2
  # < enwiki/20191001/enwiki-20191001-persondata_redirected.ttl.bz2
  # < enwiki/20191001/enwiki-20191001-topical-concepts_redirected.ttl.bz2

  # copy
  # TODO enable after testing
  #cp -n "$path" "$targetFolder/$targetFile"
  ln -s "$path" "$targetFolder/$targetFile"
  echo -e "< $path\n> $targetFolder/$targetFile\n----------------------"

  # Report the framework commit; quoted so a DIEFDIR with spaces still works.
  cd "$DIEFDIR"
  echo "https://github.com/dbpedia/extraction-framework/commit/$(git rev-parse @)"
}