-
Sebastian Hellmann authoredSebastian Hellmann authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
prepareGenericArtifacts.sh 3.06 KiB
#!/bin/bash
# Stop at the first command that exits with a non-zero status.
set -o errexit

# [CONFIG]

# Base directory that is searched for extracted dumps (*.ttl.bz2).
BASEDIR='/data/extraction/wikidumps/'

# databus-maven-plugin project directory containing the release pom;
# files are copied into <artifact>/<version>/ directories below it.
DATABUSMVNPOMDIR='/data/extraction/databus-maven-plugin/dbpedia/generic'

# Explicit databus version to process, in the dotted form derived from the
# file names (yyyy.mm.dd). Leave empty to process all versions.
DUMPDATE=''

# If true, print "source -> target" lines (dummy output) instead of copying.
TRYRUN='false'
# [DATASET-MAPPING]
#######################################
# Map the wiki name of a dump file to the language content variant that is
# appended to the target file name.
# Arguments: $1 - wiki name as found in the dump file name (e.g. "enwiki")
# Outputs:   "_lang=<code>" on stdout; an empty line for Wikidata
#######################################
mapLang() {
  local lang

  # Wikidata must be recognised BEFORE "wiki" is stripped: once stripped the
  # name is reduced to "data" and can never match a "wikidata" pattern.
  case "$1" in
    "wikidata" | "wikidatawiki")
      echo ""
      return 0
      ;;
  esac

  # Drop every occurrence of "wiki" (same result as sed 's|wiki||g').
  lang="${1//wiki/}"

  case "$lang" in
    "bat_smg") echo "_lang=batsmg" ;;
    "zh_min_nan") echo "_lang=nan" ;;
    "zh_yue") echo "_lang=yue" ;;
    *) echo "_lang=$lang" ;;
  esac
}
#######################################
# Translate an extraction name taken from a dump file name into its mapped
# name. Names without an explicit mapping are passed through unchanged.
# Arguments: $1 - extraction name (e.g. "page-ids")
# Outputs:   the mapped name on stdout (e.g. "page_ids")
#######################################
mapExtraction() {
  local target

  case "$1" in
    "article-templates-nested") target="article-templates_nested" ;;
    "citation-data")            target="citations_data" ;;
    "citation-links")           target="citations_links" ;;
    "commons-page-links")       target="commons-sameas-links" ;;
    "page-ids")                 target="page_ids" ;;
    "page-length")              target="page_length" ;;
    "page-links")               target="wikilinks" ;;
    "article-categories")       target="categories_articles" ;;
    "category-labels")          target="categories_labels" ;;
    "skos-categories")          target="categories_skos" ;;
    "revision-ids")             target="revisions_ids" ;;
    "revision-uris")            target="revisions_uris" ;;
    *)                          target="$1" ;;
  esac

  echo "$target"
}
# [FUNCTIONS]
#######################################
# Copy every extracted *.ttl.bz2 dump found below BASEDIR into the matching
# <artifact>/<version>/ directory of the databus-maven-plugin project.
#
# File names are parsed as <wiki>-<yyyymmdd>-<extraction>.<extension>,
# e.g. enwiki-20180101-page-ids.ttl.bz2. Files whose artifact directory does
# not already exist below DATABUSMVNPOMDIR are reported and skipped.
#
# Globals:   BASEDIR, DATABUSMVNPOMDIR, DUMPDATE, TRYRUN (all read)
# Arguments: none
# Outputs:   cp -v output, or "source -> target" lines in try-run mode, on
#            stdout; a message on stderr for every skipped file
#######################################
collectExtractionFun() {
  local path file version lang extraction extension
  local mapped artifact contVars targetDir targetFile

  # Substring removal (${var#pattern}, ${var%%pattern}, ...) is documented at
  # https://www.tldp.org/LDP/abs/html/string-manipulation.html
  #
  # find emits NUL-terminated paths so that names containing whitespace or
  # glob characters reach the loop body unmodified.
  while IFS= read -r -d '' path; do
    file="${path##*/}"

    # Second dash-separated field is the dump date: yyyymmdd -> yyyy.mm.dd
    version="${file#*-}"
    version="${version%%-*}"
    version="${version:0:4}.${version:4:2}.${version:6:2}"
    if [[ -n "$DUMPDATE" && "$DUMPDATE" != "$version" ]]; then
      continue
    fi

    lang="${file%%-*}"
    # Everything after the second dash up to the first dot.
    extraction="${file#*-*-}"
    extraction="${extraction%%.*}"
    # Everything after the first dot, e.g. "ttl.bz2".
    extension="${file#*.}"

    # The part of the mapped name before the first "_" is the artifact; the
    # remainder (if any) is appended to the content variants.
    mapped="$(mapExtraction "$extraction")"
    artifact="${mapped%%_*}"
    contVars="$(mapLang "$lang")"
    if [[ "$mapped" == *"_"* ]]; then
      contVars="${contVars}_${mapped#*_}"
    fi

    if [[ ! -d "$DATABUSMVNPOMDIR/$artifact" ]]; then
      echo "unmapped/notexist artifact: $artifact | $mapped | $extraction" >&2
      continue
    fi

    targetDir="$DATABUSMVNPOMDIR/$artifact/$version"
    targetFile="$artifact$contVars.$extension"

    # TRYRUN holds the command name "true" or "false" and is run as such.
    if $TRYRUN; then
      echo "$path -> $targetDir/$targetFile"
    else
      # The version directory is only created for a real run, so a try run
      # leaves the file system untouched.
      mkdir -p "$targetDir"
      # -n: never overwrite a file that is already present in the target.
      cp -vn "$path" "$targetDir/$targetFile"
    fi
  done < <(find "$BASEDIR" -name "*.ttl.bz2" -print0)
}
#######################################
# Run rename with the expression 's/_redirected//' on every entry below
# DATABUSMVNPOMDIR whose name contains "_redirected".
#
# NOTE(review): rename is called with -n, which is the "no action" switch of
# the common rename implementations, so this presumably only reports what
# would be renamed. Confirm the intent before dropping -n.
#
# Globals:   DATABUSMVNPOMDIR (read)
# Arguments: none
# Returns:   non-zero if the directory cannot be entered
# Side effects: the working directory of the calling shell is changed to
#               DATABUSMVNPOMDIR.
#######################################
renameRedirected() {
  local f

  # ":?" aborts instead of silently operating on another directory when the
  # variable is empty or unset.
  cd -- "${DATABUSMVNPOMDIR:?DATABUSMVNPOMDIR must be set}" || return

  # Stricter alternative that was considered (suffix directly before the
  # extension only):
  # rename -n 's/_redirected\.ttl\.bz2$/\.ttl\.bz2$/' "$f"
  #
  # NUL-delimited find output keeps names with whitespace in one piece.
  while IFS= read -r -d '' f; do
    rename -n 's/_redirected//' "$f"
  done < <(find . -name "*_redirected*" -print0)
}
# [Main]

# Entry point of the script: runs the collection step.
main() {
  collectExtractionFun
}

main