From d0bce1804a75cd6daae7a1d7f665fd2bbd917401 Mon Sep 17 00:00:00 2001
From: Marvin Hofer <vehnem@yahoo.de>
Date: Wed, 4 Sep 2019 15:09:24 +0200
Subject: [PATCH] clean up wikidata-release.sh

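Rewrite the wikidata release script along the lines of the generic and
mappings release scripts: move it from wikidata/schedule/ to wikidata/,
split the pipeline into individually logged steps (download, build,
extraction, post-processing, release), and keep the old helper
functions in a clearly marked deprecated section.

Related cleanups in the generic and mappings workflows:

* rename DATABUSMVNDIR to DATABUSMVNPOMDIR in both collectExtraction.sh
  scripts, since the directory holds the release pom
* invoke collectExtraction.sh explicitly through bash
* point the generic download URL at dbpedia-generic.tib.eu instead of
  dbpedia-mappings.tib.eu
* pass the new databus.labelPrefix property to mvn deploy

Intended invocation (a sketch; assumes the paths configured in the
[CONFIG] section exist on the extraction server):

    cd wikidata && ./wikidata-release.sh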
---
 generic/collectExtraction.sh          |  12 +-
 generic/generic-release.sh            |   8 +-
 mappings/collectExtraction.sh         |  12 +-
 mappings/mappings-release.sh          |   6 +-
 wikidata/schedule/wikidata-release.sh | 140 -----------------
 wikidata/wikidata-release.sh          | 218 ++++++++++++++++++++++++++
 6 files changed, 239 insertions(+), 157 deletions(-)
 delete mode 100755 wikidata/schedule/wikidata-release.sh
 create mode 100755 wikidata/wikidata-release.sh

diff --git a/generic/collectExtraction.sh b/generic/collectExtraction.sh
index cdecb23..02849bd 100644
--- a/generic/collectExtraction.sh
+++ b/generic/collectExtraction.sh
@@ -8,7 +8,7 @@ set -e
 BASEDIR="/data/extraction/wikidumps/"
 
 #databus-maven-plugin project, containing release pom
-DATABUSMVNDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
+DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
 
 #explicit databus version or empty for all
 DUMPDATE=
@@ -95,17 +95,17 @@ collectExtractionFun() {
             targetArVe="$artifact/$version"
             targetFile="$artifact$contVars.$extension"
 
-            if [ -d "$DATABUSMVNDIR/$artifact" ]; then
+            if [ -d "$DATABUSMVNPOMDIR/$artifact" ]; then
 
-                if [ ! -d "$DATABUSMVNDIR/$targetArVe" ]; then
+                if [ ! -d "$DATABUSMVNPOMDIR/$targetArVe" ]; then
 
-                    mkdir -p "$DATABUSMVNDIR/$targetArVe"
+                    mkdir -p "$DATABUSMVNPOMDIR/$targetArVe"
                 fi
 
                 if $TRYRUN; then
-                    echo "$path -> $DATABUSMVNDIR/$targetArVe/$targetFile"
+                    echo "$path -> $DATABUSMVNPOMDIR/$targetArVe/$targetFile"
                 else
-                    cp -vn "$path" "$DATABUSMVNDIR/$targetArVe/$targetFile"
+                    cp -vn "$path" "$DATABUSMVNPOMDIR/$targetArVe/$targetFile"
                 fi
             else
 
diff --git a/generic/generic-release.sh b/generic/generic-release.sh
index 0fbd11c..e9c64ce 100644
--- a/generic/generic-release.sh
+++ b/generic/generic-release.sh
@@ -19,7 +19,8 @@ DATABUSMAVENPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic";
 #override release pom.xml properties
 RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this";
 RELEASEPACKAGEDIR="/data/extraction/release";
-RELEASEDOWNLOADURL="http://dbpedia-mappings.tib.eu/release";
+RELEASEDOWNLOADURL="http://dbpedia-generic.tib.eu/release";
+RELEASELABELPREFIX="";
 
 #logging directory
 LOGS="/data/extraction/logs/$(date +%Y-%m-%d)";
@@ -78,7 +79,7 @@ postProcessing() {
 prepareRelease() {
     #own config
     cd $SCRIPTROOT;
-    collectExtraction.sh;
+    bash collectExtraction.sh;
 }
 
 setNewVersion() {
@@ -91,7 +92,8 @@ deployRelease() {
     mvn deploy \
 	-Ddatabus.publisher="$RELEASEPUBLISHER" \
 	-Ddatabus.packageDirectory="$RELEASEPACKAGEDIR/\${project.groupId}/\${project.artifactId}" \
-	-Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}";
+	-Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}" \
+	-Ddatabus.labelPrefix="$RELEASELABELPREFIX";
 }
 
 compressLogs() {
diff --git a/mappings/collectExtraction.sh b/mappings/collectExtraction.sh
index d4d3716..77113b0 100644
--- a/mappings/collectExtraction.sh
+++ b/mappings/collectExtraction.sh
@@ -8,7 +8,7 @@ set -e
 BASEDIR="/data/extraction/wikidumps/"
 
 #databus-maven-plugin project, containing release pom
-DATABUSMVNDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
+DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
 
 #explicit databus version or empty for all
 DUMPDATE=
@@ -91,17 +91,17 @@ copyToMavenPlugin() {
             targetArVe="$artifact/$version"
             targetFile="$artifact$contVars.$extension"
 
-            if [ -d "$DATABUSMVNDIR/$artifact" ]; then
+            if [ -d "$DATABUSMVNPOMDIR/$artifact" ]; then
 
-                if [ ! -d "$DATABUSMVNDIR/$targetArVe" ]; then
+                if [ ! -d "$DATABUSMVNPOMDIR/$targetArVe" ]; then
 
-                    mkdir -p "$DATABUSMVNDIR/$targetArVe"
+                    mkdir -p "$DATABUSMVNPOMDIR/$targetArVe"
                 fi
 
                 if $TRYRUN; then
-                    echo "$path -> $DATABUSMVNDIR/$targetArVe/$targetFile"
+                    echo "$path -> $DATABUSMVNPOMDIR/$targetArVe/$targetFile"
                 else
-                    cp -vn "$path" "$DATABUSMVNDIR/$targetArVe/$targetFile"
+                    cp -vn "$path" "$DATABUSMVNPOMDIR/$targetArVe/$targetFile"
                 fi
             else
 
diff --git a/mappings/mappings-release.sh b/mappings/mappings-release.sh
index ce0c405..98f99b2 100755
--- a/mappings/mappings-release.sh
+++ b/mappings/mappings-release.sh
@@ -20,6 +20,7 @@ DATABUSMAVENPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/mappings";
 RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this";
 RELEASEPACKAGEDIR="/data/extraction/release";
 RELEASEDOWNLOADURL="http://dbpedia-mappings.tib.eu/release";
+RELEASELABELPREFIX="";
 
 #logging directory
 LOGS="/data/extraction/logs/$(date +%Y-%m-%d)";
@@ -77,7 +78,7 @@ postProcessing() {
 prepareRelease() {
     #own config
     cd $SCRIPTROOT;
-    collectExtraction.sh;
+    bash collectExtraction.sh;
 }
 
 setNewVersion() {
@@ -90,7 +91,8 @@ deployRelease() {
     mvn deploy \
 	-Ddatabus.publisher="$RELEASEPUBLISHER" \
 	-Ddatabus.packageDirectory="$RELEASEPACKAGEDIR/\${project.groupId}/\${project.artifactId}" \
-	-Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}";
+	-Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}" \
+	-Ddatabus.labelPrefix="$RELEASELABELPREFIX";
 }
 
 compressLogs() {
diff --git a/wikidata/schedule/wikidata-release.sh b/wikidata/schedule/wikidata-release.sh
deleted file mode 100755
index 3b8369c..0000000
--- a/wikidata/schedule/wikidata-release.sh
+++ /dev/null
@@ -1,140 +0,0 @@
-#!/bin/bash
-# Wikidata DBpedia release script, version 1.0
-
-set -e
-
-TIME_DATE="2019-07-01"  #$(date +%Y-%m-%d)
-
-EXTRACT_DIR=/home/extractor/extraction-framework
-MVN_LOGS=/data/extraction/logs/mvn
-#DATA_DIR=`awk -F= '/base-dir/{print $NF}' $EXTRACT_DIR/core/src/main/resources/universal.properties | head -n1`
-DATA_DIR=/data/extraction/wikidumps/
-#WWW_DIR=/var/www/html/wikidata
-
-function download-ontology(){
- cd $EXTRACT_DIR/core;
- ../run download-ontology;
-}
-
-function recompile(){
- cd $EXTRACT_DIR;
- mvn clean install;
-}
-
-function download-r2r-mapping(){
- cd $EXTRACT_DIR/core/src/main/resources && curl https://raw.githubusercontent.com/dbpedia/extraction-framework/master/core/src/main/resources/wikidatar2r.json > wikidatar2r.json
-}
-
-function download-xml-dump(){
- cd $EXTRACT_DIR/dump;
- ../run download download.wikidata.properties \
- > $MVN_LOGS/$TIME_DATE-wikidata.download.out \
- 2> $MVN_LOGS/$TIME_DATE-wikidata.download.err;
-}
-
-function raw-extractor(){
- cd $EXTRACT_DIR/dump;
- #Run only .WikidataRawExtractor
- ../run extraction extraction.wikidataraw.properties; 
-}
-
-function subclassof-script(){
- cd $EXTRACT_DIR/scripts;
- ../run WikidataSubClassOf process.wikidata.subclassof.properties; 
-}
-
-function all-other-extractors(){
- cd $EXTRACT_DIR/dump; 
- # Run all other extractors
- ../run extraction extraction.wikidataexceptraw.properties
-}
-
-function all-extractors(){
- cd $EXTRACT_DIR/dump;
- # Run all extractors to run extraction
- ../run extraction extraction.wikidata.properties;
-# > $MVN_LOGS/$TIME_DATE-wikidata.extraction.out \
-# 2> $MVN_LOGS/$TIME_DATE-wikidata.extraction.err;
-
-}
-
-function post-processing(){
- cd $EXTRACT_DIR/scripts;
- ../run ResolveTransitiveLinks $DATA_DIR redirects transitive-redirects .ttl.bz2 wikidata
- ../run MapObjectUris $DATA_DIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata
-}
-
-function type-consistency-check(){
- cd $EXTRACT_DIR/scripts;
- ../run TypeConsistencyCheck type.consistency.check.properties;
-}
-
-function sync-with-www(){
- rsync -avz $DATA_DIR/wikidatawiki/ $WWW_DIR/;
-
- #We don't need index.html
- find $WWW_DIR/ | grep index.html | xargs rm -rf;
-}
-
-function databus-preparation(){
-  cd $DATA_DIR;
-  bash ~/databusPrep.sh $WWW_DIR/ src/main/databus;
-}
-
-function delete-old-extractions(){
- #Delete extractions older than 1 month, i.e. keep 1-2 results in www.
- find $WWW_DIR/ -type d -ctime +20 | xargs rm -rf;
-
- #Remove everything in Dump dir, do we need to keep them?
- rm -rf $DATA_DIR/wikidatawiki/*;
-}
-
-function remove-date-from-files(){
- #Go to the last changed directory
- cd "$(\ls -1dt $WWW_DIR/*/ | head -n 1)";
-
- #Remove date (numbers) from files
- for i in *; do  mv "$i" "`echo $i| sed 's/[0-9]..//g'`"; done;
-}
-
-function main() {
- #delete-old-extractions; #to have some space for new extraction
-
-# touch download.process;
-
- download-ontology;
- download-r2r-mapping;
- download-xml-dump;
- recompile;
- all-extractors;
-
- post-processing;
- type-consistency-check;
-
- cd /data/extraction/wikidumps;
- ./prep.sh;
-
- cd /data/extraction/databus-maven-plugin/dbpedia/wikidata;
- mvn package;
- mvn databus:deploy;
-
-#----
-# below not configured yet
-#----
-
- ##Result of subclassof-script is used in next extraction.
- #subclassof-script;
- #databus-preparation;
- #Sync extraction with www
- #sync-with-www
- #remove-date-from-files
-
-#This was the previous extraction process. Now we don't need to run rawextractor separately
-# raw-extractor;
-# subclassof-script;
-# all-other-extractors;
-# post-processing;
-}
-
-main
-
diff --git a/wikidata/wikidata-release.sh b/wikidata/wikidata-release.sh
new file mode 100755
index 0000000..0525aaf
--- /dev/null
+++ b/wikidata/wikidata-release.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+
+set -e
+
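+#absolute directory of this script, resolved even when invoked via a relative path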
+SCRIPTROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# [CONFIG]
+
+#extraction-framework
+EXTRACTIONFRAMEWORKDIR="/home/extractor/extraction-framework";
+
+#extracted dumps (basedir)
+BASEDIR="/data/extraction/wikidumps";
+
+#databus-maven-plugin project, containing release pom
+#https://github.com/dbpedia/databus-maven-plugin/blob/master/dbpedia/wikidata/pom.xml
+DATABUSMAVENPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/wikidata";
+
+#override release pom.xml properties
+RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this";
+RELEASEPACKAGEDIR="/data/extraction/release";
+RELEASEDOWNLOADURL="http://dbpedia-wikidata.tib.eu/release";
+RELEASELABELPREFIX="";
+
+#logging directory
+LOGS="/data/extraction/logs/$(date +%Y-%m-%d)";
+mkdir -p $LOGS;
+
+# [FUNCTIONS]
+
+execWithLogging() {
+    #$1 := name of the function to run; stdout/stderr go to $LOGS/$1.out and $LOGS/$1.err
+    $1 > "$LOGS/$1.out" 2> "$LOGS/$1.err";
+}
+
+downloadOntology() {
+    cd $EXTRACTIONFRAMEWORKDIR/core;
+    ../run download-ontology;
+}
+
+downloadMappings() {
+    cd $EXTRACTIONFRAMEWORKDIR/core;
+    ../run download-mappings;
+}
+
+downloadR2R() {
+    cd $EXTRACTIONFRAMEWORKDIR/core/src/main/resources && curl https://raw.githubusercontent.com/dbpedia/extraction-framework/master/core/src/main/resources/wikidatar2r.json > wikidatar2r.json
+}
+
+downloadDumps() {
+    cd $EXTRACTIONFRAMEWORKDIR/dump;
+    ../run download $SCRIPTROOT/download.wikidata.properties;
+}
+
+buildExtractionFramework() {
+    cd $EXTRACTIONFRAMEWORKDIR;
+    mvn clean install;
+}
+
+runExtraction() {
+    cd $EXTRACTIONFRAMEWORKDIR/dump;
+    ../run extraction extraction.wikidata.properties;
+}
+
+resolveTransitiveLinks() {
+    cd $EXTRACTIONFRAMEWORKDIR/scripts;
+    ../run ResolveTransitiveLinks $BASEDIR redirects transitive-redirects .ttl.bz2 wikidata
+}
+
+mapObjectUris() {
+    cd $EXTRACTIONFRAMEWORKDIR/scripts;
+    ../run MapObjectUris $BASEDIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata
+}
+
+typeConsistencyCheck() {
+    cd $EXTRACTIONFRAMEWORKDIR/scripts;
+    ../run TypeConsistencyCheck type.consistency.check.properties;
+}
+
+postProcessing() {
+    echo "$(date) | extraction-framework | resolve transitive links" >&2;
+    execWithLogging resolveTransitiveLinks;
+    echo "$(date) | extraction-framework | map object uris" >&2;
+    execWithLogging mapObjectUris;
+    echo "$(date) | extraction-framework | type consistency check" >&2;
+    execWithLogging typeConsistencyCheck;
+}
+
+prepareRelease() {
+    #wikidata uses its own prep script instead of collectExtraction.sh
+    cd $SCRIPTROOT/schedule;
+    bash prep.sh;
+}
+
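+#the newest YYYY.MM.DD directory below the artifact directories
+#(assumed to be created by prepareRelease) becomes the release version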
+setNewVersion() {
+    cd $DATABUSMAVENPOMDIR;
+    mvn versions:set -DnewVersion=$(ls * | grep '^[0-9]\{4\}\.[0-9]\{2\}\.[0-9]\{2\}$' | sort -u | tail -1);
+}
+
+deployRelease() {
+    cd $DATABUSMAVENPOMDIR;
+    mvn deploy \
+	-Ddatabus.publisher="$RELEASEPUBLISHER" \
+	-Ddatabus.packageDirectory="$RELEASEPACKAGEDIR/\${project.groupId}/\${project.artifactId}" \
+	-Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}" \
+	-Ddatabus.labelPrefix="$RELEASELABELPREFIX";
+}
+
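+#assumes lbzip2 (parallel bzip2) is installed on the host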
+compressLogs() {
+    find "$LOGS" -type f -exec lbzip2 {} \;
+}
+
+# [MAIN]
+main() {
+
+    #download
+    echo "$(date) | extraction-framework | start download ontology" >&2;
+    execWithLogging downloadOntology;
+    echo "$(date) | extraction-framework | start download mappings" >&2;
+    execWithLogging downloadMappings;
+    echo "$(date) | extraction-framework | start download r2r mappings" >&2;
+    execWithLogging downloadR2R;
+    echo "$(date) | extraction-framework | start download dumps" >&2;
+    execWithLogging downloadDumps;
+
+    #extraction
+    echo "$(date) | extraction-framework | mvn clean install" >&2;
+    execWithLogging buildExtractionFramework;
+    echo "$(date) | extraction-framework | start extraction" >&2;
+    execWithLogging runExtraction;
+    echo "$(date) | extraction-framework | post processing" >&2;
+    postProcessing;
+
+    #release
+    echo "$(date) | databus-maven-plugin | collect extracted datasets" >&2;
+    execWithLogging prepareRelease;
+    echo "$(date) | databus-maven-plugin | mvn versions:set" >&2;
+    execWithLogging setNewVersion;
+    echo "$(date) | databus-maven-plugin | mvn deploy" >&2;
+    execWithLogging deployRelease;
+
+    #cleanup
+    echo "$(date) | main | compress log files" >&2;
+    compressLogs;
+
+    #(DEPRECATED FUNCTIONS) the calls below are not configured yet
+
+    ##Result of subclassof-script is used in next extraction.
+    #subclassof-script;
+    #databus-preparation;
+    #Sync extraction with www
+    #sync-with-www
+    #remove-date-from-files
+
+    #This was the previous extraction process; the raw extractor no longer needs to run separately
+    # raw-extractor;
+    # subclassof-script;
+    # all-other-extractors;
+    # post-processing;
+}
+
+main
+
+# [DEPRECATED]
+
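+#kept for reference from the old schedule/wikidata-release.sh;
+#none of these functions are called by main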
+DATA_DIR=/data/extraction/wikidumps/
+WWW_DIR=/var/www/html/wikidata
+
+function sync-with-www(){
+    rsync -avz $DATA_DIR/wikidatawiki/ $WWW_DIR/;
+
+    #We don't need index.html
+    find $WWW_DIR/ | grep index.html | xargs rm -rf;
+}
+
+function databus-preparation(){
+    cd $DATA_DIR;
+    bash ~/databusPrep.sh $WWW_DIR/ src/main/databus;
+}
+
+function delete-old-extractions(){
+    #Delete extractions older than 1 month, i.e. keep 1-2 results in www.
+    find $WWW_DIR/ -type d -ctime +20 | xargs rm -rf;
+
+    #Remove everything in Dump dir, do we need to keep them?
+    rm -rf $DATA_DIR/wikidatawiki/*;
+}
+
+function remove-date-from-files(){
+    #Go to the last changed directory
+    cd "$(\ls -1dt $WWW_DIR/*/ | head -n 1)";
+
+    #Remove date (numbers) from files
+    for i in *; do  mv "$i" "`echo $i| sed 's/[0-9]..//g'`"; done;
+}
+
+function raw-extractor(){
+    cd $EXTRACTIONFRAMEWORKDIR/dump;
+    #Run only .WikidataRawExtractor
+    ../run extraction extraction.wikidataraw.properties;
+}
+
+function subclassof-script(){
+    cd $EXTRACTIONFRAMEWORKDIR/scripts;
+    ../run WikidataSubClassOf process.wikidata.subclassof.properties;
+}
+
+function all-other-extractors(){
+    cd $EXTRACTIONFRAMEWORKDIR/dump;
+    # Run all other extractors
+    ../run extraction extraction.wikidataexceptraw.properties
+}
-- 
GitLab