From d0bce1804a75cd6daae7a1d7f665fd2bbd917401 Mon Sep 17 00:00:00 2001
From: Marvin Hofer <vehnem@yahoo.de>
Date: Wed, 4 Sep 2019 15:09:24 +0200
Subject: [PATCH] cleanup wikidata-release.sh

---
 generic/collectExtraction.sh          |  12 +-
 generic/generic-release.sh            |   8 +-
 mappings/collectExtraction.sh         |  12 +-
 mappings/mappings-release.sh          |   6 +-
 wikidata/schedule/wikidata-release.sh | 140 -----------------
 wikidata/wikidata-release.sh          | 214 ++++++++++++++++++++++++++
 6 files changed, 235 insertions(+), 157 deletions(-)
 delete mode 100755 wikidata/schedule/wikidata-release.sh
 create mode 100755 wikidata/wikidata-release.sh

diff --git a/generic/collectExtraction.sh b/generic/collectExtraction.sh
index cdecb23..02849bd 100644
--- a/generic/collectExtraction.sh
+++ b/generic/collectExtraction.sh
@@ -8,7 +8,7 @@ set -e
 BASEDIR="/data/extraction/wikidumps/"
 
 #databus-maven-plugin project, containing release pom
-DATABUSMVNDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
+DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
 
 #explicit databus version or empty for all
 DUMPDATE=
@@ -95,17 +95,17 @@ collectExtractionFun() {
     targetArVe="$artifact/$version"
     targetFile="$artifact$contVars.$extension"
 
-    if [ -d "$DATABUSMVNDIR/$artifact" ]; then
+    if [ -d "$DATABUSMVNPOMDIR/$artifact" ]; then
 
-      if [ ! -d "$DATABUSMVNDIR/$targetArVe" ]; then
+      if [ ! -d "$DATABUSMVNPOMDIR/$targetArVe" ]; then
 
-        mkdir -p "$DATABUSMVNDIR/$targetArVe"
+        mkdir -p "$DATABUSMVNPOMDIR/$targetArVe"
       fi
 
       if $TRYRUN; then
-        echo "$path -> $DATABUSMVNDIR/$targetArVe/$targetFile"
+        echo "$path -> $DATABUSMVNPOMDIR/$targetArVe/$targetFile"
       else
-        cp -vn "$path" "$DATABUSMVNDIR/$targetArVe/$targetFile"
+        cp -vn "$path" "$DATABUSMVNPOMDIR/$targetArVe/$targetFile"
       fi
 
     else
diff --git a/generic/generic-release.sh b/generic/generic-release.sh
index 0fbd11c..e9c64ce 100644
--- a/generic/generic-release.sh
+++ b/generic/generic-release.sh
@@ -19,7 +19,8 @@ DATABUSMAVENPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic";
 #override release pom.xml properties
 RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this";
 RELEASEPACKAGEDIR="/data/extraction/release";
-RELEASEDOWNLOADURL="http://dbpedia-mappings.tib.eu/release";
+RELEASEDOWNLOADURL="http://dbpedia-generic.tib.eu/release";
+RELEASELABELPREFIX=""
 
 #logging directory
 LOGS="/data/extraction/logs/$(date +%Y-%m-%d)";
@@ -78,7 +79,7 @@ postProcessing() {
 prepareRelease() {
   #own config
   cd $SCRIPTROOT;
-  collectExtraction.sh;
+  bash collectExtraction.sh;
 }
 
 setNewVersion() {
@@ -91,7 +92,8 @@ deployRelease() {
   mvn deploy \
     -Ddatabus.publisher="$RELEASEPUBLISHER" \
     -Ddatabus.packageDirectory="$RELEASEPACKAGEDIR/\${project.groupId}/\${project.artifactId}" \
-    -Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}";
+    -Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}" \
+    -Ddatabus.labelPrefix="$RELEASELABELPREFIX";
 }
 
 compressLogs() {
diff --git a/mappings/collectExtraction.sh b/mappings/collectExtraction.sh
index d4d3716..77113b0 100644
--- a/mappings/collectExtraction.sh
+++ b/mappings/collectExtraction.sh
@@ -8,7 +8,7 @@ set -e
 BASEDIR="/data/extraction/wikidumps/"
 
 #databus-maven-plugin project, containing release pom
-DATABUSMVNDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
+DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
 
 #explicit databus version or empty for all
 DUMPDATE=
@@ -91,17 +91,17 @@ copyToMavenPlugin() {
     targetArVe="$artifact/$version"
     targetFile="$artifact$contVars.$extension"
 
-    if [ -d "$DATABUSMVNDIR/$artifact" ]; then
+    if [ -d "$DATABUSMVNPOMDIR/$artifact" ]; then
 
-      if [ ! -d "$DATABUSMVNDIR/$targetArVe" ]; then
+      if [ ! -d "$DATABUSMVNPOMDIR/$targetArVe" ]; then
 
-        mkdir -p "$DATABUSMVNDIR/$targetArVe"
+        mkdir -p "$DATABUSMVNPOMDIR/$targetArVe"
       fi
 
       if $TRYRUN; then
-        echo "$path -> $DATABUSMVNDIR/$targetArVe/$targetFile"
+        echo "$path -> $DATABUSMVNPOMDIR/$targetArVe/$targetFile"
       else
-        cp -vn "$path" "$DATABUSMVNDIR/$targetArVe/$targetFile"
+        cp -vn "$path" "$DATABUSMVNPOMDIR/$targetArVe/$targetFile"
       fi
 
     else
diff --git a/mappings/mappings-release.sh b/mappings/mappings-release.sh
index ce0c405..98f99b2 100755
--- a/mappings/mappings-release.sh
+++ b/mappings/mappings-release.sh
@@ -20,6 +20,7 @@ DATABUSMAVENPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/mappings";
 RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this";
 RELEASEPACKAGEDIR="/data/extraction/release";
 RELEASEDOWNLOADURL="http://dbpedia-mappings.tib.eu/release";
+RELEASELABELPREFIX=""
 
 #logging directory
 LOGS="/data/extraction/logs/$(date +%Y-%m-%d)";
@@ -77,7 +78,7 @@ postProcessing() {
 prepareRelease() {
   #own config
   cd $SCRIPTROOT;
-  collectExtraction.sh;
+  bash collectExtraction.sh;
 }
 
 setNewVersion() {
@@ -90,7 +91,8 @@ deployRelease() {
   mvn deploy \
     -Ddatabus.publisher="$RELEASEPUBLISHER" \
     -Ddatabus.packageDirectory="$RELEASEPACKAGEDIR/\${project.groupId}/\${project.artifactId}" \
-    -Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}";
+    -Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}" \
+    -Ddatabus.labelPrefix="$RELEASELABELPREFIX";
 }
 
 compressLogs() {
diff --git a/wikidata/schedule/wikidata-release.sh b/wikidata/schedule/wikidata-release.sh
deleted file mode 100755
index 3b8369c..0000000
--- a/wikidata/schedule/wikidata-release.sh
+++ /dev/null
@@ -1,140 +0,0 @@
-#!/bin/bash
-# Wikidata DBpedia release script, version 1.0
-
-set -e
-
-TIME_DATE="2019-07-01" #$(date +%Y-%m-%d)
-
-EXTRACT_DIR=/home/extractor/extraction-framework
-MVN_LOGS=/data/extraction/logs/mvn
-#DATA_DIR=`awk -F= '/base-dir/{print $NF}' $EXTRACT_DIR/core/src/main/resources/universal.properties | head -n1`
-DATA_DIR=/data/extraction/wikidumps/
-#WWW_DIR=/var/www/html/wikidata
-
-function download-ontology(){
-  cd $EXTRACT_DIR/core;
-  ../run download-ontology;
-}
-
-function recompile(){
-  cd $EXTRACT_DIR;
-  mvn clean install;
-}
-
-function download-r2r-mapping(){
-  cd $EXTRACT_DIR/core/src/main/resources && curl https://raw.githubusercontent.com/dbpedia/extraction-framework/master/core/src/main/resources/wikidatar2r.json > wikidatar2r.json
-}
-
-function download-xml-dump(){
-  cd $EXTRACT_DIR/dump;
-  ../run download download.wikidata.properties \
-    > $MVN_LOGS/$TIME_DATE-wikidata.download.out \
-    2> $MVN_LOGS/$TIME_DATE-wikidata.download.err;
-}
-
-function raw-extractor(){
-  cd $EXTRACT_DIR/dump;
-  #Run only .WikidataRawExtractor
-  ../run extraction extraction.wikidataraw.properties;
-}
-
-function subclassof-script(){
-  cd $EXTRACT_DIR/scripts;
-  ../run WikidataSubClassOf process.wikidata.subclassof.properties;
-}
-
-function all-other-extractors(){
-  cd $EXTRACT_DIR/dump;
-  # Run all other extractors
-  ../run extraction extraction.wikidataexceptraw.properties
-}
-
-function all-extractors(){
-  cd $EXTRACT_DIR/dump;
-  # Run all extractors to run extraction
-  ../run extraction extraction.wikidata.properties;
-#    > $MVN_LOGS/$TIME_DATE-wikidata.extraction.out \
-#    2> $MVN_LOGS/$TIME_DATE-wikidata.extraction.err;
-
-}
-
-function post-processing(){
-  cd $EXTRACT_DIR/scripts;
-  ../run ResolveTransitiveLinks $DATA_DIR redirects transitive-redirects .ttl.bz2 wikidata
-  ../run MapObjectUris $DATA_DIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata
-}
-
-function type-consistency-check(){
-  cd $EXTRACT_DIR/scripts;
-  ../run TypeConsistencyCheck type.consistency.check.properties;
-}
-
-function sync-with-www(){
-  rsync -avz $DATA_DIR/wikidatawiki/ $WWW_DIR/;
-
-  #We don't need index.html
-  find $WWW_DIR/ | grep index.html | xargs rm -rf;
-}
-
-function databus-preparation(){
-  cd $DATA_DIR;
-  bash ~/databusPrep.sh $WWW_DIR/ src/main/databus;
-}
-
-function delete-old-extractions(){
-  #Delete extractions older than 1 month, i.e. keep 1-2 results in www.
-  find $WWW_DIR/ -type d -ctime +20 | xargs rm -rf;
-
-  #Remove everything in Dump dir, do we need to keep them?
-  rm -rf $DATA_DIR/wikidatawiki/*;
-}
-
-function remove-date-from-files(){
-  #Go to the last changed directory
-  cd "$(\ls -1dt $WWW_DIR/*/ | head -n 1)";
-
-  #Remove date (numbers) from files
-  for i in *; do mv "$i" "`echo $i| sed 's/[0-9]..//g'`"; done;
-}
-
-function main() {
-  #delete-old-extractions; #to have some space for new extraction
-
-#  touch download.process;
-
-  download-ontology;
-  download-r2r-mapping;
-  download-xml-dump;
-  recompile;
-  all-extractors;
-
-  post-processing;
-  type-consistency-check;
-
-  cd /data/extraction/wikidumps;
-  ./prep.sh;
-
-  cd /data/extraction/databus-maven-plugin/dbpedia/wikidata;
-  mvn package;
-  mvn databus:deploy;
-
-#----
-# below not configured yet
-#----
-
-  ##Result of subclassof-script is used in next extraction.
-  #subclassof-script;
-  #databus-preparation;
-  #Sync extraction with www
-  #sync-with-www
-  #remove-date-from-files
-
-#This was the previous extraction process. Now we don't need to run rawextractor separately
-#  raw-extractor;
-#  subclassof-script;
-#  all-other-extractors;
-#  post-processing;
-}
-
-main
-
diff --git a/wikidata/wikidata-release.sh b/wikidata/wikidata-release.sh
new file mode 100755
index 0000000..0525aaf
--- /dev/null
+++ b/wikidata/wikidata-release.sh
@@ -0,0 +1,214 @@
+#!/bin/bash
+
+set -e
+
+SCRIPTROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# [CONFIG]
+
+#extraction-framework
+EXTRACTIONFRAMEWORKDIR="/home/extractor/extraction-framework";
+
+#extracted dumps (basedir)
+BASEDIR="/data/extraction/wikidumps";
+
+#databus-maven-plugin project, containing release pom
+#https://github.com/dbpedia/databus-maven-plugin/blob/master/dbpedia/wikidata/pom.xml
+DATABUSMAVENPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/wikidata";
+
+#override release pom.xml properties
+RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this";
+RELEASEPACKAGEDIR="/data/extraction/release";
+RELEASEDOWNLOADURL="http://dbpedia-wikidata.tib.eu/release";
+RELEASELABELPREFIX=""
+
+#logging directory
+LOGS="/data/extraction/logs/$(date +%Y-%m-%d)";
+mkdir -p $LOGS;
+
+# [FUNCTIONS]
+
+execWithLogging() {
+  #arg(0) = $1 := "function name"
+  $1 > "$LOGS/$1.out" 2> "$LOGS/$1.err";
+}
+
+downloadOntology() {
+  cd $EXTRACTIONFRAMEWORKDIR/core;
+  ../run download-ontology;
+}
+
+downloadMappings() {
+  cd $EXTRACTIONFRAMEWORKDIR/core;
+  ../run download-mappings;
+}
+
+downloadR2R() {
+  cd $EXTRACTIONFRAMEWORKDIR/core/src/main/resources && curl https://raw.githubusercontent.com/dbpedia/extraction-framework/master/core/src/main/resources/wikidatar2r.json > wikidatar2r.json
+}
+
+downloadDumps() {
+  cd $EXTRACTIONFRAMEWORKDIR/dump;
+  ../run download $SCRIPTROOT/download.wikidata.properties;
+}
+
+buildExtractionFramework() {
+  cd $EXTRACTIONFRAMEWORKDIR;
+  mvn clean install;
+}
+
+runExtraction(){
+  cd $EXTRACTIONFRAMEWORKDIR/dump;
+  ../run extraction extraction.wikidata.properties;
+}
+
+resolveTransitiveLinks() {
+  cd $EXTRACTIONFRAMEWORKDIR/scripts;
+  ../run ResolveTransitiveLinks $BASEDIR redirects transitive-redirects .ttl.bz2 wikidata
+}
+
+mapObjectUris() {
+  cd $EXTRACTIONFRAMEWORKDIR/scripts;
+  ../run MapObjectUris $BASEDIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata
+}
+
+typeConsistencyCheck(){
+  cd $EXTRACTIONFRAMEWORKDIR/scripts;
+  ../run TypeConsistencyCheck type.consistency.check.properties;
+}
+
+postProcessing() {
+  echo "$(date) | extraction-framework | resolve transitive links" >&2;
+  execWithLogging resolveTransitiveLinks;
+  echo "$(date) | extraction-framework | map object uris" >&2;
+  execWithLogging mapObjectUris;
+  echo "$(date) | extraction-framework | type consistency check" >&2;
+  execWithLogging typeConsistencyCheck;
+}
+
+prepareRelease() {
+  #own config
+  # cd $SCRIPTROOT;
+  # collectExtraction.sh;
+  cd $SCRIPTROOT/schedule;
+  bash prep.sh
+}
+
+setNewVersion() {
+  cd $DATABUSMAVENPOMDIR;
+  mvn versions:set -DnewVersion=$(ls * | grep '^[0-9]\{4\}.[0-9]\{2\}.[0-9]\{2\}$' | sort -u | tail -1);
+}
+
+deployRelease() {
+  cd $DATABUSMAVENPOMDIR;
+  mvn deploy \
+    -Ddatabus.publisher="$RELEASEPUBLISHER" \
+    -Ddatabus.packageDirectory="$RELEASEPACKAGEDIR/\${project.groupId}/\${project.artifactId}" \
+    -Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}" \
+    -Ddatabus.labelPrefix="$RELEASELABELPREFIX";
+}
+
+compressLogs() {
+  for f in $(find $LOGS -type f ); do lbzip2 $f; done;
+}
+
+# [MAIN]
+main() {
+
+  #download
+  echo "$(date) | extraction-framework | start download ontology" >&2;
+  execWithLogging downloadOntology;
+  echo "$(date) | extraction-framework | start download mappings" >&2;
+  execWithLogging downloadMappings;
+  echo "$(date) | extraction-framework | start download r2r mappings" >&2;
+  execWithLogging downloadR2R;
+  echo "$(date) | extraction-framework | start download dumps" >&2;
+  execWithLogging downloadDumps;
+
+  #extraction
+  echo "$(date) | extraction-framework | mvn clean install" >&2;
+  execWithLogging buildExtractionFramework;
+  echo "$(date) | extraction-framework | start extraction" >&2;
+  execWithLogging runExtraction;
+  echo "$(date) | extraction-framework | post processing" >&2;
+  postProcessing;
+
+  #release
+  echo "$(date) | databus-maven-plugin | collect extracted datasets" >&2;
+  execWithLogging prepareRelease;
+  echo "$(date) | databus-maven-plugin | mvn versions:set" >&2;
+  execWithLogging setNewVersion;
+  echo "$(date) | databus-maven-plugin | mvn deploy" >&2;
+  execWithLogging deployRelease;
+
+  #cleanup
+  echo "$(date) | main | compress log files" >&2;
+  compressLogs;
+
+  #(DEPRECATED FUNCTIONS) below not configured yet
+
+  ##Result of subclassof-script is used in next extraction.
+  #subclassof-script;
+  #databus-preparation;
+  #Sync extraction with www
+  #sync-with-www
+  #remove-date-from-files
+
+  #This was the previous extraction process. Now we don't need to run rawextractor separately
+  # raw-extractor;
+  # subclassof-script;
+  # all-other-extractors;
+  # post-processing;
+}
+
+main
+
+# [DEPRECATED]
+
+DATA_DIR=/data/extraction/wikidumps/
+WWW_DIR=/var/www/html/wikidata
+
+function sync-with-www(){
+  rsync -avz $DATA_DIR/wikidatawiki/ $WWW_DIR/;
+
+  #We don't need index.html
+  find $WWW_DIR/ | grep index.html | xargs rm -rf;
+}
+
+function databus-preparation(){
+  cd $DATA_DIR;
+  bash ~/databusPrep.sh $WWW_DIR/ src/main/databus;
+}
+
+function delete-old-extractions(){
+  #Delete extractions older than 1 month, i.e. keep 1-2 results in www.
+  find $WWW_DIR/ -type d -ctime +20 | xargs rm -rf;
+
+  #Remove everything in Dump dir, do we need to keep them?
+  rm -rf $DATA_DIR/wikidatawiki/*;
+}
+
+function remove-date-from-files(){
+  #Go to the last changed directory
+  cd "$(\ls -1dt $WWW_DIR/*/ | head -n 1)";
+
+  #Remove date (numbers) from files
+  for i in *; do mv "$i" "`echo $i| sed 's/[0-9]..//g'`"; done;
+}
+
+function raw-extractor(){
+  cd $EXTRACTIONFRAMEWORKDIR/dump;
+  #Run only .WikidataRawExtractor
+  ../run extraction extraction.wikidataraw.properties;
+}
+
+function subclassof-script(){
+  cd $EXTRACTIONFRAMEWORKDIR/scripts;
+  ../run WikidataSubClassOf process.wikidata.subclassof.properties;
+}
+
+function all-other-extractors(){
+  cd $EXTRACTIONFRAMEWORKDIR/dump;
+  # Run all other extractors
+  ../run extraction extraction.wikidataexceptraw.properties
+}
--
GitLab
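
[Editor's note, appended after the patch and not part of the commit] Two short sketches for readers of this change. Names not taken from the patch above (demoStep, /tmp/logs) are illustrative stand-ins.

The new wikidata-release.sh routes every pipeline step through execWithLogging, which runs the function named by $1 and splits its output into per-step log files under $LOGS. The wrapper only redirects output; because the script runs under `set -e`, a failing step still aborts the whole release. A minimal, self-contained sketch of the pattern:

    #!/bin/bash
    set -e

    # stand-in for /data/extraction/logs/$(date +%Y-%m-%d)
    LOGS="/tmp/logs/$(date +%Y-%m-%d)"
    mkdir -p "$LOGS"

    execWithLogging() {
      #arg(0) = $1 := "function name" (verbatim from the patch)
      $1 > "$LOGS/$1.out" 2> "$LOGS/$1.err";
    }

    # hypothetical step, standing in for downloadOntology, runExtraction, ...
    demoStep() {
      echo "doing work";      # captured in $LOGS/demoStep.out
      echo "a warning" >&2;   # captured in $LOGS/demoStep.err
    }

    echo "$(date) | demo | start demo step" >&2;
    execWithLogging demoStep;

Likewise, setNewVersion derives the Databus release version from the data itself rather than taking a parameter: inside the release pom directory, `ls *` lists the date-named version directories under each artifact, the grep keeps names shaped like YYYY.MM.DD, and `sort -u | tail -1` picks the lexicographically greatest, i.e. newest, date, which mvn versions:set then stamps into the poms:

    ls * | grep '^[0-9]\{4\}.[0-9]\{2\}.[0-9]\{2\}$' | sort -u | tail -1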