diff --git a/README.md b/README.md index 272b59d6c7fadca48b03155fda01d85c3d8fea20..3765eaad6f718362db314399cfb85677d971ca53 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,13 @@ # MARVIN-config -Configuration files for MARVIN on the TIB servers, public for forking the architecture +MARVIN is the release bot that performs automated DBpedia releases each month on three different servers for the generic, mappings, and wikidata extractions. + +The repository at https://git.informatik.uni-leipzig.de/dbpedia-assoc/marvin-config can be used to fork the architecture for creating extensions, developing new extractors or debugging old ones. + +Fixes and patches will be manually deployed via `git pull` from the `master` branch of the [DBpedia Extraction Framework](https://github.com/dbpedia/extraction-framework/). + +The architecture and workflow can also be forked and adapted to completely different extractions and derive operations outside of the DBpedia framework. + # Acknowledgements We thank Sören Auer and the Technische Informationsbibliothek (TIB) for providing three servers to run: @@ -9,20 +16,20 @@ We thank Sören Auer and the Technische Informationsbibliothek (TIB) for providi * community-provided extractors on Wikipedia, Wikidata or other sources * enrichment, cleaning and parsing services, so-called [Databus mods](https://github.com/dbpedia/databus-mods/) for open data on the Databus -This contribution by TIB is a great push towards incentivizing Open Data and establishing a global and national research and innovation data infrastructure. +This contribution by TIB to DBpedia & its community is a great push towards incentivizing Open Data and establishing a global and national research and innovation data infrastructure. 
# Workflow ## Downloading the wikimedia dumps TODO -## Running the extraction +## Update and Run the extraction TODO -## Deploy on Databus +## Deploy MARVIN on Databus TODO -## Run Databus-Derive (clone and parse) +## [Manual] Run Databus-Derive (clone and parse) On the respective server there is a user marvin-fetch, that has access to `/data/derive` containing the pom.xml of https://github.com/dbpedia/databus-maven-plugin/tree/master/dbpedia ``` @@ -37,15 +44,12 @@ SELECT distinct (?derive) WHERE { BIND (CONCAT("<version>",?artifact,"/${databus.deriveversion}</version>") as ?derive) } order by asc(?derive) +``` ``` -####### -# This is still manual, will be a cronjob soon -####### su marvin-fetch tmux a -t derive - WHAT=mappings NEWVERSION=2019.08.30 # prepare @@ -53,12 +57,24 @@ cd /data/derive/databus-maven-plugin/dbpedia/$WHAT git pull mvn versions:set -DnewVersion=$NEWVERSION # run -mvn -T 23 databus-derive:clone -Ddatabus.deriveversion=$NEWVERSION +mvn databus-derive:clone -Ddatabus.deriveversion=$NEWVERSION ``` -## Move data to download server (internal) +## [Manual] pull data to downloads.dbpedia.org server run marvin-fetch.sh script in databus/dbpedia folder +``` +cd /media/bigone/25TB/releases/databus-maven-plugin/dbpedia +./marvin-fetch.sh wikidata 2019.08.01 + +``` + ## Deploy official files +``` +cd /media/bigone/25TB/releases/databus-maven-plugin/dbpedia/mappings +mvn clean +mvn validate +mvn -T 8 deploy +``` diff --git a/generic/generic-release.sh b/generic/generic-release.sh index e9c64ce5b0edf2ab5655ffac06e65c72e473a86d..dbaf7098956f6e4882444877bf445132f2f062e0 100644 --- a/generic/generic-release.sh +++ b/generic/generic-release.sh @@ -20,7 +20,8 @@ DATABUSMAVENPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"; RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this"; RELEASEPACKAGEDIR="/data/extraction/release"; RELEASEDOWNLOADURL="http://dbpedia-generic.tib.eu/release"; -RELEASELABELPREFIX="" +RELEASELABELPREFIX="(pre-release)" 
+RELEASECOMMENTPREFIX="(MARVIN is the DBpedia bot, that runs the DBpedia Information Extraction Framework (DIEF) and releases the data as is, i.e. unparsed, unsorted, not redirected for debugging the software. After its releases, data is cleaned and persisted under the dbpedia account.)" #logging directory LOGS="/data/extraction/logs/$(date +%Y-%m-%d)"; @@ -93,7 +94,8 @@ deployRelease() { -Ddatabus.publisher="$RELEASEPUBLISHER" \ -Ddatabus.packageDirectory="$RELEASEPACKAGEDIR/\${project.groupId}/\${project.artifactId}" \ -Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}" \ - -Ddatabus.labelPrefix="$RELEASELABELPREFIX"; + -Ddatabus.labelPrefix="$RELEASELABELPREFIX" \ + -Ddatabus.commentPrefix="$RELEASECOMMENTPREFIX"; } compressLogs() { @@ -137,4 +139,6 @@ main() { compressLogs; } -execWithLogging main; +if [ ! -f "$SCRIPTROOT/generic-release.pid" ]; then + (execWithLogging main; rm "$SCRIPTROOT/generic-release.pid") & echo $! > "$SCRIPTROOT/generic-release.pid" +fi diff --git a/marvin-fetch.sh b/marvin-fetch.sh index f4665ca4e96ade0a3f787fbba72fb96999ebe068..ab284695f8f6d7fb34ca1e98548b4760fc22a549 100755 --- a/marvin-fetch.sh +++ b/marvin-fetch.sh @@ -1,16 +1,15 @@ #!/bin/bash - -# ./marvin-fetch.sh wikidata 2019.08.01 dbpedia-wikidata.tib.eu +# ./marvin-fetch.sh wikidata 2019.08.01 GROUP=$1 VERSION=$2 -SERVER=$3 +SERVER=dbpedia-$1.tib.eu # get artifacts ARTIFACTS=`xmlstarlet sel -N my=http://maven.apache.org/POM/4.0.0 -t -v "/my:project/my:modules/my:module" $GROUP/pom.xml` -for a in $ARTIFACTS ; do -echo $i -#scp -rv marvin-fetch@$SERVER:/data/databus-maven-plugin/dbpedia/$GROUP/$a/$VERSION $GROUP/$a/ -rsync -av -e ssh --ignore-existing marvin-fetch@$SERVER:/data/databus-maven-plugin/dbpedia/$GROUP/$a/$VERSION $GROUP/$a +for ARTIFACT in $ARTIFACTS ; do + echo $ARTIFACT + #scp -rv marvin-fetch@$SERVER:/data/databus-maven-plugin/dbpedia/$GROUP/$a/$VERSION $GROUP/$a/ + rsync -av -e ssh --ignore-existing 
marvin-fetch@$SERVER:/data/derive/databus-maven-plugin/dbpedia/$GROUP/$ARTIFACT/$VERSION $GROUP/$ARTIFACT done