From 4146ba9d6ae3e1934f274f82d51bca7acf3aa6d9 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Wed, 28 Aug 2019 20:17:38 +0200
Subject: [PATCH] wikidata

---
 wikidata/crontab.bak                  |  23 +++
 wikidata/schedule/wikidata-release.sh | 140 ++++++++++++++++++++++++++
 2 files changed, 163 insertions(+)
 create mode 100644 wikidata/crontab.bak
 create mode 100755 wikidata/schedule/wikidata-release.sh

diff --git a/wikidata/crontab.bak b/wikidata/crontab.bak
new file mode 100644
index 0000000..4df7c31
--- /dev/null
+++ b/wikidata/crontab.bak
@@ -0,0 +1,23 @@
+# Edit this file to introduce tasks to be run by cron.
+#
+# Each task to run has to be defined through a single line
+# indicating with different fields when the task will be run
+# and what command to run for the task
+#
+# To define the time you can provide concrete values for
+# minute (m), hour (h), day of month (dom), month (mon),
+# and day of week (dow) or use '*' in these fields (for 'any').
+# Notice that tasks will be started based on the cron's system
+# daemon's notion of time and timezones.
+#
+# Output of the crontab jobs (including errors) is sent through
+# email to the user the crontab file belongs to (unless redirected).
+#
+# For example, you can run a backup of all your user accounts
+# at 5 a.m every week with:
+# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/
+#
+# For more information see the manual pages of crontab(5) and cron(8)
+#
+# m h dom mon dow command
+0 0 7 * * /bin/bash -c '/home/extractor/schedule/wikidata-release.sh' >/dev/null 2>&1
diff --git a/wikidata/schedule/wikidata-release.sh b/wikidata/schedule/wikidata-release.sh
new file mode 100755
index 0000000..3b8369c
--- /dev/null
+++ b/wikidata/schedule/wikidata-release.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+# Wikidata DBpedia release script, version 1.0
+
+set -e
+
+TIME_DATE="2019-07-01" #$(date +%Y-%m-%d)
+
+EXTRACT_DIR=/home/extractor/extraction-framework
+MVN_LOGS=/data/extraction/logs/mvn
+#DATA_DIR=`awk -F= '/base-dir/{print $NF}' $EXTRACT_DIR/core/src/main/resources/universal.properties | head -n1`
+DATA_DIR=/data/extraction/wikidumps/
+#WWW_DIR=/var/www/html/wikidata
+
+function download-ontology(){
+    cd $EXTRACT_DIR/core;
+    ../run download-ontology;
+}
+
+function recompile(){
+    cd $EXTRACT_DIR;
+    mvn clean install;
+}
+
+function download-r2r-mapping(){
+    cd $EXTRACT_DIR/core/src/main/resources && curl https://raw.githubusercontent.com/dbpedia/extraction-framework/master/core/src/main/resources/wikidatar2r.json > wikidatar2r.json
+}
+
+function download-xml-dump(){
+    cd $EXTRACT_DIR/dump;
+    ../run download download.wikidata.properties \
+        > $MVN_LOGS/$TIME_DATE-wikidata.download.out \
+        2> $MVN_LOGS/$TIME_DATE-wikidata.download.err;
+}
+
+function raw-extractor(){
+    cd $EXTRACT_DIR/dump;
+    #Run only the WikidataRawExtractor
+    ../run extraction extraction.wikidataraw.properties;
+}
+
+function subclassof-script(){
+    cd $EXTRACT_DIR/scripts;
+    ../run WikidataSubClassOf process.wikidata.subclassof.properties;
+}
+
+function all-other-extractors(){
+    cd $EXTRACT_DIR/dump;
+    # Run all extractors except the WikidataRawExtractor
+    ../run extraction extraction.wikidataexceptraw.properties
+}
+
+function all-extractors(){
+    cd $EXTRACT_DIR/dump;
+    # Run all extractors
+    ../run extraction extraction.wikidata.properties;
+#        > $MVN_LOGS/$TIME_DATE-wikidata.extraction.out \
+#        2> $MVN_LOGS/$TIME_DATE-wikidata.extraction.err;
+
+}
+
+function post-processing(){
+    cd $EXTRACT_DIR/scripts;
+    ../run ResolveTransitiveLinks $DATA_DIR redirects transitive-redirects .ttl.bz2 wikidata
+    ../run MapObjectUris $DATA_DIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata
+}
+
+function type-consistency-check(){
+    cd $EXTRACT_DIR/scripts;
+    ../run TypeConsistencyCheck type.consistency.check.properties;
+}
+
+function sync-with-www(){
+    rsync -avz $DATA_DIR/wikidatawiki/ $WWW_DIR/;
+
+    #We don't need index.html
+    find $WWW_DIR/ | grep index.html | xargs rm -rf;
+}
+
+function databus-preparation(){
+    cd $DATA_DIR;
+    bash ~/databusPrep.sh $WWW_DIR/ src/main/databus;
+}
+
+function delete-old-extractions(){
+    #Delete extraction directories older than 20 days, i.e. keep 1-2 results in www.
+    find $WWW_DIR/ -type d -ctime +20 | xargs rm -rf;
+
+    #Remove everything in the dump dir; do we need to keep it?
+    rm -rf $DATA_DIR/wikidatawiki/*;
+}
+
+function remove-date-from-files(){
+    #Go to the most recently changed directory
+    cd "$(\ls -1dt $WWW_DIR/*/ | head -n 1)";
+
+    #Remove the date (digits) from file names
+    for i in *; do mv "$i" "`echo $i| sed 's/[0-9]..//g'`"; done;
+}
+
+function main() {
+    #delete-old-extractions; #to free space for the new extraction
+
+#    touch download.process;
+
+    download-ontology;
+    download-r2r-mapping;
+    download-xml-dump;
+    recompile;
+    all-extractors;
+
+    post-processing;
+    type-consistency-check;
+
+    cd /data/extraction/wikidumps;
+    ./prep.sh;
+
+    cd /data/extraction/databus-maven-plugin/dbpedia/wikidata;
+    mvn package;
+    mvn databus:deploy;
+
+#----
+# below is not configured yet
+#----
+
+    ##The result of subclassof-script is used in the next extraction.
+    #subclassof-script;
+    #databus-preparation;
+    #Sync the extraction results to www
+    #sync-with-www
+    #remove-date-from-files
+
+#This was the previous extraction process. We no longer need to run the raw extractor separately.
+#    raw-extractor;
+#    subclassof-script;
+#    all-other-extractors;
+#    post-processing;
+}
+
+main
+
-- 
GitLab
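
Review note: the crontab entry discards all output (>/dev/null 2>&1), so when a run fails only the per-step logs that the script itself writes under $MVN_LOGS remain. Below is a minimal sketch for invoking the script by hand with a complete log and a lock against overlapping runs; the log directory and the lock-file path are illustrative assumptions, not taken from the patch.

    #!/bin/bash
    # Sketch only: manual invocation of the release script with a kept log.
    # LOG_DIR and the lock file are assumed paths, not defined by the patch.
    set -euo pipefail

    LOG_DIR=/data/extraction/logs/manual
    mkdir -p "$LOG_DIR"
    STAMP=$(date +%Y-%m-%d)

    # flock -n exits immediately if a previous release is still running.
    flock -n /tmp/wikidata-release.lock \
        /bin/bash /home/extractor/schedule/wikidata-release.sh \
        > "$LOG_DIR/$STAMP-wikidata-release.out" \
        2> "$LOG_DIR/$STAMP-wikidata-release.err"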
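
Review note: in remove-date-from-files, sed 's/[0-9]..//g' deletes every digit plus the two characters that follow it, so any file name that merely contains digits gets mangled as well. If the intent is only to strip a YYYY-MM-DD or YYYYMMDD stamp from the newest www directory, a narrower substitution along these lines may be safer; the file-naming scheme is an assumption here, since the function is still commented out in main.

    # Sketch only: strip an ISO-style date from file names in the newest www directory.
    # Assumes names embed the date as YYYY-MM-DD or YYYYMMDD, e.g. foo-2019-07-01.ttl.bz2.
    cd "$(ls -1dt "$WWW_DIR"/*/ | head -n 1)"
    for f in *; do
        new=$(printf '%s' "$f" | sed -E 's/-?[0-9]{4}-?[0-9]{2}-?[0-9]{2}//g')
        if [ -n "$new" ] && [ "$f" != "$new" ]; then
            mv -n -- "$f" "$new"    # -n: never overwrite an existing file
        fi
    done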