diff --git a/README.md b/README.md
index b028265f2ca7a623c90e7b0ed54ed3238fb90ae7..272b59d6c7fadca48b03155fda01d85c3d8fea20 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,67 @@
 # MARVIN-config
 
-Configuration files for MARVIN on the TIB servers, public
\ No newline at end of file
+Configuration files for MARVIN on the TIB servers, public so that the architecture can be forked
+
+# Acknowledgements
+We thank Sören Auer and the Technische Informationsbibliothek (TIB) for providing three servers to run:
+
+* the main DBpedia extraction on a monthly basis
+* community-provided extractors on Wikipedia, Wikidata or other sources
+* enrichment, cleaning and parsing services, so-called [Databus mods](https://github.com/dbpedia/databus-mods/) for open data on the Databus
+
+This contribution by TIB is a great push towards incentivizing Open Data and establishing a global and national research and innovation data infrastructure.
+
+# Workflow
+
+## Downloading the Wikimedia dumps
+TODO
+
+## Running the extraction
+TODO
+
+## Deploy on Databus
+TODO
+
+## Run Databus-Derive (clone and parse)
+On the respective server there is a user `marvin-fetch` that has access to `/data/derive`, which contains the pom.xml of https://github.com/dbpedia/databus-maven-plugin/tree/master/dbpedia
+
+```
+# query to get all versions for derive in XML syntax, to paste directly into pom.xml
+PREFIX dataid: <http://dataid.dbpedia.org/ns/core#>
+PREFIX dct: <http://purl.org/dc/terms/>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+SELECT DISTINCT ?derive WHERE {
+
+    ?dataset dataid:group <https://databus.dbpedia.org/marvin/generic> .
+    ?dataset dataid:artifact ?artifact .
+    ?dataset dataid:version ?version .
+    ?dataset dct:hasVersion "2019.08.30"^^xsd:string
+    BIND (CONCAT("<version>",?artifact,"/${databus.deriveversion}</version>") as ?derive)
+}
+order by asc(?derive)
+
+
+```
+```
+#######
+# This is still manual, will be a cronjob soon
+#######
+su marvin-fetch
+tmux a -t derive
+
+WHAT=mappings
+NEWVERSION=2019.08.30
+# prepare
+cd /data/derive/databus-maven-plugin/dbpedia/$WHAT
+git pull
+mvn versions:set -DnewVersion=$NEWVERSION
+# run
+mvn -T 23 databus-derive:clone -Ddatabus.deriveversion=$NEWVERSION
+```
+
+## Move data to download server (internal)
+Run the marvin-fetch.sh script in the databus/dbpedia folder.
+
+## Deploy official files
+
+
diff --git a/generic/README.md b/generic/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..efe5f5baa6a7268b96d1fcaa24ac6bd2aaae3095
--- /dev/null
+++ b/generic/README.md
@@ -0,0 +1,8 @@
+# Generic Extraction Config
+
+
+### Cronjob
+
+```
+0 0 7 * * /bin/bash -c '/home/extractor/marvin-config/generic/generic-release.sh' >/dev/null 2>&1
+```
\ No newline at end of file
diff --git a/generic/collectExtraction.sh b/generic/collectExtraction.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4061596746f073a78f3830bd440070066310f2d0
--- /dev/null
+++ b/generic/collectExtraction.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+set -e
+
+# [CONFIG]
+
+#extracted dumps (basedir)
+BASEDIR="/data/extraction/wikidumps/"
+
+#databus-maven-plugin project, containing release pom
+DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
+
+#explicit databus version or empty for all
+DUMPDATE=
+
+#if true show dummy output
+TRYRUN=false
+
+# [DATASET-MAPPING]
+
+mapLang() {
+
+    lang=$(echo "$1" | sed 's|wiki||g')
+
+    case "$lang" in
+
+        "bat_smg") echo "_lang=batsmg";;
+        "zh_min_nan") echo "_lang=nan";;
+        "zh_yue") echo "_lang=yue";;
+
+        "wikidata") echo "";;
+
+        *) echo "_lang=$lang";;
+    esac
+}
+
+mapExtraction() {
+
+    case "$1" in
+
"article-templates-nested") echo "article-templates_nested";; + + "citation-data") echo "citations_data";; + "citation-links") echo "citations_links";; + + "commons-page-links") echo "commons-sameas-links";; + + "page-ids") echo "page_ids";; + "page-length") echo "page_length";; + "page-links") echo "wikilinks";; + + "article-categories") echo "categories_articles";; + "category-labels") echo "categories_labels";; + "skos-categories") echo "categories_skos";; + + "revision-ids") echo "revisions_ids";; + "revision-uris") echo "revisions_uris";; + + *) echo "$1";; + esac +} + +# [FUNCTIONS] + +collectExtractionFun() { + + #how to use ${string##/*} + #https://www.tldp.org/LDP/abs/html/string-manipulation.html#Substring%20Removal#Substring Removal + + for path in $(find "$BASEDIR" -name "*.ttl.bz2"); do + + file="${path##*/}" + + version="${file#*-}" + version="${version%%-*}" + version="${version:0:4}.${version:4:2}.${version:6:2}" + + if [ "$DUMPDATE" = "$version" ] || [ -z "$DUMPDATE" ] ; then + + lang="${file%%-*}" + + extraction="${file#*-*-}" + extraction="${extraction%%.*}" + + extension="${file#*.}" + + mapped="$(mapExtraction $extraction)" + + artifact="${mapped%%_*}" + + contVars="$(mapLang $lang)" + if [[ "$mapped" == *"_"* ]]; then + contVars="${contVars}_${mapped#*_}" + fi + + targetArVe="$artifact/$version" + targetFile="$artifact$contVars.$extension" + + if [ -d "$DATABUSMVNPOMDIR/$artifact" ]; then + + if [ ! -d "$DATABUSMVNPOMDIR/$targetArVe" ]; then + + mkdir -p "$DATABUSMVNPOMDIR/$targetArVe" + fi + + if $TRYRUN; then + echo "$path -> $DATABUSMVNPOMDIR/$targetArVe/$targetFile" + else + cp -vn "$path" "$DATABUSMVNPOMDIR/$targetArVe/$targetFile" + fi + else + + >&2 echo "unmapped/notexist artifact: $artifact | $mapped | $extraction" + fi + fi + done +} + +renameRedirected() { + cd $DATABUSMVNPOMDIR; +# for f in $(find . -name "*_redirected*" ); do rename -n 's/_redirected\.ttl\.bz2$/\.ttl\.bz2$/' $f; done + for f in $(find . -name "*_redirected*" ); do rename -n 's/_redirected//' $f; done +} + +# [Main] + +main() { + collectExtractionFun; +} + +main; diff --git a/generic/crontab.bak b/generic/crontab.bak deleted file mode 100644 index 572767eb7b086c773f437e92292a825fa4770546..0000000000000000000000000000000000000000 --- a/generic/crontab.bak +++ /dev/null @@ -1,24 +0,0 @@ -# Edit this file to introduce tasks to be run by cron. -# -# Each task to run has to be defined through a single line -# indicating with different fields when the task will be run -# and what command to run for the task -# -# To define the time you can provide concrete values for -# minute (m), hour (h), day of month (dom), month (mon), -# and day of week (dow) or use '*' in these fields (for 'any').# -# Notice that tasks will be started based on the cron's system -# daemon's notion of time and timezones. -# -# Output of the crontab jobs (including errors) is sent through -# email to the user the crontab file belongs to (unless redirected). 
-# -# For example, you can run a backup of all your user accounts -# at 5 a.m every week with: -# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/ -# -# For more information see the manual pages of crontab(5) and cron(8) -# -# m h dom mon dow command -#0 0 7 * * /bin/bash -c '/home/extractor/schedule/generic-release.sh' >/dev/null 2>&1 - diff --git a/generic/download.generic.properties b/generic/download.generic.properties new file mode 100644 index 0000000000000000000000000000000000000000..8d0fb170951e54f5a666de65c389e09d9d4a4703 --- /dev/null +++ b/generic/download.generic.properties @@ -0,0 +1,21 @@ +# Default download server. It lists mirrors which may be faster. +# base-url=https://dumps.wikimedia.org/ +# base-url=https://ftp.acc.umu.se/mirror/wikimedia.org/dumps/ +base-url=http://dumps.wikimedia.your.org/ + +# the source file name +# should be the same as in universal.properties +# source=pages-articles.xml.bz2 + +# languages to download +languages=en,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue + +# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space. +unzip=false + +# Sometimes connecting to the server fails, so we try five times with pauses of 10 seconds. +retry-max=5 +retry-millis=10000 + +#for specific dump dates (e.g. 20170101) if empty: the most recent dump-date is used +dump-date= diff --git a/generic/generic-release.sh b/generic/generic-release.sh new file mode 100644 index 0000000000000000000000000000000000000000..dbaf7098956f6e4882444877bf445132f2f062e0 --- /dev/null +++ b/generic/generic-release.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +set -e + +SCRIPTROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"; + +# [CONFIG] + +#extraction-framework +EXTRACTIONFRAMEWORKDIR="/home/extractor/extraction-framework"; + +#extracted dumps (basedir) +BASEDIR="/data/extraction/wikidumps"; + +#databus-maven-plugin project, containing release pom +#https://github.com/dbpedia/databus-maven-plugin/blob/master/dbpedia/generic/pom.xml +DATABUSMAVENPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"; + +#override release pom.xml properties +RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this"; +RELEASEPACKAGEDIR="/data/extraction/release"; +RELEASEDOWNLOADURL="http://dbpedia-generic.tib.eu/release"; +RELEASELABELPREFIX="(pre-release)" +RELEASECOMMENTPREFIX="(MARVIN is the DBpedia bot, that runs the DBpedia Information Extraction Framework (DIEF) and releases the data as is, i.e. unparsed, unsorted, not redirected for debugging the software. 
After its releases, data is cleaned and persisted under the dbpedia account.)" + +#logging directory +LOGS="/data/extraction/logs/$(date +%Y-%m-%d)"; +mkdir -p $LOGS; + +# [FUNCTIONS] + +execWithLogging() { + #arg(0) = $1 := "function name" + $1 > "$LOGS/$1.out" 2> "$LOGS/$1.err"; +} + +downloadOntology() { + cd $EXTRACTIONFRAMEWORKDIR/core; + ../run download-ontology; +} + +downloadMappings() { + cd $EXTRACTIONFRAMEWORKDIR/core; + ../run download-mappings; +} + +downloadDumps() { + cd $EXTRACTIONFRAMEWORKDIR/dump; + ../run download $SCRIPTROOT/download.generic.properties; +} + +buildExtractionFramework() { + cd $EXTRACTIONFRAMEWORKDIR; + mvn clean install; +} + +runExtraction() { + cd $EXTRACTIONFRAMEWORKDIR/dump; + ../run sparkextraction $SCRIPTROOT/sparkextraction.generic.properties; + ../run sparkextraction $SCRIPTROOT/sparkextraction.generic.en.properties; +} + +resolveTransitiveLinks() { + cd $EXTRACTIONFRAMEWORKDIR/scripts; + ../run ResolveTransitiveLinks $BASEDIR redirects redirects_transitive .ttl.bz2 @downloaded; +} + +mapObjectUris() { + cd $EXTRACTIONFRAMEWORKDIR/scripts; + ../run MapObjectUris $BASEDIR redirects_transitive .ttl.bz2 disambiguations,infobox-properties,page-links,persondata,topical-concepts _redirected .ttl.bz2 @downloaded; +} + +postProcessing() { + echo "$(date) | extraction-framework| resole transitive links" >&2; + execWithLogging resolveTransitiveLinks; + echo "$(date) | extraction-framework| map object uris" >&2; + execWithLogging mapObjectUris; +} + +prepareRelease() { + #own config + cd $SCRIPTROOT; + bash collectExtraction.sh; +} + +setNewVersion() { + cd $DATABUSMAVENPOMDIR; + mvn versions:set -DnewVersion=$(ls * | grep '^[0-9]\{4\}.[0-9]\{2\}.[0-9]\{2\}$' | sort -u | tail -1); +} + +deployRelease() { + cd $DATABUSMAVENPOMDIR; + mvn deploy \ + -Ddatabus.publisher="$RELEASEPUBLISHER" \ + -Ddatabus.packageDirectory="$RELEASEPACKAGEDIR/\${project.groupId}/\${project.artifactId}" \ + -Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}" \ + -Ddatabus.labelPrefix="$RELEASELABELPREFIX" \ + -Ddatabus.commentPrefix="$RELEASECOMMENTPREFIX"; +} + +compressLogs() { + for f in $(find $LOGS -type f ); do lbzip2 $f; done; +} + +# [MAIN] + +main() { + + echo "--------------------" >&2; + echo " Generic Extraction " >&2; + echo "--------------------" >&2; + + #download + echo "$(date) | extraction-framework | start download ontology" >&2; + execWithLogging downloadOntology; + echo "$(date) | extraction-framework | start download mappings" >&2; + execWithLogging downloadMappings; + echo "$(date) | extraction-framework | start download dumps" >&2; + execWithLogging downloadDumps; + + #extraction + echo "$(date) | extraction-framework | mvn clean install" >&2; + execWithLogging buildExtractionFramework; + echo "$(date) | extraction-framework | start extraction" >&2; + execWithLogging runExtraction; + echo "$(date) | extraction-framework | post processing" >&2; + postProcessing; + + #release + echo "$(date) | databus-maven-plugin | collect extracted datasets" >&2; + execWithLogging prepareRelease; + echo "$(date) | databus-maven-plugin | mvn versions:set" >&2; + execWithLogging setNewVersion; + echo "$(date) | databus-maven-plugin | mvn deploy" >&2; + execWithLogging deployRelease; + + #cleanup + echo "$(date) | main | compress log files" >&2; + compressLogs; +} + +if [ ! -f "$SCRIPTROOT/generic-release.pid" ]; then + (execWithLogging main; rm "$SCRIPTROOT/generic-release.pid") & echo $! 
> "$SCRIPTROOT/generic-release.pid" +fi diff --git a/generic/schedule/generic-release.sh b/generic/schedule/generic-release.sh deleted file mode 100644 index 917da69ba1343293c14f5b466865a2a438a1d23e..0000000000000000000000000000000000000000 --- a/generic/schedule/generic-release.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -EXTRACTIONFRAMEWORK="/home/extractor/extraction-framework" - -DATABUSMAVENPLUGIN="/data/extraction/databus-maven-plugin/dbpedia/generic" - -LOGS= - -downloadOntology() { - echo " (date) | extraction-framework | start download ontology" >&2; - cd $EXTRACTIONFRAMEWORK/core; - ../run download-ontology; -} - -downloadMappings() { - echo "$(date) | extraction-framework | start download mappings" >&2; - cd $EXTRACTIONFRAMEWORK/core; - ../run download-mappings; -} - -downloadDumps() { - echo "$(date) | extraction-framework | start download dumps" >&2; - cd $EXTRACTIONFRAMEWORK/dump; - ../run download download.spark.properties; -} - -buildExtractionFramework() { - echo "$(date) | extraction-framework | mvn clean install" >&2; - cd $EXTRACTIONFRAMEWORK; - mvn clean install; -} - -runExtraction() { - echo "$(date) | extraction-framework | start extraction" >&2; - cd $EXTRACTIONFRAMEWORK/dump; - ../run sparkextraction extraction.spark.properties; -} - -prepareRelease() { - echo "$(date) | databus-maven-plugin | collect extracted datasetes" >&2; - cd $DATABUSMAVENPLUGIN; - ./collectExtraction.sh; -} - -deployRelease() { - echo "$(date) | databus-maven-plugin | mvn package" >&2; - cd $DATABUSMAVENPLUGIN; - mvn package; - echo "$(date) | databus-maven-plugin | mvn databus:deploy" >&2; - mvn databus:deploy; -} - -main() { - - echo "--------------------" - echo " Generic Extraction " - echo "--------------------" - - downloadOntology; - downloadMappings; - downloadDumps; - - buildExtractionFramework; - runExtraction; - - prepareRelease; - deployRelease; -} - diff --git a/generic/sparkextraction.generic.en.properties b/generic/sparkextraction.generic.en.properties new file mode 100644 index 0000000000000000000000000000000000000000..e3b7cac4fdbb501ce50f6e20fc97eafadfff6f4a --- /dev/null +++ b/generic/sparkextraction.generic.en.properties @@ -0,0 +1,31 @@ +# download and extraction target dir +#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties + +# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. +# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd- +# where xx is the wiki code and yyyymmdd is the dump date. + +# default: +#source=# moved to $extraction-framework/core/src/main/resources/universal.properties + +spark-master=local[32] + +# use only directories that contain a 'download-complete' file? Default is false. +require-download-complete=false + +# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings' +# excluded en, seemed too big for local[32] +#languages=af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue +languages=en + +# extractor class names starting with "." 
are prefixed by "org.dbpedia.extraction.mappings" + +extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\ +.ExternalLinksExtractor,.GeoExtractor,.InfoboxExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\ +.PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\ +.WikiPageLengthExtractor,.WikiPageOutDegreeExtractor + +extractors.en=.CitationExtractor,.DisambiguationExtractor,.HomepageExtractor,.PersondataExtractor,.PndExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor + +# If we need to Exclude Non-Free Images in this Extraction, set this to true +copyrightCheck=false diff --git a/generic/sparkextraction.generic.properties b/generic/sparkextraction.generic.properties new file mode 100644 index 0000000000000000000000000000000000000000..1455bff9e7bbcce67e2d12ad612c28d1cd60cc59 --- /dev/null +++ b/generic/sparkextraction.generic.properties @@ -0,0 +1,121 @@ +# download and extraction target dir +#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties + +# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. +# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd- +# where xx is the wiki code and yyyymmdd is the dump date. + +# default: +#source=# moved to $extraction-framework/core/src/main/resources/universal.properties + +spark-master=local[32] + +# use only directories that contain a 'download-complete' file? Default is false. +require-download-complete=false + +# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings' +# excluded en, seemed too big for local[32] +languages=af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue + +# extractor class names starting with "." 
are prefixed by "org.dbpedia.extraction.mappings" + +extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\ +.ExternalLinksExtractor,.GeoExtractor,.InfoboxExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\ +.PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\ +.WikiPageLengthExtractor,.WikiPageOutDegreeExtractor + +extractors.ar=.TopicalConceptsExtractor + +extractors.be= + +extractors.bg= + +extractors.bn= + +extractors.ca=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor + +extractors.ced= + +extractors.commons=.ContributorExtractor,.TemplateParameterExtractor,.FileTypeExtractor,.GalleryExtractor,.ImageAnnotationExtractor,.CommonsKMLExtractor,.DBpediaResourceExtractor + +extractors.cs= + +extractors.cy= + +extractors.da= + +extractors.de=.DisambiguationExtractor,.HomepageExtractor,.PersondataExtractor,.PndExtractor,.CommonsResourceExtractor + +extractors.el=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor + +extractors.en=.CitationExtractor,.DisambiguationExtractor,.HomepageExtractor,.PersondataExtractor,.PndExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor + +extractors.eo= + +extractors.es=,.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.CommonsResourceExtractor + +extractors.et= + +extractors.eu=,.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor + +extractors.fa= + +extractors.fi= + +extractors.fr=.DisambiguationExtractor,.HomepageExtractor,.PndExtractor,.TopicalConceptsExtractor,.fr.PopulationExtractor,.CommonsResourceExtractor + +extractors.ga=.HomepageExtractor + +extractors.gl= + +extractors.hi= + +extractors.hr= + +extractors.hu= + +extractors.id= + +extractors.it=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor + +extractors.ja=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.CommonsResourceExtractor + +extractors.ko=.DisambiguationExtractor + +extractors.lt= + +extractors.lv= + +extractors.nl=.DisambiguationExtractor,.CommonsResourceExtractor + +extractors.mk= + +extractors.mt= + +extractors.pl=.DisambiguationExtractor,.HomepageExtractor + +extractors.pt=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.CommonsResourceExtractor + +extractors.ru=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor + +extractors.sk= + +extractors.sl= + +extractors.sr= + +extractors.tr= + +extractors.ur= + +extractors.vi= + +extractors.war= + +#only the raw extractor here: all other wikidata extractors are executed in an separate extraction for wikidata (see: extraction.wikidata.properties) +#extractors.wikidata=.WikidataSameAsExtractor,.WikidataRawExtractor + +extractors.zh= + +# If we need to Exclude Non-Free Images in this Extraction, set this to true +copyrightCheck=false diff --git a/mappings/README.md b/mappings/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5a2f0cc14b6b017fe7313a304976cb96fd9435d1 --- /dev/null +++ b/mappings/README.md @@ -0,0 +1,8 @@ +# Mappingbased Extraction Config + + +### Cronjob + +``` +0 0 7 * * /bin/bash -c '/home/extractor/marvin-config/mappings/mappings-release.sh' >/dev/null 2>&1 +``` \ No newline at end of file diff --git a/mappings/collectExtraction.sh b/mappings/collectExtraction.sh new file mode 100644 index 0000000000000000000000000000000000000000..4baa1190fa9217dc5e790e7cb29eacd04972fd11 --- 
/dev/null +++ b/mappings/collectExtraction.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +set -e + +# [CONFIG] + +#extracted dumps (basedir) +BASEDIR="/data/extraction/wikidumps/" + +#databus-maven-plugin project, containing release pom +DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic" + +#explicit databus version or empty for all +DUMPDATE= + +#if true show dumy output +TRYRUN=false + +# [DATASET-MAPPING] + +mapLang() { + + lang=$(echo "$1" | sed 's|wiki||g') + + case "$lang" in + + "bat_smg") echo "_lang=batsmg";; + "zh_min_nan") echo "_lang=nan";; + "zh_yue") echo "_lang=yue";; + + "wikidata") echo "";; + + "commons" ) echo "_commons";; + + *) echo "_lang=$lang";; + esac +} + +mapExtraction() { + + case "$1" in + + "instance-types-transitive") echo "instance-types_transitive";; + "mappingbased-objects-disjoint-domain") echo "mappingbased-objects_disjointDomain";; + "mappingbased-objects-disjoint-range") echo "mappingbased-objects_disjointRange";; + + *) echo "$1";; + esac +} + +# [FUNCTIONS] + +copyToMavenPlugin() { + + # https://www.tldp.org/LDP/abs/html/string-manipulation.html#Substring%20Removal#Substring Removal + # ${string##/*} + + for path in $(find "$BASEDIR" -name "*.ttl.bz2"); do + + file="${path##*/}" + + version="${file#*-}" + version="${version%%-*}" + version="${version:0:4}.${version:4:2}.${version:6:2}" + + if [ "$DUMPDATE" = "$version" ] || [ -z "$DUMPDATE" ] ; then + + lang="${file%%-*}" + + extraction="${file#*-*-}" + extraction="${extraction%%.*}" + + extension="${file#*.}" + + mapped="$(mapExtraction $extraction)" + + artifact="${mapped%%_*}" + + contVars="$(mapLang $lang)" + if [[ "$mapped" == *"_"* ]]; then + contVars="${contVars}_${mapped#*_}" + fi + + targetArVe="$artifact/$version" + targetFile="$artifact$contVars.$extension" + + if [ -d "$DATABUSMVNPOMDIR/$artifact" ]; then + + if [ ! -d "$DATABUSMVNPOMDIR/$targetArVe" ]; then + + mkdir -p "$DATABUSMVNPOMDIR/$targetArVe" + fi + + if $TRYRUN; then + echo "$path -> $DATABUSMVNPOMDIR/$targetArVe/$targetFile" + else + cp -vn "$path" "$DATABUSMVNPOMDIR/$targetArVe/$targetFile" + fi + else + + >&2 echo "unmapped/notexist artifact: $artifact | $mapped | $extraction" + fi + fi + done +} + + +# [MAIN] + +main() { + copyToMavenPlugin +} + +main diff --git a/mappings/crontab.bak b/mappings/crontab.bak deleted file mode 100644 index 26e508ba38102f864ab614a30607df52c9bff797..0000000000000000000000000000000000000000 --- a/mappings/crontab.bak +++ /dev/null @@ -1,24 +0,0 @@ -# Edit this file to introduce tasks to be run by cron. -# -# Each task to run has to be defined through a single line -# indicating with different fields when the task will be run -# and what command to run for the task -# -# To define the time you can provide concrete values for -# minute (m), hour (h), day of month (dom), month (mon), -# and day of week (dow) or use '*' in these fields (for 'any').# -# Notice that tasks will be started based on the cron's system -# daemon's notion of time and timezones. -# -# Output of the crontab jobs (including errors) is sent through -# email to the user the crontab file belongs to (unless redirected). 
-# -# For example, you can run a backup of all your user accounts -# at 5 a.m every week with: -# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/ -# -# For more information see the manual pages of crontab(5) and cron(8) -# -# m h dom mon dow command -0 0 7 * * /bin/bash -c '/home/extractor/schedule/run-extraction.sh' >/dev/null 2>&1 - diff --git a/mappings/download.mappings.properties b/mappings/download.mappings.properties new file mode 100644 index 0000000000000000000000000000000000000000..b67a1aaeb990b04d04aad406854c944857f51f44 --- /dev/null +++ b/mappings/download.mappings.properties @@ -0,0 +1,19 @@ +# Default download server. It lists mirrors which may be faster. +base-url=https://dumps.wikimedia.org/ + +# the source file name +# should be the same as in universal.properties +# source=pages-articles.xml.bz2 + +# languages to download +languages=@mappings + +# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space. +unzip=false + +# Sometimes connecting to the server fails, so we try five times with pauses of 10 seconds. +retry-max=5 +retry-millis=10000 + +#for specific dump dates (e.g. 20170101) if empty: the most recent dump-date is used +dump-date= \ No newline at end of file diff --git a/mappings/extraction.mappings.properties b/mappings/extraction.mappings.properties new file mode 100644 index 0000000000000000000000000000000000000000..5983b57fe2b7857a1fe9df7db97ab466feb23a80 --- /dev/null +++ b/mappings/extraction.mappings.properties @@ -0,0 +1,115 @@ +# download and extraction target dir +#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties + +# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. +# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd- +# where xx is the wiki code and yyyymmdd is the dump date. + +# default: +#source=# moved to $extraction-framework/core/src/main/resources/universal.properties + +# use only directories that contain a 'download-complete' file? Default is false. +require-download-complete=false + +# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings' +languages=@mappings + +# extractor class names starting with "." 
are prefixed by "org.dbpedia.extraction.mappings" + +extractors=.MappingExtractor + +#extractors.ar=.MappingExtractor,.TopicalConceptsExtractor +# +#extractors.be=.MappingExtractor +# +#extractors.bg=.MappingExtractor +# +#extractors.bn=.MappingExtractor +# +#extractors.ca=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor +# +#extractors.ced=.MappingExtractor +# +#extractors.commons=.MappingExtractor,.ContributorExtractor,.TemplateParameterExtractor,.FileTypeExtractor,.GalleryExtractor,.ImageAnnotationExtractor,.CommonsKMLExtractor,.DBpediaResourceExtractor +# +#extractors.cs=.MappingExtractor +# +#extractors.cy=.MappingExtractor +# +#extractors.da=.MappingExtractor +# +#extractors.de=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.PersondataExtractor,.PndExtractor,.CommonsResourceExtractor +# +#extractors.el=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor +# +#extractors.en=.MappingExtractor,.CitationExtractor,.DisambiguationExtractor,.GenderExtractor,.HomepageExtractor,.ImageExtractorNew,.PersondataExtractor,.PndExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor +# +#extractors.eo=.MappingExtractor +# +#extractors.es=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor,.CommonsResourceExtractor +# +#extractors.et=.MappingExtractor +# +#extractors.eu=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor +# +#extractors.fa=.MappingExtractor +# +#extractors.fi=.MappingExtractor +# +#extractors.fr=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.PndExtractor,.TopicalConceptsExtractor,.fr.PopulationExtractor,.CommonsResourceExtractor +# +#extractors.ga=.MappingExtractor,.HomepageExtractor +# +#extractors.gl=.MappingExtractor +# +#extractors.hi=.MappingExtractor +# +#extractors.hr=.MappingExtractor +# +#extractors.hu=.MappingExtractor +# +#extractors.id=.MappingExtractor +# +#extractors.it=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor +# +#extractors.ja=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor,.CommonsResourceExtractor +# +#extractors.ko=.MappingExtractor,.DisambiguationExtractor +# +#extractors.lt=.MappingExtractor +# +#extractors.lv=.MappingExtractor +# +#extractors.nl=.MappingExtractor,.DisambiguationExtractor,.ImageExtractorNew,.CommonsResourceExtractor +# +#extractors.mk=.MappingExtractor +# +#extractors.mt=.MappingExtractor +# +#extractors.pl=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew +# +#extractors.pt=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor,.CommonsResourceExtractor +# +#extractors.ru=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor +# +#extractors.sk=.MappingExtractor +# +#extractors.sl=.MappingExtractor +# +#extractors.sr=.MappingExtractor +# +#extractors.tr=.MappingExtractor +# +#extractors.ur=.MappingExtractor +# +#extractors.vi=.MappingExtractor +# +#extractors.war=.MappingExtractor + +#only the raw extractor here: all other wikidata extractors are executed in an separate extraction for wikidata (see: extraction.wikidata.properties) 
+#extractors.wikidata=.WikidataSameAsExtractor,.WikidataRawExtractor + +#extractors.zh=.MappingExtractor + +# If we need to Exclude Non-Free Images in this Extraction, set this to true +copyrightCheck=false diff --git a/mappings/mappings-release.sh b/mappings/mappings-release.sh new file mode 100755 index 0000000000000000000000000000000000000000..c4bd76655621dbcb9ecbbedf65bb22117cd5fbe1 --- /dev/null +++ b/mappings/mappings-release.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +set -e + +SCRIPTROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# [CONFIG] + +#extraction-framework +EXTRACTIONFRAMEWORKDIR="/home/extractor/extraction-framework"; + +#extracted dumps (basedir) +BASEDIR="/data/extraction/wikidumps"; + +#databus-maven-plugin project, containing release pom +#https://github.com/dbpedia/databus-maven-plugin/blob/master/dbpedia/mappings/pom.xml +DATABUSMAVENPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/mappings"; + +#override release pom.xml properties +RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this"; +RELEASEPACKAGEDIR="/data/extraction/release"; +RELEASEDOWNLOADURL="http://dbpedia-mappings.tib.eu/release"; +RELEASELABELPREFIX="(pre-release)" +RELEASECOMMENTPREFIX="(MARVIN is the DBpedia bot, that runs the DBpedia Information Extraction Framework (DIEF) and releases the data as is, i.e. unparsed, unsorted, not redirected for debugging the software. After its releases, data is cleaned and persisted under the dbpedia account.)" + +#logging directory +LOGS="/data/extraction/logs/$(date +%Y-%m-%d)"; +mkdir -p $LOGS; + +# [FUNCTIONS] + +execWithLogging() { + #arg(0) = $1 := "function name" + $1 > "$LOGS/$1.out" 2> "$LOGS/$1.err"; +} + +downloadOntology() { + cd $EXTRACTIONFRAMEWORKDIR/core; + ../run download-ontology; +} + +downloadMappings() { + cd $EXTRACTIONFRAMEWORKDIR/core; + ../run download-mappings; +} + +downloadDumps() { + cd $EXTRACTIONFRAMEWORKDIR/dump; + ../run download $SCRIPTROOT/download.mappings.properties; +} + +buildExtractionFramework() { + cd $EXTRACTIONFRAMEWORKDIR; + mvn clean install; +} + +runExtraction() { + cd $EXTRACTIONFRAMEWORKDIR/dump; + ../run extraction $SCRIPTROOT/extraction.mappings.properties; +} + +resolveTransitiveLinks() { + cd $EXTRACTIONFRAMEWORKDIR/scripts; + ../run ResolveTransitiveLinks $BASEDIR redirects redirects_transitive .ttl.bz2 @downloaded; +} + +mapObjectUris() { + cd $EXTRACTIONFRAMEWORKDIR/scripts; + ../run MapObjectUris $BASEDIR redirects_transitive .ttl.bz2 disambiguations,infobox-properties,page-links,persondata,topical-concepts _redirected .ttl.bz2 @downloaded; +} + +postProcessing() { + echo "$(date) | extraction-framework| resole transitive links" >&2; + execWithLogging resolveTransitiveLinks; + echo "$(date) | extraction-framework| map object uris" >&2; + execWithLogging mapObjectUris; +} + +prepareRelease() { + #own config + cd $SCRIPTROOT; + bash collectExtraction.sh; +} + +setNewVersion() { + cd $DATABUSMAVENPOMDIR; + mvn versions:set -DnewVersion=$(ls * | grep '^[0-9]\{4\}.[0-9]\{2\}.[0-9]\{2\}$' | sort -u | tail -1); +} + +deployRelease() { + cd $DATABUSMAVENPOMDIR; + mvn deploy \ + -Ddatabus.publisher="$RELEASEPUBLISHER" \ + -Ddatabus.packageDirectory="$RELEASEPACKAGEDIR/\${project.groupId}/\${project.artifactId}" \ + -Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}" \ + -Ddatabus.labelPrefix="$RELEASELABELPREFIX" \ + -Ddatabus.commentPrefix="$RELEASECOMMENTPREFIX"; +} + +compressLogs() { + for f in $(find $LOGS -type f ); do 
lbzip2 $f; done; +} + +# [MAIN] + +main() { + + echo "-------------------------" >&2; + echo " Mappings-based Extraction " >&2; + echo "-------------------------" >&2; + + #download + echo "$(date) | extraction-framework | start download ontology" >&2; + execWithLogging downloadOntology; + echo "$(date) | extraction-framework | start download mappings" >&2; + execWithLogging downloadMappings; + echo "$(date) | extraction-framework | start download dumps" >&2; + execWithLogging downloadDumps; + + #extraction + echo "$(date) | extraction-framework | mvn clean install" >&2; + execWithLogging buildExtractionFramework; + echo "$(date) | extraction-framework | start extraction" >&2; + execWithLogging runExtraction; + # echo "$(date) | extraction-framework | post processing" >&2; + # postProcessing; + + #release + echo "$(date) | databus-maven-plugin | collect extracted datasets" >&2; + execWithLogging prepareRelease; + echo "$(date) | databus-maven-plugin | mvn versions:set" >&2; + execWithLogging setNewVersion; + echo "$(date) | databus-maven-plugin | mvn deploy" >&2; + execWithLogging deployRelease; + + #cleanup + echo "$(date) | main | compress log files" >&2; + compressLogs; +} + +if [ ! -f "$SCRIPTROOT/generic-release.pid" ]; then + (execWithLogging main; rm "$SCRIPTROOT/generic-release.pid") & echo $! > "$SCRIPTROOT/generic-release.pid" +fi diff --git a/mappings/schedule/extraction-cronjob.sh b/mappings/schedule/extraction-cronjob.sh deleted file mode 100644 index 1ca7b0dd9edf44f63a59c014b85ba301f275d41c..0000000000000000000000000000000000000000 --- a/mappings/schedule/extraction-cronjob.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -/home/extractor/schedule/run-extraction.sh diff --git a/mappings/schedule/extractionToPlugin.sh b/mappings/schedule/extractionToPlugin.sh index b8684bc269fb39f06adf65eca763b54296d00150..abfeab8518fdad2be0ae2e29c8b9257fdc9222c9 100755 --- a/mappings/schedule/extractionToPlugin.sh +++ b/mappings/schedule/extractionToPlugin.sh @@ -5,7 +5,6 @@ databus_folder="/data/extraction/databus-maven-plugin/dbpedia/" #databus_maven_plugin_structure="src/main/databus" databus_maven_plugin_structure="" - # @ALL|@GENERIC|@MAPPINGS or seperated by SPACE artifacts="@MAPPINGS" filter_extension=".ttl.*" @@ -32,13 +31,18 @@ function name_to_variant { function merge_to_artifact { case $1 in "instance-types-transitive") echo "instance-types";; + "mappingbased-objects-disjoint-domain") echo "mappingbased-objects";; + "mappingbased-objects-disjoint-range") echo "mappingbased-objects";; *) echo $1;; + esac } function additional_content_variants { case $1 in "instance-types-transitive") echo "_transitive";; + "mappingbased-objects-disjoint-domain") echo "_disjointDomain";; + "mappingbased-objects-disjoint-range") echo "_disjointRange";; *) echo "";; esac } @@ -120,7 +124,7 @@ function prepare_databus_artifacts { case $artifacts in "@MAPPINGS") - prepare_databus_artifacts "instance-types instance-types-transitive mappingbased-literals mappingbased-objects-uncleaned specific-mappingbased-properties geo-coordinates-mappingbased" ;; + prepare_databus_artifacts "instance-types instance-types-transitive mappingbased-literals mappingbased-objects-uncleaned mappingbased-objects mappingbased-objects-disjoint-domain mappingbased-objects-disjoint-range specific-mappingbased-properties geo-coordinates-mappingbased" ;; "@GENERIC") echo "TODO prepare: @GENERIC" ;; "@ALL") diff --git a/mappings/schedule/run-extraction.sh b/mappings/schedule/run-extraction.sh deleted file mode 100755 index 
eaef90497872d851adb52436260bcaed7693f023..0000000000000000000000000000000000000000 --- a/mappings/schedule/run-extraction.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -set -e - -rootDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -mvnLogs=/data/extraction/logs/mvn -currentDate=$(date +%Y-%m-%d) -extraFrameW=/home/extractor/extraction-framework/ -dataPlugDir=/data/extraction/databus-maven-plugin/dbpedia/mappings/ - -function download-xml(){ - cd $extraFrameW/dump; - ../run download download.mappings.properties \ - > $mvnLogs/$currentDate-mappingbased.download.out \ - 2> $mvnLogs/$currentDate-mappingbased.download.err; -} - -function download-ontology(){ - cd $extraFrameW/dump; - ../run download-ontology; -} - -function download-mappings(){ - cd $extraFrameW/dump; - ../run download-mappings; -} - -function extraction(){ - cd $extraFrameW/dump; - ../run extraction extraction.mappings.properties \ - > $mvnLogs/$currentDate-mappingbased.extraction.out \ - 2> $mvnLogs/$currentDate-mappingbased.extraction.err; -} - -function setNewestVersion(){ - cd $dataPlugDir - mvn versions:set -DnewVersion=$(ls * | grep '^[0-9]\{4\}.[0-9]\{2\}.[0-9]\{2\}$' | sort -u | tail -1) -} - -function package(){ - cd $dataPlugDir - mvn package -} - -function deploy(){ - cd $dataPlugDir - mvn deploy -} - - -function main(){ - download-xml; - download-ontology; - download-mappings; - - extraction; - - # sep. conf. - $rootDir/extractionToPlugin.sh; - - setNewestVersion; - package; - deploy; -} - -main diff --git a/marvin-fetch.sh b/marvin-fetch.sh index bb7abff2241b68e590297d5e64b5b124122a0ea6..ab284695f8f6d7fb34ca1e98548b4760fc22a549 100755 --- a/marvin-fetch.sh +++ b/marvin-fetch.sh @@ -1,7 +1,6 @@ #!/bin/bash # ./marvin-fetch.sh wikidata 2019.08.01 - GROUP=$1 VERSION=$2 SERVER=dbpedia-$1.tib.eu diff --git a/universal.properties b/universal.properties new file mode 100644 index 0000000000000000000000000000000000000000..6de96031e3a5487a184d06fd59c4a6533000db5a --- /dev/null +++ b/universal.properties @@ -0,0 +1,89 @@ +# NOTE: this properties files is imported in every extraction process and contains general parameters which only have to be set once for every release + +# The DBpedia version to be extracted (in this format: YYYY-MM) +dbpedia-version=2018-10 + +# Replace with your Wikipedia dump download directory (should not change over the course of a release) +base-dir=/data/extraction/wikidumps/ + +# The log file directory - used to store all log files created in the course of all extractions +log-dir=/data/extraction/logs/extraction/ + +# to forward extraction summaries and warnings via the slack API, use this option +-slack-webhook=https://hooks.slack.com/services/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 +-slack-username=username +-slack-summary-threshold=1000000 +-slack-exception-threshold=10 + +# wiki suffix: should be 'wiki' +wiki-name=wiki + +# wikidata mapping file +wikidata-property-mappings-file=wikidata-property-mappings.json + +###### Extract from part files ###### +# +# Please make sure that the regex actually matches the format used by ALL the wikis you are going to extract from!!!! +# One that should work in all cases is +# source=@pages-articles\\d*\\.xml(-p\\d+p\\d+)?\\.bz2 +# +# NOTE: when using the above regex you should make sure you do not have part files AND regular dump files together +# for the same wiki, e.g. 
frwiki-20131120-pages-articles1.xml.bz2 and frwiki-20131120-pages-articles.xml.bz2, as they +# BOTH will match and that will result in duplicate output data +# +# Example: +# enwiki => enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 hence @pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2 matches +# frwiki => frwiki-latest-pages-articles1.xml.bz2 hence @pages-articles\\d+\\.xml\\.bz2 matches (the previous regex does not!) +# commonswiki => it does not have part files! This is true for other wikis as well. +# +# source=@pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2 + +# In case of multistream chunks +# source=@pages-articles-multistream\\.xml\\.\\d+\\.bz2 + +# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. +# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd- +# where xx is the wiki code and yyyymmdd is the dump date. +#(default: pages-articles.xml.bz2): +source=pages-articles-multistream.xml.bz2 + +# Parallel Disc Processes: indicates how many parallel extraction processes can be executed +# when each involves reading files from the disc. +# This number is highly dependent on the number (RAID > 0) and type (SSD, HDD) of disc in use +# as well as the number of cores available. +parallel-processes=8 + +# if ontology and mapping files are not given or do not exist, +# download info from mappings.dbpedia.org +# by default both should be in the root folder ../ +ontology=../ontology.xml +mappings=../mappings + +# disambiguations file (default: "page_props.sql.gz") +disambiguations=page_props.sql.gz + +# Serialization URI policies and file formats. Quick guide: +# uri-policy keys: uri, generic, xml-safe, reject-long +# uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts +# uri-policy values: comma-separated languages or '*' for all languages +# format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads +# See http://git.io/DBpedia-serialization-format-properties for details. + +# For backwards compatibility, en uses generic URIs. All others use local IRIs. +# uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*; reject-long:* +uri-policy.iri=generic:en;xml-safe-predicates:*;reject-long:* + +# Turtle is much more readable - use nice IRIs for all languages +format.ttl.bz2=turtle-triples;uri-policy.iri +# format.tql.bz2=turtle-quads;uri-policy.iri + +# Extraction Monitor: Compare triple counts to older dbpedia versions using the DatasetID-File, after extraction. 
+# expectedChanges=Float,Float : defines the expected interval for the triple-count changes +compare-dataset-ids=false +previous-base-dir=http://downloads.dbpedia.org/2016-10/core-i18n/ +expected-changes=-1.0,9.0 +summarize-exceptions=true + +# Options for the SparkExtraction +spark-master=local[32] +spark-local-dir=/data/extraction/spark.local.dir/ diff --git a/wikidata/README.md b/wikidata/README.md new file mode 100644 index 0000000000000000000000000000000000000000..41aa9ecdb61eb8dd14bf9a445723f7387d9d8d1f --- /dev/null +++ b/wikidata/README.md @@ -0,0 +1,8 @@ +# Wikidata Extraction Config + + +### Cronjob + +``` +0 0 7 * * /bin/bash -c '/home/extractor/marvin-config/wikidata/wikidata-release.sh' >/dev/null 2>&1 +``` \ No newline at end of file diff --git a/wikidata/download.wikidata.properties b/wikidata/download.wikidata.properties new file mode 100644 index 0000000000000000000000000000000000000000..4232064198ce69f15f95e3924d03586208f111bb --- /dev/null +++ b/wikidata/download.wikidata.properties @@ -0,0 +1,19 @@ +# Default download server. It lists mirrors which may be faster. +base-url=https://dumps.wikimedia.your.org/ + +# the source file name +# should be the same as in universal.properties +# source=pages-articles.xml.bz2 + +# languages to download +languages=wikidata + +# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space. +unzip=false + +# Sometimes connecting to the server fails, so we try five times with pauses of 10 seconds. +retry-max=5 +retry-millis=10000 + +#for specific dump dates (e.g. 20170101) if empty: the most recent dump-date is used +dump-date= diff --git a/wikidata/extraction.wikidata.properties b/wikidata/extraction.wikidata.properties new file mode 100644 index 0000000000000000000000000000000000000000..080a0f7d6143478a7211145ec7615e292dc580d7 --- /dev/null +++ b/wikidata/extraction.wikidata.properties @@ -0,0 +1,24 @@ +# this is used in a separate extraction for wikidata +# make sure to have the .WikidataRawExtractor already run before and run redirect script on wikidata_raw_unredirected before that!!! +#################################### +# download and extraction target dir +#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties + +# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. +# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd- +# where xx is the wiki code and yyyymmdd is the dump date. + +# default: +#source=# moved to $extraction-framework/core/src/main/resources/universal.properties + +# use only directories that contain a 'download-complete' file? Default is false. +require-download-complete=true + +# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings' +languages=wikidata + +# extractor class names starting with "." 
are prefixed by "org.dbpedia.extraction.mappings" + +extractors=.PageIdExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.WikiPageLengthExtractor + +extractors.wikidata=.WikidataR2RExtractor,.WikidataRawExtractor,.WikidataReferenceExtractor,.WikidataAliasExtractor,.WikidataLabelExtractor,.WikidataNameSpaceSameAsExtractor,.WikidataPropertyExtractor,.WikidataLabelExtractor,.WikidataDescriptionExtractor,.WikidataSameAsExtractor diff --git a/wikidata/schedule/prep.sh b/wikidata/schedule/prep.sh new file mode 100644 index 0000000000000000000000000000000000000000..7b67b0f59d8b1a6c0aec46368f42e522c55e6d66 --- /dev/null +++ b/wikidata/schedule/prep.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +set -e + +# [CONFIG] + +#extracted dumps (basedir) +BASEDIR="/data/extraction/wikidumps/" + +#databus-maven-plugin project, containing release pom +DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/wikidata" + +#explicit databus version or empty for all +DUMPDATE= + +#if true show dumy output +TRYRUN=false + +# [TODO] + +echo "----------------------------" +echo "Prepare Wikidata for Databus" +echo "----------------------------" + +cd $BASEDIR + +files=$(find wikidatawiki -name "*.ttl.bz2" | sort -u ) + +function exceptDataset { + case $1 in + "duplicate-iri-split") echo "debug";; + "r2r-mapping-errors") echo "debug";; + "type-like-statements") echo "debug";; + + *) echo "$1";; + esac +} + +function exceptName { + case $1 in + "duplicate-iri-split") echo "debug_duplicateirisplit";; + "r2r-mapping-errors") echo "debug_r2rmappingerrors";; + "type-like-statements") echo "debug_typelikestatements";; + + *) echo "$1";; + esac +} + +for file in $files; do + + name=${file##*/}; name=$(echo $name | cut -d"." -f1) + dumpVersion=${file%/*}; dumpVersion=${dumpVersion##*/} + version="${dumpVersion:0:4}.${dumpVersion:4:2}.${dumpVersion:6:2}" + + CONTVAR="" + if [[ $name == *"-nmw"* ]]; then + CONTVAR="${CONTVAR}_nmw" + fi + if [[ $name == *"-reified"* ]]; then + CONTVAR="${CONTVAR}_reified" + fi + if [[ $name == *"-reified-qualifiers"* ]]; then + CONTVAR="${CONTVAR}_qualifiers" + fi + if [[ $name == *"-redirected"* ]]; then + CONTVAR="${CONTVAR}_redirected" + fi + if [[ $name == *"-length"* ]]; then + CONTVAR="${CONTVAR}_length" + fi + if [[ $name == *"-ids"* ]]; then + CONTVAR="${CONTVAR}_ids" + fi + if [[ $name == *"-uris"* ]]; then + CONTVAR="${CONTVAR}_uris" + fi + if [[ $name == *"-transitive"* ]]; then + CONTVAR="${CONTVAR}_transitive" + fi + dataset=$(echo $name | sed -e "s/wikidatawiki-$dumpVersion-//g; s/-nmw//g; s/wikidata-//g; s/-reified//g; s/-qualifiers//g; s/-redirected//g; s/-ids//g; s/-length//g; s/-uris//g; s/-transitive//g; s/transitive-//g") + new_name="${dataset}${CONTVAR}" + + if [[ $dataset == *"interlanguage-links"* ]]; then + new_name="interlanguange-links_lang="$(echo $dataset | sed "s/interlanguage-links-//g") + dataset="interlanguange-links" + fi + + dataset=$(exceptDataset $dataset) + new_name=$(exceptName $new_name) + + new_name=$new_name$(echo ${file##*/} | sed "s/$name//g") + + mkdir -p $DATABUSMVNPOMDIR/$dataset/$version/ + cp -vn $file $DATABUSMVNPOMDIR/$dataset/$version/$new_name +done diff --git a/wikidata/schedule/wikidata-release.sh b/wikidata/schedule/wikidata-release.sh deleted file mode 100755 index 3b8369cc7fba8accb49825c033823d1eebee1917..0000000000000000000000000000000000000000 --- a/wikidata/schedule/wikidata-release.sh +++ /dev/null @@ -1,140 +0,0 @@ -#!/bin/bash -# Wikidata DBpedia release script, version 1.0 - -set -e - -TIME_DATE="2019-07-01" 
#$(date +%Y-%m-%d) - -EXTRACT_DIR=/home/extractor/extraction-framework -MVN_LOGS=/data/extraction/logs/mvn -#DATA_DIR=`awk -F= '/base-dir/{print $NF}' $EXTRACT_DIR/core/src/main/resources/universal.properties | head -n1` -DATA_DIR=/data/extraction/wikidumps/ -#WWW_DIR=/var/www/html/wikidata - -function download-ontology(){ - cd $EXTRACT_DIR/core; - ../run download-ontology; -} - -function recompile(){ - cd $EXTRACT_DIR; - mvn clean install; -} - -function download-r2r-mapping(){ - cd $EXTRACT_DIR/core/src/main/resources && curl https://raw.githubusercontent.com/dbpedia/extraction-framework/master/core/src/main/resources/wikidatar2r.json > wikidatar2r.json -} - -function download-xml-dump(){ - cd $EXTRACT_DIR/dump; - ../run download download.wikidata.properties \ - > $MVN_LOGS/$TIME_DATE-wikidata.download.out \ - 2> $MVN_LOGS/$TIME_DATE-wikidata.download.err; -} - -function raw-extractor(){ - cd $EXTRACT_DIR/dump; - #Run only .WikidataRawExtractor - ../run extraction extraction.wikidataraw.properties; -} - -function subclassof-script(){ - cd $EXTRACT_DIR/scripts; - ../run WikidataSubClassOf process.wikidata.subclassof.properties; -} - -function all-other-extractors(){ - cd $EXTRACT_DIR/dump; - # Run all other extractors - ../run extraction extraction.wikidataexceptraw.properties -} - -function all-extractors(){ - cd $EXTRACT_DIR/dump; - # Run all extractors to run extraction - ../run extraction extraction.wikidata.properties; -# > $MVN_LOGS/$TIME_DATE-wikidata.extraction.out \ -# 2> $MVN_LOGS/$TIME_DATE-wikidata.extraction.err; - -} - -function post-processing(){ - cd $EXTRACT_DIR/scripts; - ../run ResolveTransitiveLinks $DATA_DIR redirects transitive-redirects .ttl.bz2 wikidata - ../run MapObjectUris $DATA_DIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata -} - -function type-consistency-check(){ - cd $EXTRACT_DIR/scripts; - ../run TypeConsistencyCheck type.consistency.check.properties; -} - -function sync-with-www(){ - rsync -avz $DATA_DIR/wikidatawiki/ $WWW_DIR/; - - #We don't need index.html - find $WWW_DIR/ | grep index.html | xargs rm -rf; -} - -function databus-preparation(){ - cd $DATA_DIR; - bash ~/databusPrep.sh $WWW_DIR/ src/main/databus; -} - -function delete-old-extractions(){ - #Delete extractions older than 1 month, i.e. keep 1-2 results in www. - find $WWW_DIR/ -type d -ctime +20 | xargs rm -rf; - - #Remove everything in Dump dir, do we need to keep them? - rm -rf $DATA_DIR/wikidatawiki/*; -} - -function remove-date-from-files(){ - #Go to the last changed directory - cd "$(\ls -1dt $WWW_DIR/*/ | head -n 1)"; - - #Remove date (numbers) from files - for i in *; do mv "$i" "`echo $i| sed 's/[0-9]..//g'`"; done; -} - -function main() { - #delete-old-extractions; #to have some space for new extraction - -# touch download.process; - - download-ontology; - download-r2r-mapping; - download-xml-dump; - recompile; - all-extractors; - - post-processing; - type-consistency-check; - - cd /data/extraction/wikidumps; - ./prep.sh; - - cd /data/extraction/databus-maven-plugin/dbpedia/wikidata; - mvn package; - mvn databus:deploy; - -#---- -# below not configured yet -#---- - - ##Result of subclassof-script is used in next extraction. - #subclassof-script; - #databus-preparation; - #Sync extraction with www - #sync-with-www - #remove-date-from-files - -#This was the previous extraction process. 
Now we don't need to run rawextractor separately -# raw-extractor; -# subclassof-script; -# all-other-extractors; -# post-processing; -} - -main - diff --git a/wikidata/wikidata-release.sh b/wikidata/wikidata-release.sh new file mode 100755 index 0000000000000000000000000000000000000000..0525aaf29879c370447b6500932cc3d4ef04699f --- /dev/null +++ b/wikidata/wikidata-release.sh @@ -0,0 +1,214 @@ +#!/bin/bash + +set -e + +SCRIPTROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# [CONFIG] + +#extraction-framework +EXTRACTIONFRAMEWORKDIR="/home/extractor/extraction-framework"; + +#extracted dumps (basedir) +BASEDIR="/data/extraction/wikidumps"; + +#databus-maven-plugin project, containing release pom +#https://github.com/dbpedia/databus-maven-plugin/blob/master/dbpedia/wikidata/pom.xml +DATABUSMAVENPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/wikidata"; + +#override release pom.xml properties +RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this"; +RELEASEPACKAGEDIR="/data/extraction/release"; +RELEASEDOWNLOADURL="http://dbpedia-wikidata.tib.eu/release"; +RELEASELABELPREFIX="" + +#logging directory +LOGS="/data/extraction/logs/$(date +%Y-%m-%d)"; +mkdir -p $LOGS; + +# [FUNCTIONS] + +execWithLogging() { + #arg(0) = $1 := "function name" + $1 > "$LOGS/$1.out" 2> "$LOGS/$1.err"; +} + +downloadOntology() { + cd $EXTRACTIONFRAMEWORKDIR/core; + ../run download-ontology; +} + +downloadMappings() { + cd $EXTRACTIONFRAMEWORKDIR/core; + ../run download-mappings; +} + +downloadR2R() { + cd $EXTRACTIONFRAMEWORKDIR/core/src/main/resources && curl https://raw.githubusercontent.com/dbpedia/extraction-framework/master/core/src/main/resources/wikidatar2r.json > wikidatar2r.json +} + +downloadDumps() { + cd $EXTRACTIONFRAMEWORKDIR/dump; + ../run download $SCRIPTROOT/download.wikidata.properties; +} + +buildExtractionFramework() { + cd $EXTRACTIONFRAMEWORKDIR; + mvn clean install; +} + +runExtraction(){ + cd $EXTRACTIONFRAMEWORKDIR/dump; + ../run extraction extraction.wikidata.properties; +} + +resolveTransitiveLinks() { + cd $EXTRACTIONFRAMEWORKDIR/scripts; + ../run ResolveTransitiveLinks $BASEDIR redirects transitive-redirects .ttl.bz2 wikidata +} + +mapObjectUris() { + cd $EXTRACTIONFRAMEWORKDIR/scripts; + ../run MapObjectUris $BASEDIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata +} + +typeConsistencyCheck(){ + cd $EXTRACTIONFRAMEWORKDIR/scripts; + ../run TypeConsistencyCheck type.consistency.check.properties; +} + +postProcessing() { + echo "$(date) | extraction-framework| resole transitive links" >&2; + execWithLogging resolveTransitiveLinks; + echo "$(date) | extraction-framework| map object uris" >&2; + execWithLogging mapObjectUris; + echo "$(date) | extraction-framework| type consistency check" >&2; + execWithLogging typeConsistencyCheck; +} + +prepareRelease() { + #own config + # cd $SCRIPTROOT; + # collectExtraction.sh; + cd $SCRIPTROOT/schedule; + bash prep.sh +} + +setNewVersion() { + cd $DATABUSMAVENPOMDIR; + mvn versions:set -DnewVersion=$(ls * | grep '^[0-9]\{4\}.[0-9]\{2\}.[0-9]\{2\}$' | sort -u | tail -1); +} + +deployRelease() { + cd $DATABUSMAVENPOMDIR; + mvn deploy \ + -Ddatabus.publisher="$RELEASEPUBLISHER" \ + -Ddatabus.packageDirectory="$RELEASEPACKAGEDIR/\${project.groupId}/\${project.artifactId}" \ + -Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL/\${project.groupId}/\${project.artifactId}/\${project.version}" \ + -Ddatabus.labelPrefix="$RELEASELABELPREFIX"; +} + +compressLogs() { + for f in $(find 
$LOGS -type f ); do lbzip2 $f; done; +} + +# [MAIN] +main() { + + #download + echo "$(date) | extraction-framework | start download ontology" >&2; + execWithLogging downloadOntology; + echo "$(date) | extraction-framework | start download mappings" >&2; + execWithLogging downloadMappings; + echo "$(date) | extraction-framework | start download r2r mappings" >&2; + execWithLogging downloadR2R; + echo "$(date) | extraction-framework | start download dumps" >&2; + execWithLogging downloadDumps; + + #extraction + echo "$(date) | extraction-framework | mvn clean install" >&2; + execWithLogging buildExtractionFramework; + echo "$(date) | extraction-framework | start extraction" >&2; + execWithLogging runExtraction; + echo "$(date) | extraction-framework | post processing" >&2; + postProcessing; + + #release + echo "$(date) | databus-maven-plugin | collect extracted datasets" >&2; + execWithLogging prepareRelease; + echo "$(date) | databus-maven-plugin | mvn versions:set" >&2; + execWithLogging setNewVersion; + echo "$(date) | databus-maven-plugin | mvn deploy" >&2; + execWithLogging deployRelease; + + #cleanup + echo "$(date) | main | compress log files" >&2; + compressLogs; + + #(DEPRECATED FUNCTIONS) below not configured yet + + ##Result of subclassof-script is used in next extraction. + #subclassof-script; + #databus-preparation; + #Sync extraction with www + #sync-with-www + #remove-date-from-files + + #This was the previous extraction process. Now we don't need to run rawextractor separately + # raw-extractor; + # subclassof-script; + # all-other-extractors; + # post-processing; +} + +main + +# [DEPRECATED] + +DATA_DIR=/data/extraction/wikidumps/ +WWW_DIR=/var/www/html/wikidata + +function sync-with-www(){ + rsync -avz $DATA_DIR/wikidatawiki/ $WWW_DIR/; + + #We don't need index.html + find $WWW_DIR/ | grep index.html | xargs rm -rf; +} + +function databus-preparation(){ + cd $DATA_DIR; + bash ~/databusPrep.sh $WWW_DIR/ src/main/databus; +} + +function delete-old-extractions(){ + #Delete extractions older than 1 month, i.e. keep 1-2 results in www. + find $WWW_DIR/ -type d -ctime +20 | xargs rm -rf; + + #Remove everything in Dump dir, do we need to keep them? + rm -rf $DATA_DIR/wikidatawiki/*; +} + +function remove-date-from-files(){ + #Go to the last changed directory + cd "$(\ls -1dt $WWW_DIR/*/ | head -n 1)"; + + #Remove date (numbers) from files + for i in *; do mv "$i" "`echo $i| sed 's/[0-9]..//g'`"; done; +} + +function raw-extractor(){ + cd $EXTRACTIONFRAMEWORKDIR/dump; + #Run only .WikidataRawExtractor + ../run extraction extraction.wikidataraw.properties; +} + +function subclassof-script(){ + cd $EXTRACTIONFRAMEWORKDIR/scripts; + ../run WikidataSubClassOf process.wikidata.subclassof.properties; +} + +function all-other-extractors(){ + cd $EXTRACTIONFRAMEWORKDIR/dump; + # Run all other extractors + ../run extraction extraction.wikidataexceptraw.properties +}
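
The per-extraction READMEs above each list a crontab line but not how it is registered. Below is a minimal sketch of installing the generic entry for the extractor user, assuming the repository is checked out at /home/extractor/marvin-config as the cron line itself implies; the entry fires at 00:00 on the 7th of each month, and the PID-file guard at the end of generic-release.sh keeps a late-running previous release from being started twice.

```
# Append the generic-release cron entry (taken verbatim from generic/README.md)
# to the extractor user's crontab. `crontab -l` keeps any existing entries;
# `2>/dev/null` covers the case where no crontab exists yet.
( crontab -l 2>/dev/null; \
  echo "0 0 7 * * /bin/bash -c '/home/extractor/marvin-config/generic/generic-release.sh' >/dev/null 2>&1" \
) | crontab -
```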