Skip to content
Snippets Groups Projects
Commit e6c9b41c authored by kurzum's avatar kurzum
Browse files

more granular scripting and logging

parent b2b32cd0
No related branches found
No related tags found
No related merge requests found
# Default download server. It lists mirrors which may be faster.
# base-url=https://dumps.wikimedia.org/
# base-url=https://ftp.acc.umu.se/mirror/wikimedia.org/dumps/
base-url=http://dumps.wikimedia.your.org/
# the source file name
# should be the same as in universal.properties
# source=pages-articles.xml.bz2
# languages to download
languages=en
# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
unzip=false
# Sometimes connecting to the server fails, so we try five times with pauses of 10 seconds.
retry-max=5
retry-millis=10000
# For a specific dump date (e.g. 20170101); if empty, the most recent dump date is used.
dump-date=
......@@ -8,7 +8,7 @@ base-url=http://dumps.wikimedia.your.org/
# source=pages-articles.xml.bz2
# languages to download
languages=en,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
languages=af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
unzip=false
......
......@@ -6,7 +6,7 @@ marvin_extraction_run.sh and databus-release.sh take one argument, which is the
selects download.\$GROUP.properties and extraction.\$GROUP.properties from extractionConfig dir and uses \$GROUP as a path.
usage:
./marvin_extraction_run.sh {test|generic|mappings|wikidata|text|sparktestgeneric}
./marvin_extraction_run.sh {test|generic|generic.en|mappings|wikidata|text|sparktestgeneric}
"
##############
......@@ -28,20 +28,7 @@ EXTRACTIONBASEDIR="$MARVINEXTRACTIONDIR/wikidumps" && mkdir -p $EXTRACTIONBASEDI
# extract data
#######################################
# Runs the extraction for the selected group from inside $DIEFDIR/dump.
# Globals:
#   DIEFDIR   (read) - directory that contains the `run` launcher and `dump/`
#   CONFIGDIR (read) - directory holding the extraction.*.properties files
#   GROUP     (read) - name of the extraction group
# Outputs:
#   stdout of every ../run call is redirected to stderr
# Returns:
#   1 if $DIEFDIR/dump cannot be entered, otherwise the status of the
#   last ../run call
#######################################
extractDumps() {
  # ../run is resolved relative to the dump directory, so do not continue
  # from some other working directory if the cd fails.
  cd "$DIEFDIR/dump" || return 1
  # exception for generic: 1. it runs via spark, 2. English is big and has to be run separately
  if [ "$GROUP" = "generic" ]
  then
    >&2 ../run sparkextraction "$CONFIGDIR/extraction.generic.properties";
    >&2 ../run sparkextraction "$CONFIGDIR/extraction.generic.en.properties";
  # note the space after '[': without it the shell looks for a command
  # named "[$GROUP" and reports "command not found"
  elif [ "$GROUP" = "text" ]
  then
    >&2 ../run extraction "$CONFIGDIR/extraction.$GROUP.properties";
  else
    # run for all
    >&2 ../run extraction "$CONFIGDIR/extraction.$GROUP.properties";
  fi
  # unconditional extraction run for the selected group
  >&2 ../run extraction "$CONFIGDIR/extraction.$GROUP.properties";
}
......@@ -61,7 +48,7 @@ postProcessing() {
>&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects transitive-redirects .ttl.bz2 wikidata
>&2 ../run MapObjectUris $EXTRACTIONBASEDIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata
>&2 ../run TypeConsistencyCheck type.consistency.check.properties;
elif [ "$GROUP" = "generic" ]
elif [ "$GROUP" = "generic" ] || [ "$GROUP" = "generic.en" ]
then
>&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects redirects_transitive .ttl.bz2 @downloaded;
>&2 ../run MapObjectUris $EXTRACTIONBASEDIR redirects_transitive .ttl.bz2 disambiguations,infobox-properties,page-links,persondata,topical-concepts _redirected .ttl.bz2 @downloaded;
......
......@@ -15,7 +15,7 @@ source functions.sh
#################
GROUP=$1
if [ "$GROUP" != "generic" ] && [ "$GROUP" != "mappings" ] && [ "$GROUP" != "test" ] && [ "$GROUP" != "wikidata" ] && [ "$GROUP" != "sparktestgeneric" ] && [ "$GROUP" != "text" ] || [ -z "$GROUP" ]
if [ "$GROUP" != "generic" ] && [ "$GROUP" != "generic.en" ] && [ "$GROUP" != "mappings" ] && [ "$GROUP" != "test" ] && [ "$GROUP" != "wikidata" ] && [ "$GROUP" != "sparktestgeneric" ] && [ "$GROUP" != "text" ] || [ -z "$GROUP" ]
then
echo "$HELP"
exit 1
......@@ -39,7 +39,9 @@ cd $DIEFDIR/dump
# Announce the extraction phase on stdout.
echo "EXTRACT"
# Run extractDumps with both its stdout and stderr written to extraction.log.
extractDumps &> $LOGDIR/extraction.log;
# ../run below is a relative path, so switch to the dump directory first.
cd $DIEFDIR/dump;
# Redirections apply left to right: '&>' comes after '>&2', so stdout and
# stderr of this call both end up in the per-group log file.
>&2 ../run extraction $CONFIGDIR/extraction.$GROUP.properties &> $LOGDIR/extraction.$GROUP.log;
# Announce the post-processing phase on stdout.
echo "POST-PROCESSING"
# Only stderr is captured in postProcessing.log; stdout of postProcessing is
# left untouched by this redirection.
postProcessing 2> $LOGDIR/postProcessing.log;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment