Commit 9a3b9156 authored by Sebastian Hellmann's avatar Sebastian Hellmann
Browse files

text extractions

parent 2875316d
# Default download server. It lists mirrors which may be faster.
# base-url=https://dumps.wikimedia.org/
# base-url=https://ftp.acc.umu.se/mirror/wikimedia.org/dumps/
base-url=http://dumps.wikimedia.your.org/
# the source file name
# should be the same as in universal.properties
# source=pages-articles.xml.bz2
# languages to download
# TODO: currently restricted to English for testing; re-enable the full language list below when done
languages=en
#,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
unzip=false
# Sometimes connecting to the server fails, so we try five times with pauses of 10 seconds.
retry-max=5
retry-millis=10000
# Download a specific dump date (e.g. 20170101). If left empty, the most recent dump date is used.
dump-date=
...@@ -34,6 +34,9 @@ extractDumps() { ...@@ -34,6 +34,9 @@ extractDumps() {
then then
>&2 ../run sparkextraction $CONFIGDIR/extraction.generic.properties; >&2 ../run sparkextraction $CONFIGDIR/extraction.generic.properties;
>&2 ../run sparkextraction $CONFIGDIR/extraction.generic.en.properties; >&2 ../run sparkextraction $CONFIGDIR/extraction.generic.en.properties;
elif ["$GROUP" = "text" ]
then
>&2 ../run extraction $CONFIGDIR/extraction.$GROUP.en.properties;
else else
# run for all # run for all
>&2 ../run extraction $CONFIGDIR/extraction.$GROUP.properties; >&2 ../run extraction $CONFIGDIR/extraction.$GROUP.properties;
...@@ -66,7 +69,7 @@ postProcessing() { ...@@ -66,7 +69,7 @@ postProcessing() {
for i in $(find $EXTRACTIONBASEDIR -name "*._redirects.ttl.bz2") ; do cp $i $LOGDIR ; rename -f 's/_redirected//' $i ; done for i in $(find $EXTRACTIONBASEDIR -name "*._redirects.ttl.bz2") ; do cp $i $LOGDIR ; rename -f 's/_redirected//' $i ; done
elif [ "$GROUP" = "text" ] elif [ "$GROUP" = "text" ]
then then
echo "TODO" echo "check whether text has post-processing"
elif [ "$GROUP" = "test" ] elif [ "$GROUP" = "test" ]
then then
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment