# extraction.generic.properties
# download and extraction target dir
#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties

# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.

# default:
#source=# moved to $extraction-framework/core/src/main/resources/universal.properties

# Spark master URL. local[32] runs Spark locally with 32 worker threads.
spark-master=local[32]

# Only use directories that contain a 'download-complete' file? Default is false.
require-download-complete=false

# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'.
# 'en' is excluded here: it seemed too big to process with spark-master=local[32].
languages=af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue

# Extractor classes listed under the plain 'extractors' key (no language suffix).
# Extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings".
# NOTE(review): presumably the 'extractors.xx' entries further down are added on top of this
# list for language xx rather than replacing it -- confirm against the config loader.

extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\
.ExternalLinksExtractor,.GeoExtractor,.InfoboxExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\
.PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\
.WikiPageLengthExtractor,.WikiPageOutDegreeExtractor

# Per-language extractor lists, keyed as 'extractors.<code>' with the codes used in 'languages'.
# An entry with an empty value lists no language-specific extractor classes.
extractors.ar=.TopicalConceptsExtractor

extractors.be=

extractors.bg=

extractors.bn=

extractors.ca=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor

# NOTE(review): 'ced' does not appear in the 'languages' list (which has 'ce' and 'ceb');
# possibly a typo for 'ceb' -- confirm.
extractors.ced=

# NOTE(review): 'commons' is not in the 'languages' list, so this entry is presumably
# unused by this configuration -- confirm.
extractors.commons=.ContributorExtractor,.TemplateParameterExtractor,.FileTypeExtractor,.GalleryExtractor,.ImageAnnotationExtractor,.CommonsKMLExtractor,.DBpediaResourceExtractor

extractors.cs=.DisambiguationExtractor

extractors.cy=

extractors.da=

extractors.de=.DisambiguationExtractor,.HomepageExtractor,.PersondataExtractor,.PndExtractor,.CommonsResourceExtractor

extractors.el=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor

# NOTE(review): 'en' is currently excluded from the 'languages' list, so this entry is
# presumably only used when 'en' is added back -- confirm.
extractors.en=.CitationExtractor,.DisambiguationExtractor,.HomepageExtractor,.PersondataExtractor,.PndExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor

extractors.eo=

# Language-specific extractors for es and eu. The values must start directly with the first
# class name: a leading comma would add an empty first element to the list.
extractors.es=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.CommonsResourceExtractor

extractors.et=

extractors.eu=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor

extractors.fa=

extractors.fi=

# '.fr.PopulationExtractor' starts with "." and so resolves, by the prefix rule for extractor
# class names, to org.dbpedia.extraction.mappings.fr.PopulationExtractor.
extractors.fr=.DisambiguationExtractor,.HomepageExtractor,.PndExtractor,.TopicalConceptsExtractor,.fr.PopulationExtractor,.CommonsResourceExtractor

extractors.ga=.HomepageExtractor

extractors.gl=

extractors.hi=

extractors.hr=

extractors.hu=

extractors.id=

extractors.it=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor

extractors.ja=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.CommonsResourceExtractor

extractors.ko=.DisambiguationExtractor

extractors.lt=

extractors.lv=

extractors.nl=.DisambiguationExtractor,.CommonsResourceExtractor

extractors.mk=

# NOTE(review): 'mt' does not appear in the 'languages' list, so this entry is presumably
# unused by this configuration -- confirm.
extractors.mt=

extractors.pl=.DisambiguationExtractor,.HomepageExtractor

extractors.pt=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.CommonsResourceExtractor

extractors.ru=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor

extractors.sk=

extractors.sl=

extractors.sr=

extractors.tr=

extractors.ur=

extractors.vi=

extractors.war=

# Only the raw extractor here: all other Wikidata extractors are executed in a separate extraction for Wikidata (see extraction.wikidata.properties).
#extractors.wikidata=.WikidataSameAsExtractor,.WikidataRawExtractor

extractors.zh=

# Set this to true to exclude non-free images from this extraction.
copyrightCheck=false