# Make sure to fill out ../core/src/main/resources/universal.properties first and reinstall.

# The log file directory - used to store all log files created in the course of all extractions
#log-dir= see ../core/src/main/resources/universal.properties

# Wiki pages that failed to extract on the first try can be retried with this option
# (especially useful when extracting from the MediaWiki API)
#retry-failed-pages=true

# Source file. If the source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-,
# where xx is the wiki code and yyyymmdd is the dump date.
# default:
# source=pages-articles.xml.bz2
# alternatives:
# source=pages-articles.xml.gz
# source=pages-articles.xml

# Use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false

# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
# NOTE: keep in sync with the minidumps
languages=en,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue

# default namespaces: Main, File, Category, Template
# we only want abstracts for articles -> only the Main namespace
# namespaces=Main

# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
parallel-processes=2
# run-jobs-in-parallel=true

extractors=.NifExtractor

# If ontology and mapping files are not given or do not exist, download the information from mappings.dbpedia.org
# ontology=see universal.properties
# mappings=see universal.properties

# Serialization URI policies and file formats. Quick guide:
# uri-policy keys: uri, generic, xml-safe, reject-long
# uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts
# uri-policy values: comma-separated languages or '*' for all languages
# format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads
# See http://git.io/DBpedia-serialization-format-properties for details.
# For backwards compatibility, en uses generic URIs. All other languages use local IRIs.
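# Illustrative reading (comment only, not an active setting): a format line such as
#   format.ttl.bz2=turtle-triples;uri-policy.iri
# writes bzip2-compressed Turtle triple files serialized under the 'uri-policy.iri' policy defined below.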
# uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
uri-policy.iri=generic:en; xml-safe-predicates:*

# NT is unreadable anyway - might as well use URIs for en
# format.nt.gz=n-triples;uri-policy.uri
# format.nq.gz=n-quads;uri-policy.uri

# Turtle is much more readable - use nice IRIs for all languages
format.ttl.bz2=turtle-triples;uri-policy.iri
#format.tql.bz2=turtle-quads;uri-policy.iri

# The following parameters are for the MediaWiki API connection used in the NIF and abstract extraction
mwc-apiUrl=https://{{LANG}}.wikipedia.org/w/api.php
mwc-maxRetries=5
mwc-connectMs=4000
mwc-readMs=30000
mwc-sleepFactor=2000

# Parameters specific to the abstract extraction
abstract-query=&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s
# the tag path of the XML tags under which the result is expected
abstract-tags=api,query,pages,page,extract
# the properties used for the short and long abstracts (should not change)
short-abstracts-property=rdfs:comment
long-abstracts-property=abstract
# the short abstract is at least this long
short-abstract-min-length=200

# Parameters specific to the NIF extraction
# only extract the abstract (not the whole page)
nif-extract-abstract-only=false
# the request query string
nif-query=&format=xml&action=parse&prop=text&page=%s&pageid=%d
# the XML tag path of the response
nif-tags=api,parse,text
# if true, leave out the long and short abstract datasets
nif-isTestRun=false
# if true, write all anchor texts for each NIF instance
nif-write-anchor=true
# write only the anchor text for link instances
nif-write-link-anchor=true
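# Illustrative note (assumed values, not part of the configuration): for language code 'en',
# mwc-apiUrl resolves {{LANG}} to https://en.wikipedia.org/w/api.php, and the %s / %d
# placeholders in abstract-query and nif-query are presumably filled with the title and
# page id of the article currently being extracted.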