# Extraction configuration: runs only the NifExtractor (see 'extractors' below).
#
# NOTE: this file is line-oriented - exactly one comment or one key=value entry per line.
# A line starting with '#' is a comment in its entirety, so never join a setting onto a comment line.
#
# Make sure to fill out ../core/src/main/resources/universal.properties first and reinstall.

# The log file directory - used to store all log files created in the course of all extractions.
#
#log-dir= see: ../core/src/main/resources/universal.properties

# Wiki pages that failed to extract on the first try can be retried with this option
# (especially interesting when extracting from the MediaWiki API).
retry-failed-pages=false

# Source file. If the source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.

# default:
# source=pages-articles.xml.bz2

# alternatives:
# source=pages-articles.xml.gz
# source=pages-articles.xml

# Use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false

# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'.
# NOTE: keep in sync with the minidumps.
languages=af,als,am,an,arz,ast,azb,ba,bar,bat-smg,bpy,br,bs,bug,cdo,ce,ceb,ckb,cv,fo,fy,gd,he,hsb,ht,ia,ilo,io,is,jv,ka,kn,ku,ky,la,lb,li,lmo,mai,mg,min,ml,mn,mr,mrj,ms,mt,my,mzn,nah,nap,nds,ne,new,nn,no,oc,or,os,pa,pms,pnb,qu,sa,sah,scn,sco,sh,si,simple,sq,su,sw,ta,te,tg,th,tl,tt,uz,vec,wa,xmf,yo,zh-min-nan,zh-yue

# Default namespaces: Main, File, Category, Template.
# We only want abstracts for articles -> only the Main namespace.
namespaces=Main

# Extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings".
extractors=.NifExtractor

# If ontology and mapping files are not given or do not exist, download info from mappings.dbpedia.org.
# ontology=see universal.properties
# mappings=see universal.properties

# Serialization URI policies and file formats. Quick guide:
# uri-policy keys: uri, generic, xml-safe, reject-long
# uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts
# uri-policy values: comma-separated languages or '*' for all languages
# format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads
# See http://git.io/DBpedia-serialization-format-properties for details.

# For backwards compatibility, en uses generic URIs. All others use local IRIs.
# uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
uri-policy.iri=generic:en; xml-safe-predicates:*

# NT is unreadable anyway - might as well use URIs for en
# format.nt.gz=n-triples;uri-policy.uri
# format.nq.gz=n-quads;uri-policy.uri

# Turtle is much more readable - use nice IRIs for all languages.
format.ttl.bz2=turtle-triples;uri-policy.iri
#format.tql.bz2=turtle-quads;uri-policy.iri

# The following parameters are for the MediaWiki API connection used in NIF and abstract extraction.
# NOTE(review): {{LANG}} is presumably replaced with the wiki language code at runtime - confirm against the consumer.
mwc-apiUrl=https://{{LANG}}.wikipedia.org/w/api.php
mwc-maxRetries=5
# NOTE(review): the 'Ms' suffix suggests milliseconds for the two values below - confirm.
mwc-connectMs=4000
mwc-readMs=30000
# NOTE(review): unit and exact use of the sleep factor are not documented here - confirm before tuning.
mwc-sleepFactor=2000

# Parameters specific to the abstract extraction.
# The request query string.
abstract-query=&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s
# The tag path of the XML tags under which the result is expected.
abstract-tags=api,query,pages,page,extract
# The properties used to specify long and short abstracts (should not change).
short-abstracts-property=rdfs:comment
long-abstracts-property=abstract
# The short abstract is at least this long.
short-abstract-min-length=200

# Parameters specific to the NIF extraction.

# Only extract the abstract (not the whole page).
nif-extract-abstract-only=false
# The request query string.
nif-query=&format=xml&action=parse&prop=text&page=%s&pageid=%d
# The XML path of the response.
nif-tags=api,parse,text
# Will leave out the long and short abstract datasets.
nif-isTestRun=false
# Will write all anchor texts for each NIF instance.
nif-write-anchor=true
# Write only the anchor text for link instances.
nif-write-link-anchor=true