Skip to content
Snippets Groups Projects
extraction.text.properties 3.99 KiB
Newer Older
Sebastian Hellmann's avatar
Sebastian Hellmann committed
# make sure to fill out the ../core/src/main/resources/universal.properties first and reinstall

# The log file directory - used to store all log files created in the course of all extractions
#
#log-dir= see: ../core/src/main/resources/universal.properties

# WikiPages failed to extract in the first try can be retried with this option (especially interesting when extraction from the mediawiki api)
#retry-failed-pages=true
Sebastian Hellmann's avatar
Sebastian Hellmann committed

# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. 
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.
 
# default:
# source=pages-articles.xml.bz2

# alternatives:
# source=pages-articles.xml.gz
# source=pages-articles.xml

# use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false

# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
# NOTE sync with minidumps
languages=en,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
Sebastian Hellmann's avatar
Sebastian Hellmann committed
# default namespaces: Main, File, Category, Template
# we only want abstracts for articles -> only main namespace
Your Name's avatar
Your Name committed
# namespaces=Main
Sebastian Hellmann's avatar
Sebastian Hellmann committed

# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
Your Name's avatar
Your Name committed
parallel-processes=2
# run-jobs-in-parallel=true
Sebastian Hellmann's avatar
Sebastian Hellmann committed
extractors=.NifExtractor

# if ontology and mapping files are not given or do not exist, download info from mappings.dbpedia.org
# ontology=see universal.properties
# mappings=see universal.properties

# Serialization URI policies and file formats. Quick guide:
# uri-policy keys: uri, generic, xml-safe, reject-long
# uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts
# uri-policy values: comma-separated languages or '*' for all languages
# format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads
# See http://git.io/DBpedia-serialization-format-properties for details.

# For backwards compatibility, en uses generic URIs. All others use local IRIs.
# uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
uri-policy.iri=generic:en; xml-safe-predicates:*

# NT is unreadable anyway - might as well use URIs for en
# format.nt.gz=n-triples;uri-policy.uri
# format.nq.gz=n-quads;uri-policy.uri

# Turtle is much more readable - use nice IRIs for all languages
format.ttl.bz2=turtle-triples;uri-policy.iri
#format.tql.bz2=turtle-quads;uri-policy.iri


#the following parameters are for the mediawiki api connection used in nif and abstract extraction
mwc-apiUrl=https://{{LANG}}.wikipedia.org/w/api.php
mwc-maxRetries=5
mwc-connectMs=4000
mwc-readMs=30000
mwc-sleepFactor=2000

#parameters specific for the abstract extraction
abstract-query=&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s
# the tag path of the XML tags under which the result is expected
abstract-tags=api,query,pages,page,extract
# the properties used to specify long- and short abstracts (should not change)
short-abstracts-property=rdfs:comment
long-abstracts-property=abstract
# the short abstract is at least this long
short-abstract-min-length=200

#parameters specific to the nif extraction

#only extract abstract (not the whole page)
nif-extract-abstract-only=false
#the request query string
nif-query=&format=xml&action=parse&prop=text&page=%s&pageid=%d
#the xml path of the response
nif-tags=api,parse,text
# will leave out the long and short abstract datasets
nif-isTestRun=false
# will write all anchor texts for each nif instance
nif-write-anchor=true
# write only the anchor text for link instances
nif-write-link-anchor=true