Skip to content
Snippets Groups Projects
extraction.text.properties 3.78 KiB
Newer Older
Sebastian Hellmann's avatar
Sebastian Hellmann committed
# make sure to fill out the ../core/src/main/resources/universal.properties first and reinstall

# The log file directory - used to store all log files created in the course of all extractions
#
#log-dir= see: ../core/src/main/resources/universal.properties

# WikiPages failed to extract in the first try can be retried with this option (especially interesting when extraction from the mediawiki api)
retry-failed-pages=false

# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. 
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.
 
# default:
# source=pages-articles.xml.bz2

# alternatives:
# source=pages-articles.xml.gz
# source=pages-articles.xml

# use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false

# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
# NOTE sync with minidumps
languages=af,als,am,an,arz,ast,azb,ba,bar,bat-smg,bpy,br,bs,bug,cdo,ce,ceb,ckb,cv,fo,fy,gd,he,hsb,ht,ia,ilo,io,is,jv,ka,kn,ku,ky,la,lb,li,lmo,mai,mg,min,ml,mn,mr,mrj,ms,mt,my,mzn,nah,nap,nds,ne,new,nn,no,oc,or,os,pa,pms,pnb,qu,sa,sah,scn,sco,sh,si,simple,sq,su,sw,ta,te,tg,th,tl,tt,uz,vec,wa,xmf,yo,zh-min-nan,zh-yue
# default namespaces: Main, File, Category, Template
# we only want abstracts for articles -> only main namespace
namespaces=Main

# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"

extractors=.NifExtractor

# if ontology and mapping files are not given or do not exist, download info from mappings.dbpedia.org
# ontology=see universal.properties
# mappings=see universal.properties

# Serialization URI policies and file formats. Quick guide:
# uri-policy keys: uri, generic, xml-safe, reject-long
# uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts
# uri-policy values: comma-separated languages or '*' for all languages
# format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads
# See http://git.io/DBpedia-serialization-format-properties for details.

# For backwards compatibility, en uses generic URIs. All others use local IRIs.
# uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
uri-policy.iri=generic:en; xml-safe-predicates:*

# NT is unreadable anyway - might as well use URIs for en
# format.nt.gz=n-triples;uri-policy.uri
# format.nq.gz=n-quads;uri-policy.uri

# Turtle is much more readable - use nice IRIs for all languages
format.ttl.bz2=turtle-triples;uri-policy.iri
#format.tql.bz2=turtle-quads;uri-policy.iri


#the following parameters are for the mediawiki api connection used in nif and abstract extraction
mwc-apiUrl=https://{{LANG}}.wikipedia.org/w/api.php
mwc-maxRetries=5
mwc-connectMs=4000
mwc-readMs=30000
mwc-sleepFactor=2000

#parameters specific for the abstract extraction
abstract-query=&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s
# the tag path of the XML tags under which the result is expected
abstract-tags=api,query,pages,page,extract
# the properties used to specify long- and short abstracts (should not change)
short-abstracts-property=rdfs:comment
long-abstracts-property=abstract
# the short abstract is at least this long
short-abstract-min-length=200

#parameters specific to the nif extraction

#only extract abstract (not the whole page)
nif-extract-abstract-only=false
#the request query string
nif-query=&format=xml&action=parse&prop=text&page=%s&pageid=%d
#the xml path of the response
nif-tags=api,parse,text
# will leave out the long and short abstract datasets
nif-isTestRun=false
# will write all anchor texts for each nif instance
nif-write-anchor=true
# write only the anchor text for link instances
nif-write-link-anchor=true