# NOTE: this properties file is imported into every extraction process and contains general parameters which only have to be set once per release

# The DBpedia version to be extracted (in this format: YYYY-MM)
# Note: leaving this blank uses the latest version.
dbpedia-version=
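# Example (illustrative value only; any released version in YYYY-MM format works):
# dbpedia-version=2016-10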

# Replace with your Wikipedia dump download directory (should not change over the course of a release)
# base-dir=/data/extraction/wikidumps/
# AUTOMATICALLY SET BY setup-dief.sh
# base-dir=$BASEDIR

# The log file directory - used to store all log files created in the course of all extractions
# log-dir=/data/extraction/logs/extraction/
# AUTOMATICALLY SET BY setup-dief.sh
# log-dir=$LOGDIR/extraction/
# Options for the SparkExtraction
spark-master=local[32]
# AUTOMATICALLY SET BY setup-dief.sh
# spark-local-dir=/data/marvin-config/marvin-extraction/spark.local.dir/
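# Other standard Spark master URLs work here too (standard Spark syntax; host and port are illustrative):
# spark-master=local[*]                  # all available local cores
# spark-master=spark://master-host:7077  # standalone Spark cluster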

# to forward extraction summaries and warnings via the slack API, use this option
-slack-webhook=https://hooks.slack.com/services/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
-slack-username=username
-slack-summary-threshold=1000000
-slack-exception-threshold=10

# wiki suffix: should be 'wiki'
wiki-name=wiki

# wikidata mapping file
wikidata-property-mappings-file=wikidata-property-mappings.json

###### Extract from part files ######
#
# Please make sure that the regex actually matches the format used by ALL the wikis you are going to extract from!
# One that should work in all cases is
# source=@pages-articles\\d*\\.xml(-p\\d+p\\d+)?\\.bz2
#
# NOTE: when using the above regex you should make sure you do not have part files AND regular dump files together
# for the same wiki, e.g. frwiki-20131120-pages-articles1.xml.bz2 and frwiki-20131120-pages-articles.xml.bz2, as they
# BOTH will match and that will result in duplicate output data
#
# Example:
# enwiki => enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 hence @pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2 matches
# frwiki => frwiki-latest-pages-articles1.xml.bz2 hence @pages-articles\\d+\\.xml\\.bz2 matches (the previous regex does not!)
# commonswiki => has no part files! The same is true for some other wikis.
#
# source=@pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2

# In case of multistream chunks
# source=@pages-articles-multistream\\.xml\\.\\d+\\.bz2

# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.
# (default: pages-articles.xml.bz2)
source=pages-articles-multistream.xml.bz2
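# Illustrative example of the resulting layout (wiki code and dump date are hypothetical):
# <base-dir>/enwiki/20161001/enwiki-20161001-pages-articles-multistream.xml.bz2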

# Parallel disc processes: indicates how many parallel extraction processes can be executed
# when each involves reading files from the disc.
# This number depends heavily on the number (RAID > 0) and type (SSD, HDD) of discs in use,
# as well as on the number of available cores.
parallel-processes=8

# If ontology and mapping files are not given or do not exist,
# the info is downloaded from mappings.dbpedia.org.
# By default both should be in the parent folder ../
ontology=../ontology.xml
mappings=../mappings

# disambiguations file (default: "page_props.sql.gz")
disambiguations=page_props.sql.gz

# Serialization URI policies and file formats. Quick guide:
# uri-policy keys: uri, generic, xml-safe, reject-long
# uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts
# uri-policy values: comma-separated languages or '*' for all languages
# format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads
# See http://git.io/DBpedia-serialization-format-properties for details.

# For backwards compatibility, en uses generic URIs. All others use local IRIs.
# uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*; reject-long:*
uri-policy.iri=generic:en;xml-safe-predicates:*;reject-long:*

# Turtle is much more readable - use nice IRIs for all languages
format.ttl.bz2=turtle-triples;uri-policy.iri
# format.tql.bz2=turtle-quads;uri-policy.iri
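# Further formats follow the same pattern, format.<extension>=<serialization>;<policy>.
# An illustrative N-Triples variant (assuming the uri-policy.iri defined above):
# format.nt.bz2=n-triples;uri-policy.iri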

# Extraction monitor: compares triple counts with older DBpedia versions using the DatasetID file after extraction.
# expected-changes=Float,Float: defines the expected interval for the triple-count changes
compare-dataset-ids=false
previous-base-dir=http://downloads.dbpedia.org/2016-10/core-i18n/
expected-changes=-1.0,9.0
summarize-exceptions=true
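# Illustrative reading of the interval above (this interpretation is an assumption):
# expected-changes=-1.0,9.0 would tolerate anything from a 100% decrease (-1.0)
# to a 900% increase (9.0) in triple counts relative to the previous release.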


###########################
## Abstract and NIF extraction