# NOTE: this properties file is imported in every extraction process and contains general parameters which only need to be set once per release
# The DBpedia version to be extracted (in this format: YYYY-MM)
# Note: leaving this blank uses the latest available version.
dbpedia-version=
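# Example (hypothetical release date, for illustration only):
# dbpedia-version=2022-09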
# Replace with your Wikipedia dump download directory (should not change over the course of a release)
# AUTOMATICALLY SET BY setup-dief.sh
# base-dir=$BASEDIR
# The log file directory, used to store all log files created during the extractions
# log-dir=/data/extraction/logs/extraction/
# AUTOMATICALLY SET BY setup-dief.sh
# log-dir=$LOGDIR/extraction/
# Options for the SparkExtraction
spark-master=local[32]
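# local[32] runs Spark locally with 32 worker threads. Any standard Spark master
# URL can be used instead, e.g. a standalone cluster (hypothetical host):
# spark-master=spark://extraction-host:7077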
# AUTOMATICALLY SET BY setup-dief.sh
# spark-local-dir=/data/marvin-config/marvin-extraction/spark.local.dir/
# To forward extraction summaries and warnings via the Slack API, use these options
slack-webhook=https://hooks.slack.com/services/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
slack-username=username
slack-summary-threshold=1000000
slack-exception-threshold=10
# wiki suffix: should be 'wiki'
wiki-name=wiki
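# The suffix is appended to the language code to form the dump name,
# e.g. en + wiki => enwiki, fr + wiki => frwiki.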
# wikidata mapping file
wikidata-property-mappings-file=wikidata-property-mappings.json
###### Extract from part files ######
#
# Please make sure that the regex actually matches the format used by ALL the wikis you are going to extract from!
# One that should work in all cases is
# source=@pages-articles\\d*\\.xml(-p\\d+p\\d+)?\\.bz2
#
# NOTE: when using the above regex, make sure you do not have part files AND regular dump files together
# for the same wiki, e.g. frwiki-20131120-pages-articles1.xml.bz2 and frwiki-20131120-pages-articles.xml.bz2,
# as both will match, resulting in duplicate output data
#
# Example:
# enwiki => enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 hence @pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2 matches
# frwiki => frwiki-latest-pages-articles1.xml.bz2 hence @pages-articles\\d+\\.xml\\.bz2 matches (the previous regex does not!)
# commonswiki => it does not have part files (this is true for some other wikis as well)
#
# source=@pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2
# In case of multistream chunks
# source=@pages-articles-multistream\\.xml\\.\\d+\\.bz2
# Source file. If the source file name ends in .gz or .bz2, it is decompressed on the fly.
# It must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-,
# where xx is the wiki code and yyyymmdd is the dump date.
# (default: pages-articles.xml.bz2)
source=pages-articles-multistream.xml.bz2
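# Given the defaults above, the English dump of a (hypothetical) 2022-09-01 run
# would be expected at: <base-dir>/enwiki/20220901/enwiki-20220901-pages-articles-multistream.xml.bz2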
# Parallel disk processes: how many extraction processes may run in parallel
# when each involves reading files from disk.
# This number depends heavily on the number (RAID > 0) and type (SSD vs. HDD)
# of disks in use, as well as on the number of available cores.
parallel-processes=8
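# Rough, hypothetical guidance: a single HDD may call for a small value (e.g. 2-4)
# to avoid seek thrashing, while an SSD RAID with many cores can sustain 8 or more.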
# If the ontology and mapping files are not given or do not exist,
# they are downloaded from mappings.dbpedia.org.
# By default both should be in the root folder ../
ontology=../ontology.xml
mappings=../mappings
# disambiguations file (default: "page_props.sql.gz")
disambiguations=page_props.sql.gz
# Serialization URI policies and file formats. Quick guide:
# uri-policy keys: uri, generic, xml-safe, reject-long
# uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts
# uri-policy values: comma-separated languages or '*' for all languages
# format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads
# See http://git.io/DBpedia-serialization-format-properties for details.
# For backwards compatibility, en uses generic URIs. All others use local IRIs.
# uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*; reject-long:*
uri-policy.iri=generic:en;xml-safe-predicates:*;reject-long:*
# Turtle is much more readable - use nice IRIs for all languages
format.ttl.bz2=turtle-triples;uri-policy.iri
# format.tql.bz2=turtle-quads;uri-policy.iri
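# Further outputs can be enabled analogously, e.g. plain N-Triples (commented example):
# format.nt.bz2=n-triples;uri-policy.iri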
# Extraction Monitor: after extraction, compares triple counts against an older DBpedia version using the DatasetID file.
# expected-changes=Float,Float : defines the accepted interval for triple-count changes
compare-dataset-ids=false
previous-base-dir=http://downloads.dbpedia.org/2016-10/core-i18n/
expected-changes=-1.0,9.0
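# Assumption about the semantics: -1.0,9.0 accepts relative changes between a 100%
# decrease and a 9-fold increase; tighten the interval if a release should change less.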
summarize-exceptions=true
###########################
## Abstract and NIF extraction