# NOTE: this properties file is imported in every extraction process and contains general parameters
# which only have to be set once per release.

# The DBpedia version to be extracted (in this format: YYYY-MM)
# SH Note: leaving it blank takes the latest.
dbpedia-version=

# Replace with your Wikipedia dump download directory (should not change over the course of a release)
# base-dir=/data/extraction/wikidumps/
# AUTOMATICALLY SET BY setup-dief.sh
# base-dir=$BASEDIR

# The log file directory - used to store all log files created in the course of all extractions
# log-dir=/data/extraction/logs/extraction/
# AUTOMATICALLY SET BY setup-dief.sh
# log-dir=$LOGDIR/extraction/

# Options for the SparkExtraction
spark-master=local[32]
# AUTOMATICALLY SET BY setup-dief.sh
# spark-local-dir=/data/marvin-config/marvin-extraction/spark.local.dir/

# To forward extraction summaries and warnings via the Slack API, use these options
-slack-webhook=https://hooks.slack.com/services/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
-slack-username=username
-slack-summary-threshold=1000000
-slack-exception-threshold=10

# wiki suffix: should be 'wiki'
wiki-name=wiki

# wikidata mapping file
wikidata-property-mappings-file=wikidata-property-mappings.json

###### Extract from part files ######
#
# Please make sure that the regex actually matches the format used by ALL the wikis you are going to extract from!
# One that should work in all cases is
# source=@pages-articles\\d*\\.xml(-p\\d+p\\d+)?\\.bz2
#
# NOTE: when using the above regex, make sure you do not have part files AND regular dump files together
# for the same wiki, e.g. frwiki-20131120-pages-articles1.xml.bz2 and frwiki-20131120-pages-articles.xml.bz2,
# as they will BOTH match and result in duplicate output data.
#
# Example:
# enwiki      => enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2, hence @pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2 matches
# frwiki      => frwiki-latest-pages-articles1.xml.bz2, hence @pages-articles\\d+\\.xml\\.bz2 matches (the previous regex does not!)
# commonswiki => it does not have part files! This is true for other wikis as well.
#
# source=@pages-articles\\d+\\.xml-p\\d+p\\d+\\.bz2
# In case of multistream chunks:
# source=@pages-articles-multistream\\.xml\\.\\d+\\.bz2

# Source file. If the source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.
# (default: pages-articles.xml.bz2)
source=pages-articles-multistream.xml.bz2

# Parallel Disc Processes: indicates how many extraction processes can run in parallel
# when each involves reading files from the disc.
# This number is highly dependent on the number (RAID > 0) and type (SSD, HDD) of discs in use
# as well as on the number of cores available.
parallel-processes=8

# If the ontology and mapping files are not given or do not exist,
# the information is downloaded from mappings.dbpedia.org.
# By default both should be in the root folder ../
ontology=../ontology.xml
mappings=../mappings

# disambiguations file (default: "page_props.sql.gz")
disambiguations=page_props.sql.gz
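# Putting the input settings together - an illustrative example only (the wiki code and dump date
# below are hypothetical and not part of this configuration): assuming base-dir=/data/extraction/wikidumps/
# as in the commented example above, and source=pages-articles-multistream.xml.bz2, the extraction
# would look for a dump file such as
#   /data/extraction/wikidumps/enwiki/20240601/enwiki-20240601-pages-articles-multistream.xml.bz2
# i.e. <base-dir>/<wikicode><wiki-name>/<yyyymmdd>/<wikicode><wiki-name>-<yyyymmdd>-<source>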
# Serialization URI policies and file formats. Quick guide:
# uri-policy keys: uri, generic, xml-safe, reject-long
# uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts
# uri-policy values: comma-separated languages or '*' for all languages
# format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads
# See http://git.io/DBpedia-serialization-format-properties for details.

# For backwards compatibility, en uses generic URIs. All others use local IRIs.
# uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*; reject-long:*
uri-policy.iri=generic:en;xml-safe-predicates:*;reject-long:*

# Turtle is much more readable - use nice IRIs for all languages
format.ttl.bz2=turtle-triples;uri-policy.iri
# format.tql.bz2=turtle-quads;uri-policy.iri

# Extraction Monitor: after extraction, compare triple counts to older DBpedia versions using the DatasetID file.
# expected-changes=Float,Float defines the expected interval for the triple-count changes
compare-dataset-ids=false
previous-base-dir=http://downloads.dbpedia.org/2016-10/core-i18n/
expected-changes=-1.0,9.0
summarize-exceptions=true

###########################
## Abstract and NIF extraction