abstract

2875316d · Sebastian Hellmann · f0b16abd · 2875316d · 2875316d
Commit 2875316d authored 5 years ago by Sebastian Hellmann
--- a/extractionConfiguration/extraction.text.en.properties
+++ b/extractionConfiguration/extraction.text.en.properties
+# make sure to fill out the ../core/src/main/resources/universal.properties first and reinstall
+
+# Replace with your Wikipedia dump download directory (should not change over the course of a release)
+# The log file directory - used to store all log files created in the course of all extractions
+#
+#log-dir= see: ../core/src/main/resources/universal.properties
+
+# WikiPages failed to extract in the first try can be retried with this option (especially interesting when extraction from the mediawiki api)
+retry-failed-pages=false
+
+# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. 
+# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
+# where xx is the wiki code and yyyymmdd is the dump date.
+ 
+# default:
+# source=pages-articles.xml.bz2
+
+# alternatives:
+# source=pages-articles.xml.gz
+# source=pages-articles.xml
+
+# use only directories that contain a 'download-complete' file? Default is false.
+require-download-complete=false
+
+# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
+# NOTE sync with minidumps
+#languages=af,als,am,an,arz,ast,azb,ba,bar,bat-smg,bpy,br,bs,bug,cdo,ce,ceb,ckb,cv,fo,fy,gd,he,hsb,ht,ia,ilo,io,is,jv,ka,kn,ku,ky,la,lb,li,lmo,mai,mg,min,ml,mn,mr,mrj,ms,mt,my,mzn,nah,nap,nds,ne,new,nn,no,oc,or,os,pa,pms,pnb,qu,sa,sah,scn,sco,sh,si,simple,sq,su,sw,ta,te,tg,th,tl,tt,uz,vec,wa,xmf,yo,zh-min-nan,zh-yue
+languages=en
+# default namespaces: Main, File, Category, Template
+# we only want abstracts for articles -> only main namespace
+namespaces=Main
+
+# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
+
+extractors=.NifExtractor
+
+# if ontology and mapping files are not given or do not exist, download info from mappings.dbpedia.org
+# ontology=see universal.properties
+# mappings=see universal.properties
+
+# Serialization URI policies and file formats. Quick guide:
+# uri-policy keys: uri, generic, xml-safe, reject-long
+# uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts
+# uri-policy values: comma-separated languages or '*' for all languages
+# format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads
+# See http://git.io/DBpedia-serialization-format-properties for details.
+
+# For backwards compatibility, en uses generic URIs. All others use local IRIs.
+# uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
+uri-policy.iri=generic:en; xml-safe-predicates:*
+
+# NT is unreadable anyway - might as well use URIs for en
+# format.nt.gz=n-triples;uri-policy.uri
+# format.nq.gz=n-quads;uri-policy.uri
+
+# Turtle is much more readable - use nice IRIs for all languages
+format.ttl.bz2=turtle-triples;uri-policy.iri
+#format.tql.bz2=turtle-quads;uri-policy.iri
+
+
+#the following parameters are for the mediawiki api connection used in nif and abstract extraction
+mwc-apiUrl=https://{{LANG}}.wikipedia.org/w/api.php
+mwc-maxRetries=5
+mwc-connectMs=4000
+mwc-readMs=30000
+mwc-sleepFactor=2000
+
+#parameters specific for the abstract extraction
+abstract-query=&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s
+# the tag path of the XML tags under which the result is expected
+abstract-tags=api,query,pages,page,extract
+# the properties used to specify long- and short abstracts (should not change)
+short-abstracts-property=rdfs:comment
+long-abstracts-property=abstract
+# the short abstract is at least this long
+short-abstract-min-length=200
+
+#parameters specific to the nif extraction
+
+#only extract abstract (not the whole page)
+nif-extract-abstract-only=false
+#the request query string
+nif-query=&format=xml&action=parse&prop=text&page=%s&pageid=%d
+#the xml path of the response
+nif-tags=api,parse,text
+# will leave out the long and short abstract datasets
+nif-isTestRun=false
+# will write all anchor texts for each nif instance
+nif-write-anchor=true
+# write only the anchor text for link instances
+nif-write-link-anchor=true
--- a/extractionConfiguration/extraction.text.properties
+++ b/extractionConfiguration/extraction.text.properties
+# make sure to fill out the ../core/src/main/resources/universal.properties first and reinstall
+
+# The log file directory - used to store all log files created in the course of all extractions
+#
+#log-dir= see: ../core/src/main/resources/universal.properties
+
+# WikiPages failed to extract in the first try can be retried with this option (especially interesting when extraction from the mediawiki api)
+retry-failed-pages=false
+
+# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly. 
+# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
+# where xx is the wiki code and yyyymmdd is the dump date.
+ 
+# default:
+# source=pages-articles.xml.bz2
+
+# alternatives:
+# source=pages-articles.xml.gz
+# source=pages-articles.xml
+
+# use only directories that contain a 'download-complete' file? Default is false.
+require-download-complete=false
+
+# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
+# NOTE sync with minidumps
+languages=af,als,am,an,arz,ast,azb,ba,bar,bat-smg,bpy,br,bs,bug,cdo,ce,ceb,ckb,cv,fo,fy,gd,he,hsb,ht,ia,ilo,io,is,jv,ka,kn,ku,ky,la,lb,li,lmo,mai,mg,min,ml,mn,mr,mrj,ms,mt,my,mzn,nah,nap,nds,ne,new,nn,no,oc,or,os,pa,pms,pnb,qu,sa,sah,scn,sco,sh,si,simple,sq,su,sw,ta,te,tg,th,tl,tt,uz,vec,wa,xmf,yo,zh-min-nan,zh-yue
+# default namespaces: Main, File, Category, Template
+# we only want abstracts for articles -> only main namespace
+namespaces=Main
+
+# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
+
+extractors=.NifExtractor
+
+# if ontology and mapping files are not given or do not exist, download info from mappings.dbpedia.org
+# ontology=see universal.properties
+# mappings=see universal.properties
+
+# Serialization URI policies and file formats. Quick guide:
+# uri-policy keys: uri, generic, xml-safe, reject-long
+# uri-policy position modifiers: -subjects, -predicates, -objects, -datatypes, -contexts
+# uri-policy values: comma-separated languages or '*' for all languages
+# format values: n-triples, n-quads, turtle-triples, turtle-quads, trix-triples, trix-quads
+# See http://git.io/DBpedia-serialization-format-properties for details.
+
+# For backwards compatibility, en uses generic URIs. All others use local IRIs.
+# uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
+uri-policy.iri=generic:en; xml-safe-predicates:*
+
+# NT is unreadable anyway - might as well use URIs for en
+# format.nt.gz=n-triples;uri-policy.uri
+# format.nq.gz=n-quads;uri-policy.uri
+
+# Turtle is much more readable - use nice IRIs for all languages
+format.ttl.bz2=turtle-triples;uri-policy.iri
+#format.tql.bz2=turtle-quads;uri-policy.iri
+
+
+#the following parameters are for the mediawiki api connection used in nif and abstract extraction
+mwc-apiUrl=https://{{LANG}}.wikipedia.org/w/api.php
+mwc-maxRetries=5
+mwc-connectMs=4000
+mwc-readMs=30000
+mwc-sleepFactor=2000
+
+#parameters specific for the abstract extraction
+abstract-query=&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s
+# the tag path of the XML tags under which the result is expected
+abstract-tags=api,query,pages,page,extract
+# the properties used to specify long- and short abstracts (should not change)
+short-abstracts-property=rdfs:comment
+long-abstracts-property=abstract
+# the short abstract is at least this long
+short-abstract-min-length=200
+
+#parameters specific to the nif extraction
+
+#only extract abstract (not the whole page)
+nif-extract-abstract-only=false
+#the request query string
+nif-query=&format=xml&action=parse&prop=text&page=%s&pageid=%d
+#the xml path of the response
+nif-tags=api,parse,text
+# will leave out the long and short abstract datasets
+nif-isTestRun=false
+# will write all anchor texts for each nif instance
+nif-write-anchor=true
+# write only the anchor text for link instances
+nif-write-link-anchor=true