dbpedia-assoc / MARVIN-config

Commit 2c78490a, authored Oct 09, 2019 by Marvin Hofer

dbpedia_extraction_minimal_runscript.sh: update

Parent: 7be22d44
Changes: 12
config.d/download.generic.properties (new file, 0 → 100644)
# Default download server. The commented alternatives are mirrors which may be faster.
# base-url=https://dumps.wikimedia.org/
# base-url=https://ftp.acc.umu.se/mirror/wikimedia.org/dumps/
base-url=http://dumps.wikimedia.your.org/

# The source file name; should be the same as in universal.properties.
# source=pages-articles.xml.bz2

# Languages to download.
languages=en,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue

# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
unzip=false

# Sometimes connecting to the server fails, so we retry five times with pauses of 10 seconds.
retry-max=5
retry-millis=10000

# For a specific dump date (e.g. 20170101); if empty, the most recent dump date is used.
dump-date=
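A usage sketch, not part of this commit: these download configs are written for the DBpedia extraction-framework's dump module, which its README launches roughly like this (paths assume a checkout of the framework next to this config repo):

cd extraction-framework/dump
../run download config=download.generic.properties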
config.d/download.mappings.properties (new file, 0 → 100644)
# Default download server. It lists mirrors which may be faster.
base-url=http://dumps.wikimedia.your.org/

# The source file name; should be the same as in universal.properties.
# source=pages-articles.xml.bz2

# Languages to download; @mappings expands to all languages with mappings on the DBpedia mappings wiki.
languages=@mappings

# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
unzip=false

# Sometimes connecting to the server fails, so we retry five times with pauses of 10 seconds.
retry-max=5
retry-millis=10000

# For a specific dump date (e.g. 20170101); if empty, the most recent dump date is used.
dump-date=
config.d/download.wikidata.properties (new file, 0 → 100644)
# Default download server. It lists mirrors which may be faster.
base-url=https://dumps.wikimedia.your.org/

# The source file name; should be the same as in universal.properties.
# source=pages-articles.xml.bz2

# Languages to download.
languages=wikidata

# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
unzip=false

# Sometimes connecting to the server fails, so we retry five times with pauses of 10 seconds.
retry-max=5
retry-millis=10000

# For a specific dump date (e.g. 20170101); if empty, the most recent dump date is used.
dump-date=
config.d/extraction.mappings.properties (new file, 0 → 100644)
# Download and extraction target dir.
#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties

# Source file. If the source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-,
# where xx is the wiki code and yyyymmdd is the dump date.
# default:
#source= moved to $extraction-framework/core/src/main/resources/universal.properties

# Use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false

# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'.
languages=@mappings

# Extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings".
extractors=.MappingExtractor

# Per-language extractor configurations (commented out):
#extractors.ar=.MappingExtractor,.TopicalConceptsExtractor
#extractors.be=.MappingExtractor
#extractors.bg=.MappingExtractor
#extractors.bn=.MappingExtractor
#extractors.ca=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor
#extractors.ced=.MappingExtractor
#extractors.commons=.MappingExtractor,.ContributorExtractor,.TemplateParameterExtractor,.FileTypeExtractor,.GalleryExtractor,.ImageAnnotationExtractor,.CommonsKMLExtractor,.DBpediaResourceExtractor
#extractors.cs=.MappingExtractor
#extractors.cy=.MappingExtractor
#extractors.da=.MappingExtractor
#extractors.de=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.PersondataExtractor,.PndExtractor,.CommonsResourceExtractor
#extractors.el=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor
#extractors.en=.MappingExtractor,.CitationExtractor,.DisambiguationExtractor,.GenderExtractor,.HomepageExtractor,.ImageExtractorNew,.PersondataExtractor,.PndExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor
#extractors.eo=.MappingExtractor
#extractors.es=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor,.CommonsResourceExtractor
#extractors.et=.MappingExtractor
#extractors.eu=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor
#extractors.fa=.MappingExtractor
#extractors.fi=.MappingExtractor
#extractors.fr=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.PndExtractor,.TopicalConceptsExtractor,.fr.PopulationExtractor,.CommonsResourceExtractor
#extractors.ga=.MappingExtractor,.HomepageExtractor
#extractors.gl=.MappingExtractor
#extractors.hi=.MappingExtractor
#extractors.hr=.MappingExtractor
#extractors.hu=.MappingExtractor
#extractors.id=.MappingExtractor
#extractors.it=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor
#extractors.ja=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor,.CommonsResourceExtractor
#extractors.ko=.MappingExtractor,.DisambiguationExtractor
#extractors.lt=.MappingExtractor
#extractors.lv=.MappingExtractor
#extractors.nl=.MappingExtractor,.DisambiguationExtractor,.ImageExtractorNew,.CommonsResourceExtractor
#extractors.mk=.MappingExtractor
#extractors.mt=.MappingExtractor
#extractors.pl=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew
#extractors.pt=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor,.CommonsResourceExtractor
#extractors.ru=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor
#extractors.sk=.MappingExtractor
#extractors.sl=.MappingExtractor
#extractors.sr=.MappingExtractor
#extractors.tr=.MappingExtractor
#extractors.ur=.MappingExtractor
#extractors.vi=.MappingExtractor
#extractors.war=.MappingExtractor
# Only the raw extractor here: all other Wikidata extractors are executed in a separate extraction for Wikidata (see extraction.wikidata.properties).
#extractors.wikidata=.WikidataSameAsExtractor,.WikidataRawExtractor
#extractors.zh=.MappingExtractor

# If we need to exclude non-free images in this extraction, set this to true.
copyrightCheck=false
config.d/extraction.wikidata.properties (new file, 0 → 100644)
# This config is used in a separate extraction for Wikidata.
# Make sure the .WikidataRawExtractor has already been run, and that the redirect script
# has been run on wikidata_raw_unredirected before this extraction!
####################################

# Download and extraction target dir.
#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties

# Source file. If the source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-,
# where xx is the wiki code and yyyymmdd is the dump date.
# default:
#source= moved to $extraction-framework/core/src/main/resources/universal.properties

# Use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=true

# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'.
languages=wikidata

# Extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings".
extractors=.PageIdExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.WikiPageLengthExtractor
extractors.wikidata=.WikidataR2RExtractor,.WikidataRawExtractor,.WikidataReferenceExtractor,.WikidataAliasExtractor,.WikidataLabelExtractor,.WikidataNameSpaceSameAsExtractor,.WikidataPropertyExtractor,.WikidataDescriptionExtractor,.WikidataSameAsExtractor
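The ordering demanded by the header comment can be sketched as follows. This is an illustration, not part of the commit: the `run` launcher comes from the extraction-framework's dump module, and the redirect-resolution step is only named generically here.

cd extraction-framework/dump
# 1. run an extraction whose config enables .WikidataRawExtractor
#    (it produces the wikidata_raw_unredirected dataset)
# 2. resolve redirects on wikidata_raw_unredirected (redirect script)
# 3. only then run this config:
../run extraction extraction.wikidata.properties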
config.d/path-conf.sh (new file, 0 → 100644; no content in this commit)
config.d/prepareGenericArtifacts.sh (new file, 0 → 100644)
#!/bin/bash
set -e

# [CONFIG]
# extracted dumps (basedir)
BASEDIR="/data/extraction/wikidumps/"
# databus-maven-plugin project, containing the release pom
DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
# explicit databus version, or empty for all
DUMPDATE=
# if true, show dummy output instead of copying (dry run)
TRYRUN=false

# [DATASET-MAPPING]
mapLang() {
  lang=$(echo "$1" | sed 's|wiki||g')
  case "$lang" in
    "bat_smg")    echo "_lang=batsmg";;
    "zh_min_nan") echo "_lang=nan";;
    "zh_yue")     echo "_lang=yue";;
    "wikidata")   echo "";;
    *)            echo "_lang=$lang";;
  esac
}

mapExtraction() {
  case "$1" in
    "article-templates-nested") echo "article-templates_nested";;
    "citation-data")            echo "citations_data";;
    "citation-links")           echo "citations_links";;
    "commons-page-links")       echo "commons-sameas-links";;
    "page-ids")                 echo "page_ids";;
    "page-length")              echo "page_length";;
    "page-links")               echo "wikilinks";;
    "article-categories")       echo "categories_articles";;
    "category-labels")          echo "categories_labels";;
    "skos-categories")          echo "categories_skos";;
    "revision-ids")             echo "revisions_ids";;
    "revision-uris")            echo "revisions_uris";;
    *)                          echo "$1";;
  esac
}

# [FUNCTIONS]
collectExtractionFun() {
  # substring removal with ${string##pattern}; see
  # https://www.tldp.org/LDP/abs/html/string-manipulation.html
  for path in $(find "$BASEDIR" -name "*.ttl.bz2"); do
    file="${path##*/}"
    version="${file#*-}"
    version="${version%%-*}"
    version="${version:0:4}.${version:4:2}.${version:6:2}"
    if [ "$DUMPDATE" = "$version" ] || [ -z "$DUMPDATE" ]; then
      lang="${file%%-*}"
      extraction="${file#*-*-}"
      extraction="${extraction%%.*}"
      extension="${file#*.}"
      mapped="$(mapExtraction $extraction)"
      artifact="${mapped%%_*}"
      contVars="$(mapLang $lang)"
      if [[ "$mapped" == *"_"* ]]; then
        contVars="${contVars}_${mapped#*_}"
      fi
      targetArVe="$artifact/$version"
      targetFile="$artifact$contVars.$extension"
      if [ -d "$DATABUSMVNPOMDIR/$artifact" ]; then
        if [ ! -d "$DATABUSMVNPOMDIR/$targetArVe" ]; then
          mkdir -p "$DATABUSMVNPOMDIR/$targetArVe"
        fi
        if $TRYRUN; then
          echo "$path -> $DATABUSMVNPOMDIR/$targetArVe/$targetFile"
        else
          cp -vn "$path" "$DATABUSMVNPOMDIR/$targetArVe/$targetFile"
        fi
      else
        >&2 echo "unmapped or nonexistent artifact: $artifact | $mapped | $extraction"
      fi
    fi
  done
}

# defined but not invoked by main below
renameRedirected() {
  cd $DATABUSMVNPOMDIR
  # for f in $(find . -name "*_redirected*"); do rename -n 's/_redirected\.ttl\.bz2$/\.ttl\.bz2$/' $f; done
  for f in $(find . -name "*_redirected*"); do
    rename -n 's/_redirected//' $f   # -n: dry run
  done
}

# [Main]
main() {
  collectExtractionFun
}

main
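A worked example of the parameter expansions in collectExtractionFun, using a hypothetical dump path (not from this commit):

path="/data/extraction/wikidumps/enwiki/20190901/enwiki-20190901-page-links.ttl.bz2"
file="${path##*/}"              # enwiki-20190901-page-links.ttl.bz2
version="${file#*-}"            # 20190901-page-links.ttl.bz2
version="${version%%-*}"        # 20190901
version="${version:0:4}.${version:4:2}.${version:6:2}"   # 2019.09.01
lang="${file%%-*}"              # enwiki
extraction="${file#*-*-}"       # page-links.ttl.bz2
extraction="${extraction%%.*}"  # page-links
extension="${file#*.}"          # ttl.bz2
# mapExtraction: page-links -> wikilinks; mapLang: enwiki -> _lang=en
# result: wikilinks/2019.09.01/wikilinks_lang=en.ttl.bz2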
config.d/prepareMappingsArtifacts.sh (new file, 0 → 100755)
#!/bin/bash
set -e

# [CONFIG]
# extracted dumps (basedir)
BASEDIR="/data/extraction/wikidumps/"
# databus-maven-plugin project, containing the release pom
DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/mappings"
# explicit databus version, or empty for all
DUMPDATE=
# if true, show dummy output instead of copying (dry run)
TRYRUN=false

# [DATASET-MAPPING]
mapLang() {
  lang=$(echo "$1" | sed 's|wiki||g')
  case "$lang" in
    "bat_smg")    echo "_lang=batsmg";;
    "zh_min_nan") echo "_lang=nan";;
    "zh_yue")     echo "_lang=yue";;
    "wikidata")   echo "";;
    "commons")    echo "_commons";;
    *)            echo "_lang=$lang";;
  esac
}

mapExtraction() {
  case "$1" in
    "instance-types-transitive")            echo "instance-types_transitive";;
    "mappingbased-objects-disjoint-domain") echo "mappingbased-objects_disjointDomain";;
    "mappingbased-objects-disjoint-range")  echo "mappingbased-objects_disjointRange";;
    *)                                      echo "$1";;
  esac
}

# [FUNCTIONS]
copyToMavenPlugin() {
  # substring removal with ${string##pattern}; see
  # https://www.tldp.org/LDP/abs/html/string-manipulation.html
  for path in $(find "$BASEDIR" -name "*.ttl.bz2"); do
    file="${path##*/}"
    version="${file#*-}"
    version="${version%%-*}"
    version="${version:0:4}.${version:4:2}.${version:6:2}"
    if [ "$DUMPDATE" = "$version" ] || [ -z "$DUMPDATE" ]; then
      lang="${file%%-*}"
      extraction="${file#*-*-}"
      extraction="${extraction%%.*}"
      extension="${file#*.}"
      mapped="$(mapExtraction $extraction)"
      artifact="${mapped%%_*}"
      contVars="$(mapLang $lang)"
      if [[ "$mapped" == *"_"* ]]; then
        contVars="${contVars}_${mapped#*_}"
      fi
      targetArVe="$artifact/$version"
      targetFile="$artifact$contVars.$extension"
      if [ -d "$DATABUSMVNPOMDIR/$artifact" ]; then
        if [ ! -d "$DATABUSMVNPOMDIR/$targetArVe" ]; then
          mkdir -p "$DATABUSMVNPOMDIR/$targetArVe"
        fi
        if $TRYRUN; then
          echo "$path -> $DATABUSMVNPOMDIR/$targetArVe/$targetFile"
        else
          cp -vn "$path" "$DATABUSMVNPOMDIR/$targetArVe/$targetFile"
        fi
      else
        >&2 echo "unmapped or nonexistent artifact: $artifact | $mapped | $extraction"
      fi
    fi
  done
}

# [MAIN]
main() {
  copyToMavenPlugin
}

main
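The mappings variant differs from the generic one mainly in the "commons" language case and the instance-types/mappingbased-objects renames. A small runnable sketch (hypothetical file name) of how those pieces compose into the target file name:

file="commonswiki-20190901-instance-types-transitive.ttl.bz2"     # hypothetical
mapped="instance-types_transitive"   # = mapExtraction instance-types-transitive
contVars="_commons"                  # = mapLang commonswiki
[[ "$mapped" == *"_"* ]] && contVars="${contVars}_${mapped#*_}"   # _commons_transitive
echo "${mapped%%_*}${contVars}.${file#*.}"   # instance-types_commons_transitive.ttl.bz2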
config.d/prepareWikidataArtifacts.sh (new file, 0 → 100644)
#!/bin/bash
set -e

# [CONFIG]
# extracted dumps (basedir)
BASEDIR="/data/extraction/wikidumps/"
# databus-maven-plugin project, containing the release pom
DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/wikidata"
# explicit databus version, or empty for all
DUMPDATE=
# if true, show dummy output instead of copying (dry run)
TRYRUN=false

# [TODO]
echo "----------------------------"
echo "Prepare Wikidata for Databus"
echo "----------------------------"

cd $BASEDIR
files=$(find wikidatawiki -name "*.ttl.bz2" | sort -u)

function exceptDataset {
  case $1 in
    "duplicate-iri-split")  echo "debug";;
    "r2r-mapping-errors")   echo "debug";;
    "type-like-statements") echo "debug";;
    *)                      echo "$1";;
  esac
}

function exceptName {
  case $1 in
    "duplicate-iri-split")  echo "debug_duplicateirisplit";;
    "r2r-mapping-errors")   echo "debug_r2rmappingerrors";;
    "type-like-statements") echo "debug_typelikestatements";;
    *)                      echo "$1";;
  esac
}

for file in $files; do
  name=${file##*/}
  name=$(echo $name | cut -d"." -f1)
  dumpVersion=${file%/*}
  dumpVersion=${dumpVersion##*/}
  version="${dumpVersion:0:4}.${dumpVersion:4:2}.${dumpVersion:6:2}"

  # collect content-variant suffixes from the file name
  CONTVAR=""
  if [[ $name == *"-nmw"* ]]; then CONTVAR="${CONTVAR}_nmw"; fi
  if [[ $name == *"-reified"* ]]; then CONTVAR="${CONTVAR}_reified"; fi
  if [[ $name == *"-reified-qualifiers"* ]]; then CONTVAR="${CONTVAR}_qualifiers"; fi
  if [[ $name == *"-redirected"* ]]; then CONTVAR="${CONTVAR}_redirected"; fi
  if [[ $name == *"-length"* ]]; then CONTVAR="${CONTVAR}_length"; fi
  if [[ $name == *"-ids"* ]]; then CONTVAR="${CONTVAR}_ids"; fi
  if [[ $name == *"-uris"* ]]; then CONTVAR="${CONTVAR}_uris"; fi
  if [[ $name == *"-transitive"* ]]; then CONTVAR="${CONTVAR}_transitive"; fi

  dataset=$(echo $name | sed -e "s/wikidatawiki-$dumpVersion-//g; s/-nmw//g; s/wikidata-//g; s/-reified//g; s/-qualifiers//g; s/-redirected//g; s/-ids//g; s/-length//g; s/-uris//g; s/-transitive//g; s/transitive-//g")
  new_name="${dataset}${CONTVAR}"

  if [[ $dataset == *"interlanguage-links"* ]]; then
    new_name="interlanguage-links_lang="$(echo $dataset | sed "s/interlanguage-links-//g")
    dataset="interlanguage-links"
  fi

  dataset=$(exceptDataset $dataset)
  new_name=$(exceptName $new_name)
  # re-append the file extension stripped from $name above
  new_name=$new_name$(echo ${file##*/} | sed "s/$name//g")

  mkdir -p $DATABUSMVNPOMDIR/$dataset/$version/
  cp -vn $file $DATABUSMVNPOMDIR/$dataset/$version/$new_name
done
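A trace of the name handling above for a hypothetical Wikidata dump file (not from this commit):

file="wikidatawiki/20190901/wikidatawiki-20190901-instance-types-transitive.ttl.bz2"
name=${file##*/}; name=$(echo $name | cut -d"." -f1)    # wikidatawiki-20190901-instance-types-transitive
dumpVersion=${file%/*}; dumpVersion=${dumpVersion##*/}  # 20190901
version="${dumpVersion:0:4}.${dumpVersion:4:2}.${dumpVersion:6:2}"   # 2019.09.01
# "-transitive" in $name sets CONTVAR=_transitive; the sed strips the prefix
# and suffixes, leaving dataset=instance-types, so the copy target is
# $DATABUSMVNPOMDIR/instance-types/2019.09.01/instance-types_transitive.ttl.bz2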
config.d/sparkextraction.generic.en.properties (new file, 0 → 100644)
# Download and extraction target dir.
#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties

# Source file. If the source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-,
# where xx is the wiki code and yyyymmdd is the dump date.
# default:
#source= moved to $extraction-framework/core/src/main/resources/universal.properties

spark-master=local[32]

# Use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false

# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'.
# en is excluded from the generic list (commented below) because it seemed too big for local[32];
# this config runs en on its own.
#languages=af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
languages=en

# Extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings".
extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\
  .ExternalLinksExtractor,.GeoExtractor,.InfoboxExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\
  .PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\
  .WikiPageLengthExtractor,.WikiPageOutDegreeExtractor

extractors.en=.CitationExtractor,.DisambiguationExtractor,.HomepageExtractor,.PersondataExtractor,.PndExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor

# If we need to exclude non-free images in this extraction, set this to true.
copyrightCheck=false
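Note on spark-master: Spark's local[N] master URL runs the whole extraction in a single local JVM with N worker threads; pointing spark-master at a spark://host:port URL would instead submit the job to a standalone cluster.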
config.d/sparkextraction.generic.properties (new file, 0 → 100644)
# Download and extraction target dir.
#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties

# Source file. If the source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-,
# where xx is the wiki code and yyyymmdd is the dump date.
# default:
#source= moved to $extraction-framework/core/src/main/resources/universal.properties

spark-master=local[32]

# Use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false

# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'.
# en is excluded here; it seemed too big for local[32] (see sparkextraction.generic.en.properties).
languages=af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue

# Extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings".
extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\
  .ExternalLinksExtractor,.GeoExtractor,.InfoboxExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\
  .PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\
  .WikiPageLengthExtractor,.WikiPageOutDegreeExtractor

extractors.ar=.TopicalConceptsExtractor
extractors.be=
extractors.bg=