Commit 2c78490a authored by Marvin Hofer

dbpedia_extraction_minimal_runscript.sh: update

parent 7be22d44
# Default download server. It lists mirrors which may be faster.
# base-url=https://dumps.wikimedia.org/
# base-url=https://ftp.acc.umu.se/mirror/wikimedia.org/dumps/
base-url=http://dumps.wikimedia.your.org/
# the source file name
# should be the same as in universal.properties
# source=pages-articles.xml.bz2
# languages to download
languages=en,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
unzip=false
# Sometimes connecting to the server fails, so we try five times with pauses of 10 seconds.
retry-max=5
retry-millis=10000
# For a specific dump date (e.g. 20170101); if empty, the most recent dump date is used
dump-date=
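# A minimal sketch of running the download step with this file, assuming the
# standard extraction-framework checkout (launcher and config names as in the
# framework README; treat them as illustrative):
#   cd extraction-framework/dump
#   ../run download config=download.properties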
# Default download server. It lists mirrors which may be faster.
base-url=http://dumps.wikimedia.your.org/
# the source file name
# should be the same as in universal.properties
# source=pages-articles.xml.bz2
# languages to download
languages=@mappings
# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
unzip=false
# Sometimes connecting to the server fails, so we try five times with pauses of 10 seconds.
retry-max=5
retry-millis=10000
# For a specific dump date (e.g. 20170101); if empty, the most recent dump date is used
dump-date=
# Default download server. It lists mirrors which may be faster.
base-url=https://dumps.wikimedia.your.org/
# the source file name
# should be the same as in universal.properties
# source=pages-articles.xml.bz2
# languages to download
languages=wikidata
# Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
unzip=false
# Sometimes connecting to the server fails, so we try five times with pauses of 10 seconds.
retry-max=5
retry-millis=10000
# For a specific dump date (e.g. 20170101); if empty, the most recent dump date is used
dump-date=
# download and extraction target dir
#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties
# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.
# default:
#source=# moved to $extraction-framework/core/src/main/resources/universal.properties
# use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false
# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
languages=@mappings
# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
extractors=.MappingExtractor
#extractors.ar=.MappingExtractor,.TopicalConceptsExtractor
#
#extractors.be=.MappingExtractor
#
#extractors.bg=.MappingExtractor
#
#extractors.bn=.MappingExtractor
#
#extractors.ca=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor
#
#extractors.ced=.MappingExtractor
#
#extractors.commons=.MappingExtractor,.ContributorExtractor,.TemplateParameterExtractor,.FileTypeExtractor,.GalleryExtractor,.ImageAnnotationExtractor,.CommonsKMLExtractor,.DBpediaResourceExtractor
#
#extractors.cs=.MappingExtractor
#
#extractors.cy=.MappingExtractor
#
#extractors.da=.MappingExtractor
#
#extractors.de=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.PersondataExtractor,.PndExtractor,.CommonsResourceExtractor
#
#extractors.el=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor
#
#extractors.en=.MappingExtractor,.CitationExtractor,.DisambiguationExtractor,.GenderExtractor,.HomepageExtractor,.ImageExtractorNew,.PersondataExtractor,.PndExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor
#
#extractors.eo=.MappingExtractor
#
#extractors.es=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor,.CommonsResourceExtractor
#
#extractors.et=.MappingExtractor
#
#extractors.eu=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor
#
#extractors.fa=.MappingExtractor
#
#extractors.fi=.MappingExtractor
#
#extractors.fr=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.PndExtractor,.TopicalConceptsExtractor,.fr.PopulationExtractor,.CommonsResourceExtractor
#
#extractors.ga=.MappingExtractor,.HomepageExtractor
#
#extractors.gl=.MappingExtractor
#
#extractors.hi=.MappingExtractor
#
#extractors.hr=.MappingExtractor
#
#extractors.hu=.MappingExtractor
#
#extractors.id=.MappingExtractor
#
#extractors.it=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor
#
#extractors.ja=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor,.CommonsResourceExtractor
#
#extractors.ko=.MappingExtractor,.DisambiguationExtractor
#
#extractors.lt=.MappingExtractor
#
#extractors.lv=.MappingExtractor
#
#extractors.nl=.MappingExtractor,.DisambiguationExtractor,.ImageExtractorNew,.CommonsResourceExtractor
#
#extractors.mk=.MappingExtractor
#
#extractors.mt=.MappingExtractor
#
#extractors.pl=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew
#
#extractors.pt=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor,.CommonsResourceExtractor
#
#extractors.ru=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.ImageExtractorNew,.TopicalConceptsExtractor
#
#extractors.sk=.MappingExtractor
#
#extractors.sl=.MappingExtractor
#
#extractors.sr=.MappingExtractor
#
#extractors.tr=.MappingExtractor
#
#extractors.ur=.MappingExtractor
#
#extractors.vi=.MappingExtractor
#
#extractors.war=.MappingExtractor
#only the raw extractor here: all other wikidata extractors are executed in a separate extraction for wikidata (see: extraction.wikidata.properties)
#extractors.wikidata=.WikidataSameAsExtractor,.WikidataRawExtractor
#extractors.zh=.MappingExtractor
# To exclude non-free images in this extraction, set this to true
copyrightCheck=false
# this is used in a separate extraction for wikidata
# make sure the .WikidataRawExtractor has already been run, and that the redirect script has been run on wikidata_raw_unredirected before this extraction!
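# A minimal sketch of running this extraction, assuming the file is saved in the
# dump module (the config file name is illustrative):
#   cd extraction-framework/dump
#   ../run extraction extraction.mappings.properties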
####################################
# download and extraction target dir
#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties
# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.
# default:
#source=# moved to $extraction-framework/core/src/main/resources/universal.properties
# use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=true
# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
languages=wikidata
# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
extractors=.PageIdExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.WikiPageLengthExtractor
extractors.wikidata=.WikidataR2RExtractor,.WikidataRawExtractor,.WikidataReferenceExtractor,.WikidataAliasExtractor,.WikidataLabelExtractor,.WikidataNameSpaceSameAsExtractor,.WikidataPropertyExtractor,.WikidataDescriptionExtractor,.WikidataSameAsExtractor
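# Hedged run-order sketch for this wikidata config, following the note above
# (script and config names are illustrative):
#   1. run an extraction with .WikidataRawExtractor to produce wikidata_raw_unredirected
#   2. run the redirect script on wikidata_raw_unredirected
#   3. run this config, e.g. ../run extraction extraction.wikidata.properties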
#!/bin/bash
set -e
# [CONFIG]
#extracted dumps (basedir)
BASEDIR="/data/extraction/wikidumps/"
#databus-maven-plugin project, containing release pom
DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/generic"
#explicit databus version or empty for all
DUMPDATE=
#if true, only print the target paths instead of copying (dry run)
TRYRUN=false
# [DATASET-MAPPING]
mapLang() {
# strip only the trailing "wiki" from the dump name, so e.g. "wikidatawiki" -> "wikidata"
lang=$(echo "$1" | sed 's|wiki$||')
case "$lang" in
"bat_smg") echo "_lang=batsmg";;
"zh_min_nan") echo "_lang=nan";;
"zh_yue") echo "_lang=yue";;
"wikidata") echo "";;
*) echo "_lang=$lang";;
esac
}
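# Usage sketch (illustrative values):
#   mapLang "enwiki"        # prints "_lang=en"
#   mapLang "bat_smgwiki"   # prints "_lang=batsmg"
#   mapLang "wikidatawiki"  # prints "" (no language variant)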
mapExtraction() {
case "$1" in
"article-templates-nested") echo "article-templates_nested";;
"citation-data") echo "citations_data";;
"citation-links") echo "citations_links";;
"commons-page-links") echo "commons-sameas-links";;
"page-ids") echo "page_ids";;
"page-length") echo "page_length";;
"page-links") echo "wikilinks";;
"article-categories") echo "categories_articles";;
"category-labels") echo "categories_labels";;
"skos-categories") echo "categories_skos";;
"revision-ids") echo "revisions_ids";;
"revision-uris") echo "revisions_uris";;
*) echo "$1";;
esac
}
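# e.g. mapExtraction "page-ids" prints "page_ids"; the caller then splits at the
# first "_" into artifact "page" and content variant "ids"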
# [FUNCTIONS]
collectExtractionFun() {
#filename parsing via parameter expansion, e.g. ${path##*/}
#https://www.tldp.org/LDP/abs/html/string-manipulation.html (Substring Removal)
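# Worked example (hypothetical path):
#   path=$BASEDIR/enwiki/20190701/enwiki-20190701-page-ids.ttl.bz2
#   file=enwiki-20190701-page-ids.ttl.bz2  version=2019.07.01
#   lang=enwiki  extraction=page-ids  extension=ttl.bz2
#   mapped=page_ids -> artifact=page, contVars=_lang=en_ids
#   target: page/2019.07.01/page_lang=en_ids.ttl.bz2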
for path in $(find "$BASEDIR" -name "*.ttl.bz2"); do
file="${path##*/}"
version="${file#*-}"
version="${version%%-*}"
version="${version:0:4}.${version:4:2}.${version:6:2}"
if [ "$DUMPDATE" = "$version" ] || [ -z "$DUMPDATE" ] ; then
lang="${file%%-*}"
extraction="${file#*-*-}"
extraction="${extraction%%.*}"
extension="${file#*.}"
mapped="$(mapExtraction $extraction)"
artifact="${mapped%%_*}"
contVars="$(mapLang $lang)"
if [[ "$mapped" == *"_"* ]]; then
contVars="${contVars}_${mapped#*_}"
fi
targetArVe="$artifact/$version"
targetFile="$artifact$contVars.$extension"
if [ -d "$DATABUSMVNPOMDIR/$artifact" ]; then
if [ ! -d "$DATABUSMVNPOMDIR/$targetArVe" ]; then
mkdir -p "$DATABUSMVNPOMDIR/$targetArVe"
fi
if $TRYRUN; then
echo "$path -> $DATABUSMVNPOMDIR/$targetArVe/$targetFile"
else
cp -vn "$path" "$DATABUSMVNPOMDIR/$targetArVe/$targetFile"
fi
else
>&2 echo "unmapped or nonexistent artifact: $artifact | $mapped | $extraction"
fi
fi
done
}
renameRedirected() {
cd "$DATABUSMVNPOMDIR";
# note: rename -n only prints the would-be renames (dry run); drop -n to actually rename
# for f in $(find . -name "*_redirected*" ); do rename -n 's/_redirected\.ttl\.bz2$/\.ttl\.bz2$/' $f; done
for f in $(find . -name "*_redirected*" ); do rename -n 's/_redirected//' "$f"; done
}
# [Main]
main() {
collectExtractionFun;
}
main;
#!/bin/bash
set -e
# [CONFIG]
#extracted dumps (basedir)
BASEDIR="/data/extraction/wikidumps/"
#databus-maven-plugin project, containing release pom
DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/mappings"
#explicit databus version or empty for all
DUMPDATE=
#if true, only print the target paths instead of copying (dry run)
TRYRUN=false
# [DATASET-MAPPING]
mapLang() {
# strip only the trailing "wiki" from the dump name, so e.g. "commonswiki" -> "commons"
lang=$(echo "$1" | sed 's|wiki$||')
case "$lang" in
"bat_smg") echo "_lang=batsmg";;
"zh_min_nan") echo "_lang=nan";;
"zh_yue") echo "_lang=yue";;
"wikidata") echo "";;
"commons" ) echo "_commons";;
*) echo "_lang=$lang";;
esac
}
mapExtraction() {
case "$1" in
"instance-types-transitive") echo "instance-types_transitive";;
"mappingbased-objects-disjoint-domain") echo "mappingbased-objects_disjointDomain";;
"mappingbased-objects-disjoint-range") echo "mappingbased-objects_disjointRange";;
*) echo "$1";;
esac
}
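# e.g. mapExtraction "instance-types-transitive" prints "instance-types_transitive";
# for a German dump this yields the target file instance-types_lang=de_transitive.ttl.bz2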
# [FUNCTIONS]
copyToMavenPlugin() {
# https://www.tldp.org/LDP/abs/html/string-manipulation.html (Substring Removal)
# filename parsing via parameter expansion, e.g. ${path##*/}
for path in $(find "$BASEDIR" -name "*.ttl.bz2"); do
file="${path##*/}"
version="${file#*-}"
version="${version%%-*}"
version="${version:0:4}.${version:4:2}.${version:6:2}"
if [ "$DUMPDATE" = "$version" ] || [ -z "$DUMPDATE" ] ; then
lang="${file%%-*}"
extraction="${file#*-*-}"
extraction="${extraction%%.*}"
extension="${file#*.}"
mapped="$(mapExtraction $extraction)"
artifact="${mapped%%_*}"
contVars="$(mapLang $lang)"
if [[ "$mapped" == *"_"* ]]; then
contVars="${contVars}_${mapped#*_}"
fi
targetArVe="$artifact/$version"
targetFile="$artifact$contVars.$extension"
if [ -d "$DATABUSMVNPOMDIR/$artifact" ]; then
if [ ! -d "$DATABUSMVNPOMDIR/$targetArVe" ]; then
mkdir -p "$DATABUSMVNPOMDIR/$targetArVe"
fi
if $TRYRUN; then
echo "$path -> $DATABUSMVNPOMDIR/$targetArVe/$targetFile"
else
cp -vn "$path" "$DATABUSMVNPOMDIR/$targetArVe/$targetFile"
fi
else
>&2 echo "unmapped or nonexistent artifact: $artifact | $mapped | $extraction"
fi
fi
done
}
# [MAIN]
main() {
copyToMavenPlugin
}
main
#!/bin/bash
set -e
# [CONFIG]
#extracted dumps (basedir)
BASEDIR="/data/extraction/wikidumps/"
#databus-maven-plugin project, containing release pom
DATABUSMVNPOMDIR="/data/extraction/databus-maven-plugin/dbpedia/wikidata"
#explicit databus version or empty for all
DUMPDATE=
#if true, only print the target paths instead of copying (dry run)
TRYRUN=false
# [TODO]
echo "----------------------------"
echo "Prepare Wikidata for Databus"
echo "----------------------------"
cd "$BASEDIR"
files=$(find wikidatawiki -name "*.ttl.bz2" | sort -u )
function exceptDataset {
case $1 in
"duplicate-iri-split") echo "debug";;
"r2r-mapping-errors") echo "debug";;
"type-like-statements") echo "debug";;
*) echo "$1";;
esac
}
function exceptName {
case $1 in
"duplicate-iri-split") echo "debug_duplicateirisplit";;
"r2r-mapping-errors") echo "debug_r2rmappingerrors";;
"type-like-statements") echo "debug_typelikestatements";;
*) echo "$1";;
esac
}
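# e.g. the "r2r-mapping-errors" dump is filed under the "debug" dataset with the
# file name prefix "debug_r2rmappingerrors"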
for file in $files; do
name=${file##*/}; name=$(echo $name | cut -d"." -f1)
dumpVersion=${file%/*}; dumpVersion=${dumpVersion##*/}
version="${dumpVersion:0:4}.${dumpVersion:4:2}.${dumpVersion:6:2}"
CONTVAR=""
if [[ $name == *"-nmw"* ]]; then
CONTVAR="${CONTVAR}_nmw"
fi
if [[ $name == *"-reified"* ]]; then
CONTVAR="${CONTVAR}_reified"
fi
if [[ $name == *"-reified-qualifiers"* ]]; then
CONTVAR="${CONTVAR}_qualifiers"
fi
if [[ $name == *"-redirected"* ]]; then
CONTVAR="${CONTVAR}_redirected"
fi
if [[ $name == *"-length"* ]]; then
CONTVAR="${CONTVAR}_length"
fi
if [[ $name == *"-ids"* ]]; then
CONTVAR="${CONTVAR}_ids"
fi
if [[ $name == *"-uris"* ]]; then
CONTVAR="${CONTVAR}_uris"
fi
if [[ $name == *"-transitive"* ]]; then
CONTVAR="${CONTVAR}_transitive"
fi
dataset=$(echo $name | sed -e "s/wikidatawiki-$dumpVersion-//g; s/-nmw//g; s/wikidata-//g; s/-reified//g; s/-qualifiers//g; s/-redirected//g; s/-ids//g; s/-length//g; s/-uris//g; s/-transitive//g; s/transitive-//g")
new_name="${dataset}${CONTVAR}"
if [[ $dataset == *"interlanguage-links"* ]]; then
new_name="interlanguange-links_lang="$(echo $dataset | sed "s/interlanguage-links-//g")
dataset="interlanguange-links"
fi
dataset=$(exceptDataset $dataset)
new_name=$(exceptName $new_name)
new_name=$new_name$(echo ${file##*/} | sed "s/$name//g")
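# Worked example (hypothetical file):
#   file=wikidatawiki/20190701/wikidatawiki-20190701-instance-types-transitive-redirected.ttl.bz2
#   version=2019.07.01  CONTVAR=_redirected_transitive  dataset=instance-types
#   new_name=instance-types_redirected_transitive.ttl.bz2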
mkdir -p "$DATABUSMVNPOMDIR/$dataset/$version/"
cp -vn "$file" "$DATABUSMVNPOMDIR/$dataset/$version/$new_name"
done
# download and extraction target dir
#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties
# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.
# default:
#source=# moved to $extraction-framework/core/src/main/resources/universal.properties
spark-master=local[32]
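# local[32] runs Spark in-process with 32 worker threads; a cluster master URL
# (e.g. spark://host:7077, host illustrative) could be used instead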
# use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false
# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
# en is extracted on its own in this config; combined with the other languages it seemed too big for local[32]
#languages=af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
languages=en
# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\
.ExternalLinksExtractor,.GeoExtractor,.InfoboxExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\
.PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\
.WikiPageLengthExtractor,.WikiPageOutDegreeExtractor
extractors.en=.CitationExtractor,.DisambiguationExtractor,.HomepageExtractor,.PersondataExtractor,.PndExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor
# To exclude non-free images in this extraction, set this to true
copyrightCheck=false
# download and extraction target dir
#base-dir= moved to $extraction-framework/core/src/main/resources/universal.properties
# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.
# default:
#source=# moved to $extraction-framework/core/src/main/resources/universal.properties
spark-master=local[32]
# use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false
# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
# en excluded from this list (it is extracted separately), as it seemed too big for local[32]
languages=af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\
.ExternalLinksExtractor,.GeoExtractor,.InfoboxExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\
.PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\
.WikiPageLengthExtractor,.WikiPageOutDegreeExtractor
extractors.ar=.TopicalConceptsExtractor
extractors.be=
extractors.bg=