Commit 7b84dd56 authored by Marvin Hofer
Browse files

dbpedia runscript: postProcessing

parent 2c78490a
......@@ -16,7 +16,7 @@ languages=ro
# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
extractors=.MappingExtractor
extractors=.MappingExtractor,.RedirectExtractor
#extractors.ar=.MappingExtractor,.TopicalConceptsExtractor
#
......
......@@ -107,8 +107,6 @@ copyToMavenPlugin() {
# [MAIN]
main() {
prepareM() {
copyToMavenPlugin
}
main
......@@ -13,9 +13,9 @@ DATAPUSMAVENPLUGINPOMGIT="https://github.com/dbpedia/databus-maven-plugin.git"
EXTRACTIONFRAMEWORKGIT="https://github.com/dbpedia/extraction-framework.git"
# arguments
HELP="usage: --group={generic|mappings|wikidata} [--skip-release|--skip-mvn-install]"
HELP="usage: --group={generic|mappings|wikidata} [--databus-deploy|--skip-mvn-install]"
GROUP=""
SKIPRELEASE=false
DATABUSDEPLOY=false
SKIPMVNINSTALL=false
for i in "$@"
......@@ -25,8 +25,8 @@ case $i in
GROUP="${i#*=}"
shift
;;
--skip-release)
SKIPRELEASE=true
--databus-deploy)
DATABUSDEPLOY=true
shift
;;
--skip-mvn-install)
......@@ -46,7 +46,8 @@ case $i in
esac
done
# Abort with the usage text unless --group named a supported group.
# An empty or missing GROUP also lands in the default arm, so no separate
# emptiness test is needed. Note that "test" is accepted here although the
# usage text in HELP does not list it.
case "$GROUP" in
    generic|mappings|test|wikidata)
        ;;
    *)
        # Quoted on purpose: HELP contains [...] and {...}; left unquoted the
        # text would undergo word splitting and pathname expansion, so a
        # matching file name in the current directory could garble it.
        echo "$HELP"
        exit 1
        ;;
esac
......@@ -62,14 +63,16 @@ createDirectories() {
# clone repositories
gitCheckout() {
if [ -d $EXTRACTIONFRAMEWORKDIR/.git ]; then
if [ -d $EXTRACTIONFRAMEWORKDIR/.git ]
then
cd $EXTRACTIONFRAMEWORKDIR;
echo -n "extraction-framework "
git pull;
else
git clone $EXTRACTIONFRAMEWORKGIT
fi
if [ -d $DATAPUSMAVENPLUGINPOMDIR/.git ]; then
if [ -d $DATAPUSMAVENPLUGINPOMDIR/.git ]
then
cd $DATAPUSMAVENPLUGINPOMDIR;
echo -n "databus-maven-plugin "
git pull;
......@@ -80,9 +83,11 @@ gitCheckout() {
# download ontology, mappings, wikidataR2R
# Runs the framework launcher (../run, resolved from core/) for the ontology
# and the mappings, then refreshes wikidatar2r.json from the master branch of
# the extraction-framework repository. Each step runs exactly once.
# Globals:  EXTRACTIONFRAMEWORKDIR (read)
# Outputs:  overwrites core/src/main/resources/wikidatar2r.json
# Returns:  1 if a required directory cannot be entered, otherwise the exit
#           status of the final curl call
# Note:     the working directory is left at core/src/main/resources.
downloadMetadata() {
    # ../run is a relative path, so continuing after a failed cd would start
    # whatever "../run" happens to be relative to the caller's directory.
    cd "$EXTRACTIONFRAMEWORKDIR/core" || return 1
    ../run download-ontology
    ../run download-mappings

    # The redirect below truncates wikidatar2r.json in the current directory;
    # stop rather than create the file in the wrong place.
    cd "$EXTRACTIONFRAMEWORKDIR/core/src/main/resources" || return 1
    curl https://raw.githubusercontent.com/dbpedia/extraction-framework/master/core/src/main/resources/wikidatar2r.json > wikidatar2r.json
}
# download and extract data
......@@ -91,25 +96,37 @@ extractDumps() {
sed -i -e 's,$BASEDIR,'$EXTRACTIONBASEDIR',g' $EXTRACTIONFRAMEWORKDIR/core/src/main/resources/universal.properties;
sed -i -e 's,$LOGDIR,'$LOGDIR',g' $EXTRACTIONFRAMEWORKDIR/core/src/main/resources/universal.properties;
if [ "$SKIPMVNINSTALL" = "false" ]; then
if [ "$SKIPMVNINSTALL" = "false" ]
then
echo "extraction-framework: mvn install"
cd $EXTRACTIONFRAMEWORKDIR && mvn install;
fi
if [ "$GROUP" = "mappings" ]; then
cd $EXTRACTIONFRAMEWORKDIR/dump && ../run download $ROOT/config.d/download.mappings.properties
cd $EXTRACTIONFRAMEWORKDIR/dump && ../run extraction $ROOT/config.d/extraction.mappings.properties
elif [ "$GROUP" = "wikidata" ]; then
cd $EXTRACTIONFRAMEWORKDIR/dump && ../run download $ROOT/config.d/download.wikidata.properties
cd $EXTRACTIONFRAMEWORKDIR/dump && ../run extraction $ROOT/config.d/extraction.wikidata.properties
elif [ "$GROUP" = "generic" ]; then
cd $EXTRACTIONFRAMEWORKDIR/dump && ../run download $ROOT/config.d/download.generic.properties
cd $EXTRACTIONFRAMEWORKDIR/dump && ../run sparkextraction $ROOT/config.d/sparkextraction.generic.properties
cd $EXTRACTIONFRAMEWORKDIR/dump && ../run sparkextraction $ROOT/config.d/sparkextraction.generic.en.properties
elif [ "$GROUP" = "test" ]; then
cd $EXTRACTIONFRAMEWORKDIR/dump && ../run download $ROOT/config.d/download.test.properties
cd $EXTRACTIONFRAMEWORKDIR/dump && ../run extraction $ROOT/config.d/extraction.test.properties
elif [ "$GROUP" = "abstract" ]; then
cd $EXTRACTIONFRAMEWORKDIR/dump;
if [ "$GROUP" = "mappings" ]
then
>&2 ../run download $ROOT/config.d/download.mappings.properties;
>&2 ../run extraction $ROOT/config.d/extraction.mappings.properties;
elif [ "$GROUP" = "wikidata" ]
then
>&2 ../run download $ROOT/config.d/download.wikidata.properties;
>&2 ../run extraction $ROOT/config.d/extraction.wikidata.properties;
elif [ "$GROUP" = "generic" ]
then
>&2 ../run download $ROOT/config.d/download.generic.properties;
>&2 ../run sparkextraction $ROOT/config.d/sparkextraction.generic.properties;
>&2 ../run sparkextraction $ROOT/config.d/sparkextraction.generic.en.properties;
elif [ "$GROUP" = "test" ]
then
>&2 ../run download $ROOT/config.d/download.test.properties;
>&2 ../run extraction $ROOT/config.d/extraction.test.properties;
elif [ "$GROUP" = "abstract" ]
then
echo "TODO abstract extraction and download"
fi
}
......@@ -117,70 +134,97 @@ extractDumps() {
# post-processing
# Group-specific post-processing, selected by $GROUP. The branches call the
# framework launcher (../run) for ResolveTransitiveLinks and MapObjectUris,
# some also for TypeConsistencyCheck / TypeConsistencyCheckManual, and some
# source prepareMappingsArtifacts.sh from $ROOT/config.d and call prepareM.
# The "abstract" branch only prints a message.
# Globals read: GROUP, EXTRACTIONFRAMEWORKDIR, EXTRACTIONBASEDIR, BASEDIR,
#               ROOT, DATAPUSMAVENPLUGINPOMDIR
# Globals written: BASEDIR, DATABUSMVNPOMDIR (assigned right after sourcing)
#
# NOTE(review): this span looks like pre-change and post-change lines of the
# commit shown together: there are two `if` openers but a single `fi`, and
# several `elif` arms test the same group twice. It does not parse as shown;
# confirm against the actual script before relying on the branch layout.
# NOTE(review): some ../run calls pass $BASEDIR and others $EXTRACTIONBASEDIR.
# Within this function BASEDIR is only assigned after prepareMappingsArtifacts.sh
# is sourced; confirm it is defined elsewhere for the calls that precede that.
postProcessing() {
# NOTE(review): `"mappings"]` has no space before `]`, so `[` never sees its
# closing bracket and the test fails with a "missing ]" error.
if [ "$GROUP" = "mappings"] || [ "$GROUP" = "test" ]; then
cd $EXTRACTIONFRAMEWORKDIR/scripts;
if [ "$GROUP" = "mappings" ]
then
echo "mappings postProcessing"
cd $EXTRACTIONFRAMEWORKDIR/scripts;
../run ResolveTransitiveLinks $BASEDIR redirects redirects_transitive .ttl.bz2 @downloaded;
../run MapObjectUris $BASEDIR redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded;
../run TypeConsistencyCheck type.consistency.check.properties;
# TODO prepareRelease
elif [ "$GROUP" = "wikidata" ]; then
# The leading `>&2` sends the launcher's stdout to stderr.
>&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects redirects_transitive .ttl.bz2 @downloaded;
>&2 ../run MapObjectUris $EXTRACTIONBASEDIR redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded;
>&2 ../run TypeConsistencyCheck type.consistency.check.properties;
cd $ROOT/config.d;
# BASEDIR and DATABUSMVNPOMDIR are set after sourcing and before prepareM;
# presumably prepareM reads them - verify in prepareMappingsArtifacts.sh.
source prepareMappingsArtifacts.sh; BASEDIR=$EXTRACTIONBASEDIR; DATABUSMVNPOMDIR=$DATAPUSMAVENPLUGINPOMDIR;
prepareM;
elif [ "$GROUP" = "wikidata" ]
then
echo "wikidata postProcessing"
cd $EXTRACTIONFRAMEWORKDIR/scripts;
../run ResolveTransitiveLinks $BASEDIR redirects transitive-redirects .ttl.bz2 wikidata
../run MapObjectUris $BASEDIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata
../run TypeConsistencyCheck type.consistency.check.properties;
# TODO prepareRelease
elif [ "$GROUP" = "generic" ]; then
>&2 ../run ResolveTransitiveLinks $BASEDIR redirects transitive-redirects .ttl.bz2 wikidata
>&2 ../run MapObjectUris $BASEDIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata
>&2 ../run TypeConsistencyCheck type.consistency.check.properties;
# cd $ROOT/config.d;
# source prepareMappingsArtifacts.sh; BASEDIR=$EXTRACTIONBASEDIR; DATABUSMVNPOMDIR=$DATAPUSMAVENPLUGINPOMDIR;
# prepareW;
elif [ "$GROUP" = "generic" ]
then
echo "generic postProcessing"
cd $EXTRACTIONFRAMEWORKDIR/scripts;
../run ResolveTransitiveLinks $BASEDIR redirects redirects_transitive .ttl.bz2 @downloaded;
../run MapObjectUris $BASEDIR redirects_transitive .ttl.bz2 disambiguations,infobox-properties,page-links,persondata,topical-concepts _redirected .ttl.bz2 @downloaded;
# TODO prepareRelease
elif [ "$GROUP" = "abstract" ]; then
>&2 ../run ResolveTransitiveLinks $BASEDIR redirects redirects_transitive .ttl.bz2 @downloaded;
>&2 ../run MapObjectUris $BASEDIR redirects_transitive .ttl.bz2 disambiguations,infobox-properties,page-links,persondata,topical-concepts _redirected .ttl.bz2 @downloaded;
# cd $ROOT/config.d;
# source prepareMappingsArtifacts.sh; BASEDIR=$EXTRACTIONBASEDIR; DATABUSMVNPOMDIR=$DATAPUSMAVENPLUGINPOMDIR;
# prepareG;
elif [ "$GROUP" = "abstract" ]
then
echo "abstract postProcessing"
elif [ "$GROUP" = "test" ]
then
echo "test postProcessing"
>&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects redirects_transitive .ttl.bz2 @downloaded;
>&2 ../run MapObjectUris $EXTRACTIONBASEDIR redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded;
>&2 ../run TypeConsistencyCheckManual mappingbased-objects instance-types ro;
cd $ROOT/config.d;
# Unlike the assignment further up, DATABUSMVNPOMDIR here points at the
# databus-maven-plugin/dbpedia/mappings subdirectory of the POM directory.
source prepareMappingsArtifacts.sh; BASEDIR=$EXTRACTIONBASEDIR; DATABUSMVNPOMDIR=$DATAPUSMAVENPLUGINPOMDIR/databus-maven-plugin/dbpedia/mappings;
prepareM;
fi
}
# release
release() {
cd $DATAPUSMAVENPLUGINPOMDIR;
mvn versions:set -DnewVersion=$(ls * | grep '^[0-9]\{4\}.[0-9]\{2\}.[0-9]\{2\}$' | sort -u | tail -1);
# Deploys the prepared artifacts with Maven. Does nothing (and returns 0)
# unless DATABUSDEPLOY is "true", i.e. --databus-deploy was passed.
# Globals:  DATABUSDEPLOY, DATAPUSMAVENPLUGINPOMDIR (read);
#           RELEASEPUBLISHER, RELEASEPACKAGEDIR, RELEASEDOWNLOADURL,
#           RELEASELABELPREFIX, RELEASECOMMENTPREFIX (written)
# Outputs:  stdout of `mvn clean deploy` is sent to stderr
# Returns:  1 if the POM directory cannot be entered or no YYYY.MM.DD entry
#           is found, otherwise the exit status of `mvn clean deploy`
# Note:     changes the working directory to $DATAPUSMAVENPLUGINPOMDIR.
databusRelease() {
    local candidate newest=""

    if [ "$DATABUSDEPLOY" != "true" ]
    then
        return 0
    fi

    # Never run the deploy from an unrelated directory.
    cd "$DATAPUSMAVENPLUGINPOMDIR" || return 1

    # Pick the highest YYYY.MM.DD entry one level below the POM directory.
    # A glob with literal dots replaces parsing `ls` output with a regex whose
    # unescaped "." matched any character.
    for candidate in */[0-9][0-9][0-9][0-9].[0-9][0-9].[0-9][0-9]
    do
        [ -e "$candidate" ] || continue   # pattern matched nothing
        candidate=${candidate##*/}
        # Fixed-width digits: string order equals chronological order.
        if [[ "$candidate" > "$newest" ]]
        then
            newest=$candidate
        fi
    done
    if [ -z "$newest" ]
    then
        echo "databusRelease: no YYYY.MM.DD version found below $DATAPUSMAVENPLUGINPOMDIR" >&2
        return 1
    fi
    mvn versions:set -DnewVersion="$newest";

    # \${...} keeps the placeholders literal so that Maven, not the shell,
    # resolves them.
    RELEASEPUBLISHER="https://vehnem.github.io/webid.ttl#this";
    RELEASEPACKAGEDIR="/data/extraction/release/\${project.groupId}/\${project.artifactId}";
    RELEASEDOWNLOADURL="http://dbpedia-generic.tib.eu/release/\${project.groupId}/\${project.artifactId}/\${project.version}/";
    RELEASELABELPREFIX="(pre-release)";
    RELEASECOMMENTPREFIX="(MARVIN is the DBpedia bot, that runs the DBpedia Information Extraction Framework (DIEF) and releases the data as is, i.e. unparsed, unsorted, not redirected for debugging the software. After its releases, data is cleaned and persisted under the dbpedia account.)";

    # Deploy exactly once; stdout goes to stderr so a caller's `2>` redirect
    # captures the whole Maven output.
    >&2 mvn clean deploy -Ddatabus.publisher="$RELEASEPUBLISHER" -Ddatabus.packageDirectory="$RELEASEPACKAGEDIR" -Ddatabus.downloadUrlPath="$RELEASEDOWNLOADURL" -Ddatabus.labelPrefix="$RELEASELABELPREFIX" -Ddatabus.commentPrefix="$RELEASECOMMENTPREFIX";
}
# clean up: compress log files
clean() {
# Compresses every regular file below $LOGDIR with lbzip2.
# Globals:  LOGDIR (read)
# Returns:  1 if LOGDIR is empty or unset; otherwise the exit status of the
#           last lbzip2 call (0 when no file was found)
cleanLogFiles() {
    local logfile
    local -a logfiles=()

    # With an empty LOGDIR, find would walk the current directory instead and
    # everything in it would be compressed.
    if [ -z "$LOGDIR" ]
    then
        echo "cleanLogFiles: LOGDIR is not set" >&2
        return 1
    fi

    # Collect the list before compressing, so files created by lbzip2 during
    # the run are not picked up. NUL-delimited so that paths containing
    # whitespace or glob characters survive intact.
    while IFS= read -r -d '' logfile
    do
        logfiles+=("$logfile")
    done < <(find "$LOGDIR" -type f -print0)

    for logfile in "${logfiles[@]}"
    do
        lbzip2 "$logfile"
    done
}
# Entry point: runs the pipeline stages in order and sends each stage's
# output to its own log file below $LOGDIR.
# Globals:  LOGDIR (read)
# Returns:  the exit status of cleanLogFiles
# The redirect targets are quoted because bash rejects an unquoted target
# that splits into several words ("ambiguous redirect"), which would skip the
# stage entirely when LOGDIR contains whitespace.
main() {
    # PRE-PROCESSING
    createDirectories;
    gitCheckout;
    downloadMetadata &> "$LOGDIR/downloadMetadata.log";
    # createDirectories;
    # gitCheckout;
    # downloadMetadata &> $LOGDIR/downloadMetadata.log;
    # EXTRACT
    extractDumps &> "$LOGDIR/extracion.log";
    # extractDumps &> $LOGDIR/extracion.log;
    # POST-PROCESSING
    # postProcessing &> $LOGDIR/postProcessing.log;
    # Only stderr is logged for the next two stages; their stdout stays on
    # the terminal.
    postProcessing 2> "$LOGDIR/postProcessing.log";
    # RELEASE
    # if [ "$SKIPRELEASE" = "false" ]; then
    # echo true
    # fi
    databusRelease 2> "$LOGDIR/databusDeploy.log"
    # CLEAN
    # clean;
    # CLEANUP
    cleanLogFiles;
}
main
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment