#!/bin/bash
# Origin: merge_some_into (listed in the patch as a deleted file, mode 100644).
#+----------------------------------------------------------------------------+
#| DBpedia Spotlight - Create database-backed model                           |
#| @author Joachim Daiber                                                     |
#+----------------------------------------------------------------------------+
#
# Positional arguments (after the options):
#   $1 Working directory
#   $2 Locale (en_US)
#   $3 Stopwords file
#   $4 Analyzer+Stemmer language prefix e.g. Dutch
#   $5 Model target folder
#
# Options:
#   -o PATH  OpenNLP model folder; "<lang>-token.bin" is looked up in it
#   -e       hold out part of the dump and evaluate the finished model
#   -b FILE  pattern file; matching lines are removed from uriCounts
#
# NOTE: $2 and $4 are read again further down the script, so the positional
# parameters must stay untouched after the single `shift` below.

export MAVEN_OPTS="-Xmx26G"

# Print a short help text (the caller decides which stream it goes to).
usage() {
  echo "index_db.sh"
  echo "usage: ./index_db.sh -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model"
  echo "Create a database-backed model of DBpedia Spotlight for a specified language."
  echo " "
}

# Sentinel defaults: "None" / "false" mean "option not given".
opennlp="None"
eval="false"
blacklist="false"

while getopts "eo:b:" opt; do
  case "$opt" in
    o) opennlp="$OPTARG" ;;
    e) eval="true" ;;
    b) blacklist="$OPTARG" ;;
    # getopts has already printed a diagnostic; do not continue with a
    # half-understood command line.
    *)
      usage >&2
      exit 1
      ;;
  esac
done

shift $((OPTIND - 1))

# Exactly five positional arguments are required; signal misuse with a
# non-zero status so calling scripts can detect it.
if [[ $# -ne 5 ]]; then
  usage >&2
  exit 1
fi

BASE_DIR=$(pwd)

# Echo $1 unchanged when it is absolute, otherwise anchored at $BASE_DIR.
get_path() {
  if [[ "$1" == /* ]]; then
    printf '%s\n' "$1"
  else
    printf '%s\n' "$BASE_DIR/$1"
  fi
}

BASE_WDIR=$(get_path "$1")
TARGET_DIR=$(get_path "$5")
STOPWORDS=$(get_path "$3")
WDIR="$BASE_WDIR/$2"

if [[ "$opennlp" != "None" ]]; then
  opennlp=$(get_path "$opennlp")
fi
if [[ "$blacklist" != "false" ]]; then
  blacklist=$(get_path "$blacklist")
fi

# Language code = locale up to (not including) the first underscore.
LANGUAGE=${2%%_*}

echo "Language: $LANGUAGE"
echo "Working directory: $WDIR"

mkdir -p "$WDIR" || exit 1

########################################################################################################
# Preparing the data.
########################################################################################################

echo "Loading Wikipedia dump..."
# Allow the caller to point at a mirror through the environment.
WIKI_MIRROR=${WIKI_MIRROR:-https://dumps.wikimedia.org/}

WP_DOWNLOAD_FILE=$WDIR/dump.xml
echo "Checking for wikipedia dump at $WP_DOWNLOAD_FILE"
if [[ -f "$WP_DOWNLOAD_FILE" ]]; then
  echo "File exists."
else
  echo "Downloading wikipedia dump."
  dump_url="$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2"
  # Write to a side file first: an interrupted or failed transfer must not
  # leave a truncated dump.xml that the "File exists." check above would
  # accept on the next run.
  dump_tmp="$WP_DOWNLOAD_FILE.part"
  download_status=0
  if [[ "$eval" == "false" ]]; then
    (
      set -o pipefail
      curl -f -# "$dump_url" | bzcat > "$dump_tmp"
    ) || download_status=$?
  else
    # Evaluation mode: split_train_test.py receives the held-out file name
    # and its stdout becomes the dump.
    (
      set -o pipefail
      curl -f -# "$dump_url" \
        | bzcat \
        | python "$BASE_DIR/scripts/split_train_test.py" 1200 "$WDIR/heldout.txt" > "$dump_tmp"
    ) || download_status=$?
  fi
  if [[ $download_status -ne 0 ]]; then
    echo "Downloading $dump_url failed (status $download_status)." >&2
    rm -f -- "$dump_tmp"
    exit 1
  fi
  mv -- "$dump_tmp" "$WP_DOWNLOAD_FILE" || exit 1
fi

cd "$WDIR" || exit 1
cp "$STOPWORDS" "stopwords.$LANGUAGE.list"

# Use the OpenNLP tokenizer model when one is available, otherwise create an
# empty placeholder file.
if [[ -e "$opennlp/$LANGUAGE-token.bin" ]]; then
  cp "$opennlp/$LANGUAGE-token.bin" "$LANGUAGE.tokenizer_model" || echo "tokenizer already exists"
else
  touch "$LANGUAGE.tokenizer_model"
fi


########################################################################################################
# DBpedia extraction:
########################################################################################################

#Download:
echo "Creating DBpedia nt files..."
cd "$BASE_WDIR" || exit 1

if [[ -d extraction-framework ]]; then
  echo "Updating DEF..."
  cd extraction-framework || exit 1
  git reset --hard HEAD
  git pull
  mvn install
else
  echo "Setting up DEF..."
  # https instead of git://: GitHub no longer serves the unauthenticated
  # git protocol.
  git clone https://github.com/dbpedia/extraction-framework.git
  cd extraction-framework || exit 1
  mvn install
fi

cd dump || exit 1

dumpdate=$(date +%Y%m%d)
dumpdir=$WDIR/${LANGUAGE}wiki/${dumpdate}

mkdir -p "$dumpdir"
# -f keeps a re-run on the same day from failing on the existing link.
ln -sf "$WDIR/dump.xml" "$dumpdir/${LANGUAGE}wiki-${dumpdate}-dump.xml"

cat << EOF > dbpedia.properties
base-dir=$WDIR
wiki=$LANGUAGE
locale=$LANGUAGE
source=dump.xml
require-download-complete=false
languages=$LANGUAGE
ontology=../ontology.xml
mappings=../mappings
uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
format.nt.gz=n-triples;uri-policy.uri
EOF

if [[ ",ga,ar,be,bg,bn,ced,cs,cy,da,eo,et,fa,fi,gl,hi,hr,hu,id,ja,lt,lv,mk,mt,sk,sl,sr,tr,ur,vi,war,zh," == *",$LANGUAGE,"* ]]; then #Languages with no disambiguation definitions
  echo "extractors=.RedirectExtractor,.MappingExtractor" >> dbpedia.properties
else
  echo "extractors=.RedirectExtractor,.DisambiguationExtractor,.MappingExtractor" >> dbpedia.properties
fi

../run extraction dbpedia.properties

# `set -e` is not active yet: when the disambiguation extractor was skipped
# above, the second zcat fails and leaves an empty disambiguations.nt.
zcat "$dumpdir/${LANGUAGE}wiki-${dumpdate}"-instance-types*.nt.gz > "$WDIR/instance_types.nt"
zcat "$dumpdir/${LANGUAGE}wiki-${dumpdate}-disambiguations-unredirected.nt.gz" > "$WDIR/disambiguations.nt"
zcat "$dumpdir/${LANGUAGE}wiki-${dumpdate}-redirects.nt.gz" > "$WDIR/redirects.nt"

# :? aborts instead of expanding to nothing should dumpdir ever be empty.
rm -Rf -- "${dumpdir:?}"

########################################################################################################
# Setting up Spotlight:
########################################################################################################

cd "$BASE_WDIR" || exit 1

if [[ -d dbpedia-spotlight ]]; then
  echo "Updating DBpedia Spotlight..."
  cd dbpedia-spotlight || exit 1
  git reset --hard HEAD
  git pull
  mvn -T 1C -q clean install
else
  echo "Setting up DBpedia Spotlight..."
  git clone --depth 1 https://github.com/dbpedia-spotlight/dbpedia-spotlight-model
  mv dbpedia-spotlight-model dbpedia-spotlight
  cd dbpedia-spotlight || exit 1
  mvn -T 1C -q clean install
fi


########################################################################################################
# Extracting wiki stats:
########################################################################################################

# The guard matters here: without it a failed cd would run the rm -Rf below
# in whatever directory happens to be current.
cd "$BASE_WDIR" || exit 1
rm -Rf wikistatsextractor
git clone --depth 1 https://github.com/dbpedia-spotlight/wikistatsextractor

# Stop processing if one step fails
set -e

#Copy results to local:
cd "$BASE_WDIR/wikistatsextractor"
mvn install exec:java -Dexec.args="--output_folder $WDIR $LANGUAGE $2 ${4}Stemmer $WDIR/dump.xml $WDIR/stopwords.$LANGUAGE.list"

if [[ "$blacklist" != "false" ]]; then
  echo "Removing blacklist URLs..."
  mv "$WDIR/uriCounts" "$WDIR/uriCounts_all"
  # grep exits 1 when it selects no line at all; that is a valid (empty)
  # result and must not trip `set -e`. Statuses above 1 are real errors.
  grep -v -f "$blacklist" "$WDIR/uriCounts_all" > "$WDIR/uriCounts" || [[ $? -eq 1 ]]
fi

echo "Finished wikistats extraction. Cleaning up..."
# The dump is no longer needed once the statistics have been extracted.
rm -f -- "${WDIR:?}/dump.xml"


########################################################################################################
# Building Spotlight model:
########################################################################################################

#Create the model:
cd "$BASE_WDIR/dbpedia-spotlight"

mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateSpotlightModel -Dexec.args="$2 $WDIR $TARGET_DIR $opennlp $STOPWORDS ${4}Stemmer"

# heldout.txt is only produced when the dump was downloaded with -e.
if [[ "$eval" == "true" ]]; then
  mvn -pl eval exec:java -Dexec.mainClass=org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel -Dexec.args="$TARGET_DIR $WDIR/heldout.txt" > "$TARGET_DIR/evaluation.txt"
fi

# README = generic model readme followed by the <link> entries of the dump's
# RSS feed.
curl https://raw.githubusercontent.com/dbpedia-spotlight/model-quickstarter/master/model_readme.txt > "$TARGET_DIR/README.txt"
curl "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2-rss.xml" \
  | grep link \
  | sed -e 's/^.*<link>//' -e 's/<[/]link>.*$//' \
  | uniq >> "$TARGET_DIR/README.txt"


echo "Collecting data..."
cd "$BASE_DIR"
mkdir -p "data/$LANGUAGE" && mv "$WDIR"/*Counts "data/$LANGUAGE"
# Compression runs in the background and may still be in progress when the
# script returns.
gzip "$WDIR"/*.nt &

set +e

# ==============================================================================
# End of merge_some_into.
# The patch continues with a second, independent deleted file (mode 100755):
#   migrate_data_marvin_to_download_server.sh
# Everything below belongs to that script, not to the one above.
# ==============================================================================
#!/bin/bash
# ./marvin-fetch.sh wikidata 2019.08.01
#
# Arguments:
#   $1 GROUP    directory containing pom.xml; also part of the server name
#   $2 VERSION  version directory to fetch for every module listed in the POM

if [[ $# -ne 2 ]]; then
  echo "usage: $0 GROUP VERSION   (e.g. $0 wikidata 2019.08.01)" >&2
  exit 1
fi

GROUP=$1
VERSION=$2
SERVER="dbpedia-${GROUP}.tib.eu"

# get artifacts
# Module names come from <modules><module> in the group's POM.
if ! ARTIFACTS=$(xmlstarlet sel -N my=http://maven.apache.org/POM/4.0.0 -t -v "/my:project/my:modules/my:module" "$GROUP/pom.xml"); then
  echo "Could not read the module list from $GROUP/pom.xml" >&2
  exit 1
fi

# Deliberately unquoted: the list is split on whitespace into module names.
for ARTIFACT in $ARTIFACTS; do
  echo "$ARTIFACT"
  #scp -rv marvin-fetch@$SERVER:/data/databus-maven-plugin/dbpedia/$GROUP/$ARTIFACT/$VERSION $GROUP/$ARTIFACT/
  # --ignore-existing: files already present locally are not transferred again.
  rsync -av -e ssh --ignore-existing \
    "marvin-fetch@$SERVER:/data/derive/databus-maven-plugin/dbpedia/$GROUP/$ARTIFACT/$VERSION" \
    "$GROUP/$ARTIFACT"
done