#!/bin/bash
#+------------------------------------------------------------------------------------------------------------------------------+
#| DBpedia Spotlight - Create database-backed model                                                                             |
#| @author Joachim Daiber                                                                                                       |
#+------------------------------------------------------------------------------------------------------------------------------+

# $1 Working directory
# $2 Locale (en_US)
# $3 Stopwords file
# $4 Analyzer+Stemmer language prefix e.g. Dutch
# $5 Model target folder
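#
# Optional flags (parsed below):
#  -o  directory containing OpenNLP models (default: none)
#  -e  hold out part of the dump and evaluate the finished model
#  -b  file with blacklisted URIs to remove from uriCounts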

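# The extraction and indexing steps run inside Maven-spawned JVMs, so give them a large heap.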
export MAVEN_OPTS="-Xmx26G"

usage ()
{
  echo "index_db.sh"
  echo "usage: ./index_db.sh -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model"
  echo "Create a database-backed model of DBpedia Spotlight for a specified language."
  echo " "
}


opennlp="None"
eval="false"
blacklist="false"

while getopts "eo:b:" opt; do
  case $opt in
    o) opennlp="$OPTARG";;
    e) eval="true";;
    b) blacklist="$OPTARG";;
  esac
done


shift $((OPTIND - 1))

if [ $# -ne 5 ]
then
    usage
    exit 1
fi

BASE_DIR=$(pwd)

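# Resolve a possibly relative path against the directory the script was started from.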
function get_path {
  if [[ "$1" = /* ]]
  then
    echo "$1"
  else
    echo "$BASE_DIR/$1"
  fi
}

BASE_WDIR=$(get_path $1)
TARGET_DIR=$(get_path $5)
STOPWORDS=$(get_path $3)
WDIR="$BASE_WDIR/$2"

if [[ "$opennlp" != "None" ]]; then
  opennlp=$(get_path $opennlp)
fi
if [[ "$blacklist" != "false" ]]; then
  blacklist=$(get_path $blacklist)
fi

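# Derive the Wikipedia language code from the locale, e.g. nl_NL -> nl.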
LANGUAGE=$(echo "$2" | sed "s/_.*//g")

echo "Language: $LANGUAGE"
echo "Working directory: $WDIR"

mkdir -p $WDIR

########################################################################################################
# Preparing the data.
########################################################################################################

echo "Loading Wikipedia dump..."
if [ -z "$WIKI_MIRROR" ]; then
  WIKI_MIRROR="https://dumps.wikimedia.org/"
fi

WP_DOWNLOAD_FILE=$WDIR/dump.xml
echo "Checking for Wikipedia dump at $WP_DOWNLOAD_FILE"
if [ -f "$WP_DOWNLOAD_FILE" ]; then
  echo "File exists."
else
  echo "Downloading Wikipedia dump."
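  # With -e, split_train_test.py holds out articles into heldout.txt for the evaluation step at the end.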
  if [ "$eval" == "false" ]; then
    curl -# "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat > $WDIR/dump.xml
  else
    curl -# "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat | python $BASE_DIR/scripts/split_train_test.py 1200 $WDIR/heldout.txt > $WDIR/dump.xml
  fi
fi

cd $WDIR
cp $STOPWORDS stopwords.$LANGUAGE.list

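# Use the provided OpenNLP tokenizer model if it exists; otherwise create an empty placeholder file.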
if [ -e "$opennlp/$LANGUAGE-token.bin" ]; then
  cp "$opennlp/$LANGUAGE-token.bin" "$LANGUAGE.tokenizer_model" || echo "tokenizer already exists"
else
  touch "$LANGUAGE.tokenizer_model"
fi


########################################################################################################
# DBpedia extraction:
########################################################################################################

# Set up the DBpedia Extraction Framework and extract the NT files:
echo "Creating DBpedia nt files..."
cd $BASE_WDIR

if [ -d extraction-framework ]; then
    echo "Updating DBpedia Spotlight..."
    cd extraction-framework
    git reset --hard HEAD
    git pull
    mvn install
else
    echo "Setting up DEF..."
    git clone git://github.com/dbpedia/extraction-framework.git
    cd extraction-framework
    mvn install
fi

cd dump

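# The extraction framework expects the dump at <base-dir>/<language>wiki/<date>/, so link it into place.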
dumpdate=$(date +%Y%m%d)
dumpdir=$WDIR/${LANGUAGE}wiki/${dumpdate}

mkdir -p $dumpdir
ln -s $WDIR/dump.xml $dumpdir/${LANGUAGE}wiki-${dumpdate}-dump.xml

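# Write the configuration for the extraction run; the extractor list is appended below depending on the language.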
cat << EOF > dbpedia.properties
base-dir=$WDIR
wiki=$LANGUAGE
locale=$LANGUAGE
source=dump.xml
require-download-complete=false
languages=$LANGUAGE
ontology=../ontology.xml
mappings=../mappings
uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
format.nt.gz=n-triples;uri-policy.uri
EOF

if [[ ",ga,ar,be,bg,bn,ced,cs,cy,da,eo,et,fa,fi,gl,hi,hr,hu,id,ja,lt,lv,mk,mt,sk,sl,sr,tr,ur,vi,war,zh," == *",$LANGUAGE,"* ]]; then #Languages with no disambiguation definitions
     echo "extractors=.RedirectExtractor,.MappingExtractor" >> dbpedia.properties
else
     echo "extractors=.RedirectExtractor,.DisambiguationExtractor,.MappingExtractor" >> dbpedia.properties
fi

../run extraction dbpedia.properties

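# Collect the extraction results (instance types, disambiguations, redirects) into the working directory.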
zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-instance-types*.nt.gz > $WDIR/instance_types.nt
zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-disambiguations-unredirected.nt.gz > $WDIR/disambiguations.nt
zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-redirects.nt.gz > $WDIR/redirects.nt

rm -Rf $dumpdir

########################################################################################################
# Setting up Spotlight:
########################################################################################################

cd $BASE_WDIR

if [ -d dbpedia-spotlight ]; then
    echo "Updating DBpedia Spotlight..."
    cd dbpedia-spotlight
    git reset --hard HEAD
    git pull
    mvn -T 1C -q clean install
else
    echo "Setting up DBpedia Spotlight..."
    git clone --depth 1 https://github.com/dbpedia-spotlight/dbpedia-spotlight-model
    mv dbpedia-spotlight-model dbpedia-spotlight
    cd dbpedia-spotlight
    mvn -T 1C -q clean install
fi


########################################################################################################
# Extracting wiki stats:
########################################################################################################

cd $BASE_WDIR
rm -Rf wikistatsextractor
git clone --depth 1 https://github.com/dbpedia-spotlight/wikistatsextractor

# Stop processing if one step fails
set -e

# Run the wikistats extraction over the dump (writes the *Counts files into $WDIR):
cd $BASE_WDIR/wikistatsextractor
mvn install exec:java -Dexec.args="--output_folder $WDIR $LANGUAGE $2 $4Stemmer $WDIR/dump.xml $WDIR/stopwords.$LANGUAGE.list"

if [ "$blacklist" != "false" ]; then
  echo "Removing blacklist URLs..."
  mv $WDIR/uriCounts $WDIR/uriCounts_all
  grep -v -f $blacklist $WDIR/uriCounts_all > $WDIR/uriCounts
fi

echo "Finished wikistats extraction. Cleaning up..."
rm -f $WDIR/dump.xml


########################################################################################################
# Building Spotlight model:
########################################################################################################

#Create the model:
cd $BASE_WDIR/dbpedia-spotlight

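# Arguments: locale, working dir with the counts and NT files, target dir, OpenNLP dir (or "None"), stopwords file, stemmer class prefix.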
mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateSpotlightModel -Dexec.args="$2 $WDIR $TARGET_DIR $opennlp $STOPWORDS $4Stemmer"

if [ "$eval" == "true" ]; then
  mvn -pl eval exec:java -Dexec.mainClass=org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel -Dexec.args="$TARGET_DIR $WDIR/heldout.txt" > $TARGET_DIR/evaluation.txt
fi

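# Fetch the model README template and append a link identifying the exact dump version, taken from the mirror's RSS feed.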
curl https://raw.githubusercontent.com/dbpedia-spotlight/model-quickstarter/master/model_readme.txt > $TARGET_DIR/README.txt
curl "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2-rss.xml" | grep link | sed -e 's/^.*<link>//' -e 's/<[/]link>.*$//' | uniq >> $TARGET_DIR/README.txt


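# Archive the raw counts under data/<language> and compress the NT files in the background.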
echo "Collecting data..."
cd $BASE_DIR
mkdir -p data/$LANGUAGE && mv $WDIR/*Counts data/$LANGUAGE
gzip $WDIR/*.nt &

set +e