SH something for me to merge later

f91f905a · Sebastian Hellmann · ec7c5b57 · f91f905a
Commit f91f905a authored 5 years ago by Sebastian Hellmann
--- a/merge_some_into
+++ b/merge_some_into
+#!/bin/bash
+#+------------------------------------------------------------------------------------------------------------------------------+
+#| DBpedia Spotlight - Create database-backed model                                                                             |
+#| @author Joachim Daiber                                                                                                       |
+#+------------------------------------------------------------------------------------------------------------------------------+
+
+# Positional arguments (after the options have been removed):
+# $1 Working directory
+# $2 Locale (en_US)
+# $3 Stopwords file
+# $4 Analyzer+Stemmer language prefix e.g. Dutch
+# $5 Model target folder
+#
+# Options (parsed with getopts further down):
+# -o DIR   folder searched for "<language>-token.bin", which is copied as the tokenizer model
+# -e       evaluation mode: the dump is piped through scripts/split_train_test.py
+#          (writing heldout.txt) and EvaluateSpotlightModel is run at the end
+# -b FILE  pattern file handed to `grep -v -f` to filter the uriCounts file
+#
+# Environment:
+# WIKI_MIRROR  base URL for the Wikipedia dump download; when empty or unset
+#              the script uses https://dumps.wikimedia.org/
+
+# Exported so that every mvn process started below inherits these JVM options
+# (-Xmx26G sets the maximum Java heap size).
+export MAVEN_OPTS="-Xmx26G"
+
+# Print the script name, an example invocation and a one-line description to
+# stdout, followed by a line that holds a single space.
+function usage {
+  printf '%s\n' \
+    "index_db.sh" \
+    "usage: ./index_db.sh -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model" \
+    "Create a database-backed model of DBpedia Spotlight for a specified language." \
+    " "
+}
+
+
+# Defaults for the optional settings:
+#   opennlp   - folder with OpenNLP models; "None" when -o is not given
+#   eval      - "true" when -e requests the held-out evaluation run
+#   blacklist - pattern file for filtering uriCounts; "false" when -b is not given
+opennlp="None"
+eval="false"
+blacklist="false"
+
+while getopts "eo:b:" opt; do
+  case "$opt" in
+    o) opennlp="$OPTARG" ;;
+    e) eval="true" ;;
+    b) blacklist="$OPTARG" ;;
+    # getopts has already printed a diagnostic for an unknown option or a
+    # missing option argument; stop instead of silently carrying on.
+    *)
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+# Drop the parsed options so that $1..$5 are the positional arguments.
+shift $((OPTIND - 1))
+
+# Exactly five positional arguments are required. A wrong invocation must
+# not report success to the caller, hence the explicit non-zero status.
+if [[ $# -ne 5 ]]; then
+  usage >&2
+  exit 1
+fi
+
+# Directory the script was started from; relative arguments are resolved
+# against it.
+BASE_DIR="$(pwd)"
+
+# Print $1 unchanged when it starts with "/", otherwise print it prefixed
+# with "$BASE_DIR/".
+function get_path {
+  local resolved="$1"
+  [[ "$resolved" = /* ]] || resolved="$BASE_DIR/$resolved"
+  echo "$resolved"
+}
+
+# Turn the path-like arguments into absolute paths. The expansions are quoted
+# so that a path containing spaces or glob characters stays one argument.
+BASE_WDIR="$(get_path "$1")"
+TARGET_DIR="$(get_path "$5")"
+STOPWORDS="$(get_path "$3")"
+# Per-locale working directory below the base working directory.
+WDIR="$BASE_WDIR/$2"
+
+# "None" / "false" are the "option not given" markers and are left untouched.
+if [[ "$opennlp" != "None" ]]; then
+  opennlp="$(get_path "$opennlp")"
+fi
+if [[ "$blacklist" != "false" ]]; then
+  blacklist="$(get_path "$blacklist")"
+fi
+
+# Language code = locale up to the first underscore (nl_NL -> nl).
+LANGUAGE="${2%%_*}"
+
+echo "Language: $LANGUAGE"
+echo "Working directory: $WDIR"
+
+# Nothing below can work without the working directory.
+mkdir -p "$WDIR" || exit 1
+
+########################################################################################################
+# Preparing the data.
+########################################################################################################
+
+echo "Loading Wikipedia dump..."
+# Fall back to the Wikimedia dump server when WIKI_MIRROR is unset or empty.
+WIKI_MIRROR="${WIKI_MIRROR:-https://dumps.wikimedia.org/}"
+
+WP_DOWNLOAD_FILE="$WDIR/dump.xml"
+echo "Checking for wikipedia dump at $WP_DOWNLOAD_FILE"
+if [ -f "$WP_DOWNLOAD_FILE" ]; then
+  echo "File exists."
+else
+  echo "Downloading wikipedia dump."
+  dump_url="$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2"
+  # The dump is written to a ".part" file and only renamed once the whole
+  # pipeline succeeded. Otherwise an aborted download would leave a truncated
+  # dump.xml behind that the "File exists" check above accepts on the next run.
+  # pipefail is enabled in a subshell so the rest of the script is unaffected;
+  # curl -f turns HTTP errors into a non-zero exit status.
+  download_ok="true"
+  if [ "$eval" == "false" ]; then
+    ( set -o pipefail
+      curl -f -# "$dump_url" | bzcat > "$WP_DOWNLOAD_FILE.part" ) || download_ok="false"
+  else
+    # Evaluation mode: the decompressed dump is additionally piped through
+    # split_train_test.py, which is given 1200 and the heldout.txt path.
+    ( set -o pipefail
+      curl -f -# "$dump_url" | bzcat \
+        | python "$BASE_DIR/scripts/split_train_test.py" 1200 "$WDIR/heldout.txt" \
+        > "$WP_DOWNLOAD_FILE.part" ) || download_ok="false"
+  fi
+  if [ "$download_ok" != "true" ]; then
+    echo "Downloading the wikipedia dump from $dump_url failed." >&2
+    rm -f "$WP_DOWNLOAD_FILE.part"
+    exit 1
+  fi
+  mv "$WP_DOWNLOAD_FILE.part" "$WP_DOWNLOAD_FILE"
+fi
+
+# The relative file names below are meant to land in the working directory.
+cd "$WDIR" || exit 1
+cp "$STOPWORDS" "stopwords.$LANGUAGE.list"
+
+# Use the OpenNLP tokenizer model when one exists for this language;
+# otherwise create an empty placeholder file.
+if [ -e "$opennlp/$LANGUAGE-token.bin" ]; then
+  cp "$opennlp/$LANGUAGE-token.bin" "$LANGUAGE.tokenizer_model" || echo "tokenizer already exists"
+else
+  touch "$LANGUAGE.tokenizer_model"
+fi
+
+
+########################################################################################################
+# DBpedia extraction:
+########################################################################################################
+
+#Download:
+echo "Creating DBpedia nt files..."
+cd $BASE_WDIR
+
+if [ -d extraction-framework ]; then
+    echo "Updating DBpedia Spotlight..."
+    cd extraction-framework
+    git reset --hard HEAD
+    git pull
+    mvn install
+else
+    echo "Setting up DEF..."
+    git clone git://github.com/dbpedia/extraction-framework.git
+    cd extraction-framework
+    mvn install
+fi
+
+cd dump
+
+dumpdate=$(date +%Y%m%d)
+dumpdir=$WDIR/${LANGUAGE}wiki/${dumpdate}
+
+mkdir -p $dumpdir
+ln -s $WDIR/dump.xml $dumpdir/${LANGUAGE}wiki-${dumpdate}-dump.xml
+
+cat << EOF > dbpedia.properties
+base-dir=$WDIR
+wiki=$LANGUAGE
+locale=$LANGUAGE
+source=dump.xml
+require-download-complete=false
+languages=$LANGUAGE
+ontology=../ontology.xml
+mappings=../mappings
+uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
+format.nt.gz=n-triples;uri-policy.uri
+EOF
+
+if [[ ",ga,ar,be,bg,bn,ced,cs,cy,da,eo,et,fa,fi,gl,hi,hr,hu,id,ja,lt,lv,mk,mt,sk,sl,sr,tr,ur,vi,war,zh," == *",$LANGUAGE,"* ]]; then #Languages with no disambiguation definitions
+     echo "extractors=.RedirectExtractor,.MappingExtractor" >> dbpedia.properties
+else
+     echo "extractors=.RedirectExtractor,.DisambiguationExtractor,.MappingExtractor" >> dbpedia.properties
+fi
+
+../run extraction dbpedia.properties
+
+zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-instance-types*.nt.gz > $WDIR/instance_types.nt
+zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-disambiguations-unredirected.nt.gz > $WDIR/disambiguations.nt
+zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-redirects.nt.gz > $WDIR/redirects.nt
+
+rm -Rf $dumpdir
+
+########################################################################################################
+# Setting up Spotlight:
+########################################################################################################
+
+cd "$BASE_WDIR" || exit 1
+
+# Reuse an existing dbpedia-spotlight checkout, otherwise clone the model
+# repository straight into that directory name. A failed cd stops the script
+# so that the build below can never run in the wrong directory.
+if [ -d dbpedia-spotlight ]; then
+    echo "Updating DBpedia Spotlight..."
+    cd dbpedia-spotlight || exit 1
+    git reset --hard HEAD
+    git pull
+else
+    echo "Setting up DBpedia Spotlight..."
+    git clone --depth 1 https://github.com/dbpedia-spotlight/dbpedia-spotlight-model dbpedia-spotlight
+    cd dbpedia-spotlight || exit 1
+fi
+
+# Same build for both cases.
+mvn -T 1C -q clean install
+
+
+########################################################################################################
+# Extracting wiki stats:
+########################################################################################################
+
+# Always start from a fresh clone of wikistatsextractor. The cd is checked so
+# that the recursive delete cannot run in an unexpected directory.
+cd "$BASE_WDIR" || exit 1
+rm -Rf wikistatsextractor
+git clone --depth 1 https://github.com/dbpedia-spotlight/wikistatsextractor
+
+# Stop processing if one step fails
+set -e
+
+# Build wikistatsextractor and run it with $WDIR as its output folder.
+cd "$BASE_WDIR/wikistatsextractor"
+mvn install exec:java -Dexec.args="--output_folder $WDIR $LANGUAGE $2 ${4}Stemmer $WDIR/dump.xml $WDIR/stopwords.$LANGUAGE.list"
+
+if [ "$blacklist" != "false" ]; then
+  echo "Removing blacklist URLs..."
+  mv "$WDIR/uriCounts" "$WDIR/uriCounts_all"
+  # grep exits with 1 when no line survives the filter. Under `set -e` that
+  # would abort the run, so only statuses above 1 (real errors) are fatal.
+  grep -v -f "$blacklist" "$WDIR/uriCounts_all" > "$WDIR/uriCounts" || [ $? -eq 1 ]
+fi
+
+echo "Finished wikistats extraction. Cleaning up..."
+rm -f "$WDIR/dump.xml"
+
+
+########################################################################################################
+# Building Spotlight model:
+########################################################################################################
+
+#Create the model:
+cd $BASE_WDIR/dbpedia-spotlight
+
+mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateSpotlightModel -Dexec.args="$2 $WDIR $TARGET_DIR $opennlp $STOPWORDS $4Stemmer"
+
+if [ "$eval" == "true" ]; then
+  mvn -pl eval exec:java -Dexec.mainClass=org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel -Dexec.args="$TARGET_DIR $WDIR/heldout.txt" > $TARGET_DIR/evaluation.txt
+fi
+
+curl https://raw.githubusercontent.com/dbpedia-spotlight/model-quickstarter/master/model_readme.txt > $TARGET_DIR/README.txt
+curl "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2-rss.xml" | grep link | sed -e 's/^.*<link>//' -e 's/<[/]link>.*$//' | uniq >> $TARGET_DIR/README.txt
+
+
+echo "Collecting data..."
+cd $BASE_DIR
+mkdir -p data/$LANGUAGE && mv $WDIR/*Counts data/$LANGUAGE
+gzip $WDIR/*.nt &
+
+set +e