From f91f905a438617cf423331e5757a37f2fb2e3739 Mon Sep 17 00:00:00 2001 From: Sebastian Hellmann <hellmann@informatik.uni-leipzig.de> Date: Thu, 5 Dec 2019 22:39:30 +0100 Subject: [PATCH] SH something for me to merge later --- merge_some_into | 231 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 merge_some_into diff --git a/merge_some_into b/merge_some_into new file mode 100644 index 0000000..b14431f --- /dev/null +++ b/merge_some_into @@ -0,0 +1,231 @@ +#!/bin/bash +#+------------------------------------------------------------------------------------------------------------------------------+ +#| DBpedia Spotlight - Create database-backed model | +#| @author Joachim Daiber | +#+------------------------------------------------------------------------------------------------------------------------------+ + +# $1 Working directory +# $2 Locale (en_US) +# $3 Stopwords file +# $4 Analyzer+Stemmer language prefix e.g. Dutch +# $5 Model target folder + +export MAVEN_OPTS="-Xmx26G" + +usage () +{ + echo "index_db.sh" + echo "usage: ./index_db.sh -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model" + echo "Create a database-backed model of DBpedia Spotlight for a specified language." + echo " " +} + + +opennlp="None" +eval="false" +blacklist="false" + +while getopts "eo:b:" opt; do + case $opt in + o) opennlp="$OPTARG";; + e) eval="true";; + b) blacklist="$OPTARG";; + esac +done + + +shift $((OPTIND - 1)) + +if [ $# != 5 ] +then + usage + exit +fi + +BASE_DIR=$(pwd) + +function get_path { + if [[ "$1" = /* ]] + then + echo "$1" + else + echo "$BASE_DIR/$1" + fi +} + +BASE_WDIR=$(get_path $1) +TARGET_DIR=$(get_path $5) +STOPWORDS=$(get_path $3) +WDIR="$BASE_WDIR/$2" + +if [[ "$opennlp" != "None" ]]; then + opennlp=$(get_path $opennlp) +fi +if [[ "$blacklist" != "false" ]]; then + blacklist=$(get_path $blacklist) +fi + +LANGUAGE=`echo $2 | sed "s/_.*//g"` + +echo "Language: $LANGUAGE" +echo "Working directory: $WDIR" + +mkdir -p $WDIR + +######################################################################################################## +# Preparing the data. +######################################################################################################## + +echo "Loading Wikipedia dump..." +if [ -z "$WIKI_MIRROR" ]; then + WIKI_MIRROR="https://dumps.wikimedia.org/" +fi + +WP_DOWNLOAD_FILE=$WDIR/dump.xml +echo Checking for wikipedia dump at $WP_DOWNLOAD_FILE +if [ -f "$WP_DOWNLOAD_FILE" ]; then + echo File exists. +else + echo Downloading wikipedia dump. + if [ "$eval" == "false" ]; then + curl -# "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat > $WDIR/dump.xml + else + curl -# "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat | python $BASE_DIR/scripts/split_train_test.py 1200 $WDIR/heldout.txt > $WDIR/dump.xml + fi +fi + +cd $WDIR +cp $STOPWORDS stopwords.$LANGUAGE.list + +if [ -e "$opennlp/$LANGUAGE-token.bin" ]; then + cp "$opennlp/$LANGUAGE-token.bin" "$LANGUAGE.tokenizer_model" || echo "tokenizer already exists" +else + touch "$LANGUAGE.tokenizer_model" +fi + + +######################################################################################################## +# DBpedia extraction: +######################################################################################################## + +#Download: +echo "Creating DBpedia nt files..." +cd $BASE_WDIR + +if [ -d extraction-framework ]; then + echo "Updating DBpedia Spotlight..." + cd extraction-framework + git reset --hard HEAD + git pull + mvn install +else + echo "Setting up DEF..." + git clone git://github.com/dbpedia/extraction-framework.git + cd extraction-framework + mvn install +fi + +cd dump + +dumpdate=$(date +%Y%m%d) +dumpdir=$WDIR/${LANGUAGE}wiki/${dumpdate} + +mkdir -p $dumpdir +ln -s $WDIR/dump.xml $dumpdir/${LANGUAGE}wiki-${dumpdate}-dump.xml + +cat << EOF > dbpedia.properties +base-dir=$WDIR +wiki=$LANGUAGE +locale=$LANGUAGE +source=dump.xml +require-download-complete=false +languages=$LANGUAGE +ontology=../ontology.xml +mappings=../mappings +uri-policy.uri=uri:en; generic:en; xml-safe-predicates:* +format.nt.gz=n-triples;uri-policy.uri +EOF + +if [[ ",ga,ar,be,bg,bn,ced,cs,cy,da,eo,et,fa,fi,gl,hi,hr,hu,id,ja,lt,lv,mk,mt,sk,sl,sr,tr,ur,vi,war,zh," == *",$LANGUAGE,"* ]]; then #Languages with no disambiguation definitions + echo "extractors=.RedirectExtractor,.MappingExtractor" >> dbpedia.properties +else + echo "extractors=.RedirectExtractor,.DisambiguationExtractor,.MappingExtractor" >> dbpedia.properties +fi + +../run extraction dbpedia.properties + +zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-instance-types*.nt.gz > $WDIR/instance_types.nt +zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-disambiguations-unredirected.nt.gz > $WDIR/disambiguations.nt +zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-redirects.nt.gz > $WDIR/redirects.nt + +rm -Rf $dumpdir + +######################################################################################################## +# Setting up Spotlight: +######################################################################################################## + +cd $BASE_WDIR + +if [ -d dbpedia-spotlight ]; then + echo "Updating DBpedia Spotlight..." + cd dbpedia-spotlight + git reset --hard HEAD + git pull + mvn -T 1C -q clean install +else + echo "Setting up DBpedia Spotlight..." + git clone --depth 1 https://github.com/dbpedia-spotlight/dbpedia-spotlight-model + mv dbpedia-spotlight-model dbpedia-spotlight + cd dbpedia-spotlight + mvn -T 1C -q clean install +fi + + +######################################################################################################## +# Extracting wiki stats: +######################################################################################################## + +cd $BASE_WDIR +rm -Rf wikistatsextractor +git clone --depth 1 https://github.com/dbpedia-spotlight/wikistatsextractor + +# Stop processing if one step fails +set -e + +#Copy results to local: +cd $BASE_WDIR/wikistatsextractor +mvn install exec:java -Dexec.args="--output_folder $WDIR $LANGUAGE $2 $4Stemmer $WDIR/dump.xml $WDIR/stopwords.$LANGUAGE.list" + +if [ "$blacklist" != "false" ]; then + echo "Removing blacklist URLs..." + mv $WDIR/uriCounts $WDIR/uriCounts_all + grep -v -f $blacklist $WDIR/uriCounts_all > $WDIR/uriCounts +fi + +echo "Finished wikistats extraction. Cleaning up..." +rm -f $WDIR/dump.xml + + +######################################################################################################## +# Building Spotlight model: +######################################################################################################## + +#Create the model: +cd $BASE_WDIR/dbpedia-spotlight + +mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateSpotlightModel -Dexec.args="$2 $WDIR $TARGET_DIR $opennlp $STOPWORDS $4Stemmer" + +if [ "$eval" == "true" ]; then + mvn -pl eval exec:java -Dexec.mainClass=org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel -Dexec.args="$TARGET_DIR $WDIR/heldout.txt" > $TARGET_DIR/evaluation.txt +fi + +curl https://raw.githubusercontent.com/dbpedia-spotlight/model-quickstarter/master/model_readme.txt > $TARGET_DIR/README.txt +curl "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2-rss.xml" | grep link | sed -e 's/^.*<link>//' -e 's/<[/]link>.*$//' | uniq >> $TARGET_DIR/README.txt + + +echo "Collecting data..." +cd $BASE_DIR +mkdir -p data/$LANGUAGE && mv $WDIR/*Counts data/$LANGUAGE +gzip $WDIR/*.nt & + +set +e -- GitLab