diff --git a/databus-release.sh b/databus-release.sh
index d0447ad3fa51c50694a2f9dea005d75b7149546a..6cf8260698f538a19c2b0361c4c15ed08bd03cba 100755
--- a/databus-release.sh
+++ b/databus-release.sh
@@ -38,8 +38,7 @@ cd $DATABUSDIR/dbpedia/$GROUP;
 mvn versions:set -DnewVersion=$(ls * | grep '^[0-9]\{4\}.[0-9]\{2\}.[0-9]\{2\}$' | sort -u | tail -1);
 
 # get git commit link
-GITSHORTHASH=${git log | head -1 | cut -f2 -d ' ' | grep -o "^......." }
-GITHUBLINK=${git log | head -1 | cut -f2 -d ' ' | sed 's|^|https://github.com/dbpedia/extraction-framework/commit/|'}
+GITHUBLINK="$(diefCommitLink)"
 
 PUBLISHER="https://vehnem.github.io/webid.ttl#this";
 
 # TODO marvin: shouldn't this be the web dir directly?
diff --git a/extractionConfiguration/download.text.properties b/extractionConfiguration/download.text.properties
index 44c7d0bd7c6581e92be4aa378385bdf87bb7cb49..27fdec695ae1b2c411296527ad92488c0cb5e957 100644
--- a/extractionConfiguration/download.text.properties
+++ b/extractionConfiguration/download.text.properties
@@ -8,9 +8,7 @@ base-url=http://dumps.wikimedia.your.org/
 # source=pages-articles.xml.bz2
 
 # languages to download
-TODO testing for english
-languages=en
-#,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
+languages=en,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
 
 # Unzip files while downloading? Not necessary, extraction will unzip on the fly. Let's save space.
 unzip=false
@@ -20,4 +18,4 @@ retry-max=5
 retry-millis=10000
 
 #for specific dump dates (e.g. 20170101) if empty: the most recent dump-date is used
-dump-date=
+dump-date=20200201
diff --git a/extractionConfiguration/extraction.test.properties b/extractionConfiguration/extraction.test.properties
index 64ca339722a1c0d9b06309357c63d70b9e86b928..c6a0af37e1a9adb9bbcf21bc5b30636ee715518c 100644
--- a/extractionConfiguration/extraction.test.properties
+++ b/extractionConfiguration/extraction.test.properties
@@ -16,7 +16,40 @@ languages=ro
 
 # extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
-extractors=.MappingExtractor,.RedirectExtractor
+#extractors=.NifExtractor
+extractors=.MappingExtractor
+
+uri-policy.iri=generic:en; xml-safe-predicates:*
+format.ttl.bz2=turtle-triples;uri-policy.iri
+mwc-apiUrl=https://{{LANG}}.wikipedia.org/w/api.php
+mwc-maxRetries=5
+mwc-connectMs=4000
+mwc-readMs=30000
+mwc-sleepFactor=2000
+#parameters specific for the abstract extraction
+abstract-query=&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s
+# the tag path of the XML tags under which the result is expected
+abstract-tags=api,query,pages,page,extract
+# the properties used to specify long- and short abstracts (should not change)
+short-abstracts-property=rdfs:comment
+long-abstracts-property=abstract
+# the short abstract is at least this long
+short-abstract-min-length=200
+
+#parameters specific to the nif extraction
+
+#only extract abstract (not the whole page)
+nif-extract-abstract-only=true
+#the request query string
+nif-query=&format=xml&action=parse&prop=text&page=%s&pageid=%d
+#the xml path of the response
+nif-tags=api,parse,text
+# will leave out the long and short abstract datasets
+nif-isTestRun=false
+# will write all anchor texts for each nif instance
+nif-write-anchor=true
+# write only the anchor text for link instances
+nif-write-link-anchor=true
 
 #extractors.ar=.MappingExtractor,.TopicalConceptsExtractor
 #
diff --git a/extractionConfiguration/extraction.text.en.properties b/extractionConfiguration/extraction.text.en.properties
index 58994e8de7abe985647d767766dcf5fd8c9625c1..2c2cf907432b50072687ad88bfc4fad41dd13761 100644
--- a/extractionConfiguration/extraction.text.en.properties
+++ b/extractionConfiguration/extraction.text.en.properties
@@ -78,7 +78,7 @@ short-abstract-min-length=200
 
 #parameters specific to the nif extraction
 
 #only extract abstract (not the whole page)
-nif-extract-abstract-only=false
+nif-extract-abstract-only=true
 #the request query string
 nif-query=&format=xml&action=parse&prop=text&page=%s&pageid=%d
 #the xml path of the response
diff --git a/extractionConfiguration/extraction.text.properties b/extractionConfiguration/extraction.text.properties
index 8e84911d5d3b05a035fd7683820dc75d46e2f4f2..8366fbf132159f263b5bc1684701551019fa34fc 100644
--- a/extractionConfiguration/extraction.text.properties
+++ b/extractionConfiguration/extraction.text.properties
@@ -5,7 +5,7 @@
 #log-dir= see: ../core/src/main/resources/universal.properties
 
 # WikiPages failed to extract in the first try can be retried with this option (especially interesting when extraction from the mediawiki api)
-retry-failed-pages=false
+#retry-failed-pages=true
 
 # Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly.
 # Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
@@ -23,13 +23,14 @@ require-download-complete=false
 # List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
 # NOTE sync with minidumps
-languages=af,als,am,an,arz,ast,azb,ba,bar,bat-smg,bpy,br,bs,bug,cdo,ce,ceb,ckb,cv,fo,fy,gd,he,hsb,ht,ia,ilo,io,is,jv,ka,kn,ku,ky,la,lb,li,lmo,mai,mg,min,ml,mn,mr,mrj,ms,mt,my,mzn,nah,nap,nds,ne,new,nn,no,oc,or,os,pa,pms,pnb,qu,sa,sah,scn,sco,sh,si,simple,sq,su,sw,ta,te,tg,th,tl,tt,uz,vec,wa,xmf,yo,zh-min-nan,zh-yue
+languages=en,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el,eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo,zh,zh-min-nan,zh-yue
 
 # default namespaces: Main, File, Category, Template
 # we only want abstracts for articles -> only main namespace
-namespaces=Main
+#namespaces=Main
 
 # extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
-
+parallel-processes=8
+run-jobs-in-parallel=true
 extractors=.NifExtractor
 
 # if ontology and mapping files are not given or do not exist, download info from mappings.dbpedia.org
diff --git a/extractionConfiguration/universal.properties.template b/extractionConfiguration/universal.properties.template
index 9b4c59373b21b6ab91dbdded33f0a491c871f54f..544adea116b06ca5c78b901da91a5ed7ff29edfd 100644
--- a/extractionConfiguration/universal.properties.template
+++ b/extractionConfiguration/universal.properties.template
@@ -1,7 +1,8 @@
 # NOTE: this properties files is imported in every extraction process and contains general parameters which only have to be set once for every release
 
 # The DBpedia version to be extracted (in this format: YYYY-MM)
-dbpedia-version=2018-10
+# SH Note: leaving it blank takes latest.
+dbpedia-version=
 
 # Replace with your Wikipedia dump download directory (should not change over the course of a release)
 # base-dir=/data/extraction/wikidumps/
@@ -91,3 +92,7 @@ summarize-exceptions=true
 
 # Options for the SparkExtraction
 spark-master=local[32]
 spark-local-dir=/data/extraction/spark.local.dir/
+
+###########################
+## Abstract and NIF extraction
+
diff --git a/functions.sh b/functions.sh
index d1333627a2864532edb2d164a51ed9e227b91fac..317b9057ce87088de182f3c5216516cb2faff14d 100755
--- a/functions.sh
+++ b/functions.sh
@@ -35,7 +35,8 @@ extractDumps() {
     >&2 ../run sparkextraction $CONFIGDIR/extraction.generic.en.properties;
   elif ["$GROUP" = "text" ]
   then
-    >&2 ../run extraction $CONFIGDIR/extraction.$GROUP.en.properties;
+    #>&2 ../run extraction $CONFIGDIR/extraction.$GROUP.en.properties;
+    >&2 ../run extraction $CONFIGDIR/extraction.$GROUP.properties;
   else
     # run for all
     >&2 ../run extraction $CONFIGDIR/extraction.$GROUP.properties;
@@ -109,19 +110,19 @@ mapNamesToDatabus() {
 
     case "$1" in
         # generic
-#        "article-templates-nested") echo "article-templates_nested";;
-#        "citation-data") echo "citations_data";;
-#        "citation-links") echo "citations_links";;
-#        "commons-page-links") echo "commons-sameas-links";;
-#        "page-ids") echo "page_ids";;
-#        "page-length") echo "page_length";;
-#        "page-links") echo "wikilinks";;
-#        "article-categories") echo "categories_articles";;
-#        "category-labels") echo "categories_labels";;
-#        "skos-categories") echo "categories_skos";;
-#        "revision-ids") echo "revision_ids";;
-#        "revision-uris") echo "revision_uris";;
-#
+        "article-templates-nested") echo "article-templates_nested";;
+        "citation-data") echo "citations_data";;
+        "citation-links") echo "citations_links";;
+        "commons-page-links") echo "commons-sameas-links";;
+        "page-ids") echo "page_ids";;
+        "page-length") echo "page_length";;
+        "page-links") echo "wikilinks";;
+        "article-categories") echo "categories_articles";;
+        "category-labels") echo "categories_labels";;
+        "skos-categories") echo "categories_skos";;
+        "revision-ids") echo "revisions_ids";;
+        "revision-uris") echo "revisions_uris";;
+
         # mappings
         "mappingbased-objects-disjoint-domain") echo "mappingbased-objects_disjointDomain";;
         "mappingbased-objects-disjoint-range") echo "mappingbased-objects_disjointRange";;
@@ -132,8 +133,8 @@
         "labels-nmw") echo "labels_nmw";;
         "mappingbased-properties-reified-qualifiers") echo "mappingbased-properties-reified_qualifiers";;
         "mappingbased-objects-uncleaned-redirected") echo "mappingbased-objects";;
-        "revision-ids") echo "revision_ids";;
-        "revision-uris") echo "revision_uris";;
+        "revision-ids") echo "revisions_ids";;
+        "revision-uris") echo "revisions_uris";;
         "wikidata-duplicate-iri-split") echo "debug_duplicateirisplit";;
         "wikidata-r2r-mapping-errors") echo "debug_r2rmappingerrors";;
         "wikidata-type-like-statements") echo "debug_typelikestatements";;
@@ -199,8 +200,8 @@ mapAndCopy() {
 
 }
 
+diefCommitLink() {
-
-
-
-
+    cd $DIEFDIR
+    echo "https://github.com/dbpedia/extraction-framework/commit/$(git rev-parse @)"
+}
diff --git a/marvin_extraction_run.sh b/marvin_extraction_run.sh
index 8ded91b3dc8f9bfc4ba1ec7464db6ad9a5d0c329..929ce8bad5a984adfc0c8b3521c89f66f5c3a117 100755
--- a/marvin_extraction_run.sh
+++ b/marvin_extraction_run.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+echo "LC_ALL=en_US.UTF-8"
+export LC_ALL=en_US.UTF-8
 
 #######################
diff --git a/merge_some_into b/merge_some_into
new file mode 100644
index 0000000000000000000000000000000000000000..b14431f76d3b591254477932a5743307f568d9bb
--- /dev/null
+++ b/merge_some_into
@@ -0,0 +1,231 @@
+#!/bin/bash
+#+------------------------------------------------------------------------------------------------------------------------------+
+#| DBpedia Spotlight - Create database-backed model                                                                              |
+#| @author Joachim Daiber                                                                                                        |
+#+------------------------------------------------------------------------------------------------------------------------------+
+
+# $1 Working directory
+# $2 Locale (en_US)
+# $3 Stopwords file
+# $4 Analyzer+Stemmer language prefix e.g. Dutch
+# $5 Model target folder
+
+export MAVEN_OPTS="-Xmx26G"
+
+usage ()
+{
+    echo "index_db.sh"
+    echo "usage: ./index_db.sh -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model"
+    echo "Create a database-backed model of DBpedia Spotlight for a specified language."
+    echo " "
+}
+
+
+opennlp="None"
+eval="false"
+blacklist="false"
+
+while getopts "eo:b:" opt; do
+    case $opt in
+        o) opennlp="$OPTARG";;
+        e) eval="true";;
+        b) blacklist="$OPTARG";;
+    esac
+done
+
+
+shift $((OPTIND - 1))
+
+if [ $# != 5 ]
+then
+    usage
+    exit
+fi
+
+BASE_DIR=$(pwd)
+
+function get_path {
+    if [[ "$1" = /* ]]
+    then
+        echo "$1"
+    else
+        echo "$BASE_DIR/$1"
+    fi
+}
+
+BASE_WDIR=$(get_path $1)
+TARGET_DIR=$(get_path $5)
+STOPWORDS=$(get_path $3)
+WDIR="$BASE_WDIR/$2"
+
+if [[ "$opennlp" != "None" ]]; then
+    opennlp=$(get_path $opennlp)
+fi
+if [[ "$blacklist" != "false" ]]; then
+    blacklist=$(get_path $blacklist)
+fi
+
+LANGUAGE=`echo $2 | sed "s/_.*//g"`
+
+echo "Language: $LANGUAGE"
+echo "Working directory: $WDIR"
+
+mkdir -p $WDIR
+
+########################################################################################################
+# Preparing the data.
+########################################################################################################
+
+echo "Loading Wikipedia dump..."
+if [ -z "$WIKI_MIRROR" ]; then
+    WIKI_MIRROR="https://dumps.wikimedia.org/"
+fi
+
+WP_DOWNLOAD_FILE=$WDIR/dump.xml
+echo Checking for wikipedia dump at $WP_DOWNLOAD_FILE
+if [ -f "$WP_DOWNLOAD_FILE" ]; then
+    echo File exists.
+else
+    echo Downloading wikipedia dump.
+    if [ "$eval" == "false" ]; then
+        curl -# "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat > $WDIR/dump.xml
+    else
+        curl -# "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat | python $BASE_DIR/scripts/split_train_test.py 1200 $WDIR/heldout.txt > $WDIR/dump.xml
+    fi
+fi
+
+cd $WDIR
+cp $STOPWORDS stopwords.$LANGUAGE.list
+
+if [ -e "$opennlp/$LANGUAGE-token.bin" ]; then
+    cp "$opennlp/$LANGUAGE-token.bin" "$LANGUAGE.tokenizer_model" || echo "tokenizer already exists"
+else
+    touch "$LANGUAGE.tokenizer_model"
+fi
+
+
+########################################################################################################
+# DBpedia extraction:
+########################################################################################################
+
+#Download:
+echo "Creating DBpedia nt files..."
+cd $BASE_WDIR
+
+if [ -d extraction-framework ]; then
+    echo "Updating DBpedia Spotlight..."
+    cd extraction-framework
+    git reset --hard HEAD
+    git pull
+    mvn install
+else
+    echo "Setting up DEF..."
+    git clone git://github.com/dbpedia/extraction-framework.git
+    cd extraction-framework
+    mvn install
+fi
+
+cd dump
+
+dumpdate=$(date +%Y%m%d)
+dumpdir=$WDIR/${LANGUAGE}wiki/${dumpdate}
+
+mkdir -p $dumpdir
+ln -s $WDIR/dump.xml $dumpdir/${LANGUAGE}wiki-${dumpdate}-dump.xml
+
+cat << EOF > dbpedia.properties
+base-dir=$WDIR
+wiki=$LANGUAGE
+locale=$LANGUAGE
+source=dump.xml
+require-download-complete=false
+languages=$LANGUAGE
+ontology=../ontology.xml
+mappings=../mappings
+uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
+format.nt.gz=n-triples;uri-policy.uri
+EOF
+
+if [[ ",ga,ar,be,bg,bn,ced,cs,cy,da,eo,et,fa,fi,gl,hi,hr,hu,id,ja,lt,lv,mk,mt,sk,sl,sr,tr,ur,vi,war,zh," == *",$LANGUAGE,"* ]]; then #Languages with no disambiguation definitions
+    echo "extractors=.RedirectExtractor,.MappingExtractor" >> dbpedia.properties
+else
+    echo "extractors=.RedirectExtractor,.DisambiguationExtractor,.MappingExtractor" >> dbpedia.properties
+fi
+
+../run extraction dbpedia.properties
+
+zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-instance-types*.nt.gz > $WDIR/instance_types.nt
+zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-disambiguations-unredirected.nt.gz > $WDIR/disambiguations.nt
+zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-redirects.nt.gz > $WDIR/redirects.nt
+
+rm -Rf $dumpdir
+
+########################################################################################################
+# Setting up Spotlight:
+########################################################################################################
+
+cd $BASE_WDIR
+
+if [ -d dbpedia-spotlight ]; then
+    echo "Updating DBpedia Spotlight..."
+    cd dbpedia-spotlight
+    git reset --hard HEAD
+    git pull
+    mvn -T 1C -q clean install
+else
+    echo "Setting up DBpedia Spotlight..."
+    git clone --depth 1 https://github.com/dbpedia-spotlight/dbpedia-spotlight-model
+    mv dbpedia-spotlight-model dbpedia-spotlight
+    cd dbpedia-spotlight
+    mvn -T 1C -q clean install
+fi
+
+
+########################################################################################################
+# Extracting wiki stats:
+########################################################################################################
+
+cd $BASE_WDIR
+rm -Rf wikistatsextractor
+git clone --depth 1 https://github.com/dbpedia-spotlight/wikistatsextractor
+
+# Stop processing if one step fails
+set -e
+
+#Copy results to local:
+cd $BASE_WDIR/wikistatsextractor
+mvn install exec:java -Dexec.args="--output_folder $WDIR $LANGUAGE $2 $4Stemmer $WDIR/dump.xml $WDIR/stopwords.$LANGUAGE.list"
+
+if [ "$blacklist" != "false" ]; then
+    echo "Removing blacklist URLs..."
+    mv $WDIR/uriCounts $WDIR/uriCounts_all
+    grep -v -f $blacklist $WDIR/uriCounts_all > $WDIR/uriCounts
+fi
+
+echo "Finished wikistats extraction. Cleaning up..."
+rm -f $WDIR/dump.xml
+
+
+########################################################################################################
+# Building Spotlight model:
+########################################################################################################
+
+#Create the model:
+cd $BASE_WDIR/dbpedia-spotlight
+
+mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateSpotlightModel -Dexec.args="$2 $WDIR $TARGET_DIR $opennlp $STOPWORDS $4Stemmer"
+
+if [ "$eval" == "true" ]; then
+    mvn -pl eval exec:java -Dexec.mainClass=org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel -Dexec.args="$TARGET_DIR $WDIR/heldout.txt" > $TARGET_DIR/evaluation.txt
+fi
+
+curl https://raw.githubusercontent.com/dbpedia-spotlight/model-quickstarter/master/model_readme.txt > $TARGET_DIR/README.txt
+curl "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2-rss.xml" | grep link | sed -e 's/^.*<link>//' -e 's/<[/]link>.*$//' | uniq >> $TARGET_DIR/README.txt
+
+
+echo "Collecting data..."
+cd $BASE_DIR
+mkdir -p data/$LANGUAGE && mv $WDIR/*Counts data/$LANGUAGE
+gzip $WDIR/*.nt &
+
+set +e
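
# --- Usage sketch for the new diefCommitLink helper (not part of the patch above) ---
# A minimal sketch of how databus-release.sh is expected to pick up diefCommitLink from
# functions.sh. The DIEFDIR value below is a hypothetical example; it must point at a
# local clone of dbpedia/extraction-framework, as configured elsewhere in these scripts.
source ./functions.sh                            # defines diefCommitLink
DIEFDIR=/data/extraction/extraction-framework    # assumed clone location; adjust to your setup
GITHUBLINK="$(diefCommitLink)"                   # command substitution runs in a subshell, so the cd inside does not leak
echo "$GITHUBLINK"                               # prints https://github.com/dbpedia/extraction-framework/commit/<full HEAD sha>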