Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MARVIN-config
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
dbpedia-assoc
MARVIN-config
Commits
f91f905a
Commit
f91f905a
authored
5 years ago
by
Sebastian Hellmann
Browse files
Options
Downloads
Patches
Plain Diff
SH something for me to merge later
parent
ec7c5b57
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
merge_some_into
+231
-0
231 additions, 0 deletions
merge_some_into
with
231 additions
and
0 deletions
merge_some_into
0 → 100644
+
231
−
0
View file @
f91f905a
#!/bin/bash
#+------------------------------------------------------------------------------------------------------------------------------+
#| DBpedia Spotlight - Create database-backed model |
#| @author Joachim Daiber |
#+------------------------------------------------------------------------------------------------------------------------------+
# $1 Working directory
# $2 Locale (en_US)
# $3 Stopwords file
# $4 Analyzer+Stemmer language prefix e.g. Dutch
# $5 Model target folder
export
MAVEN_OPTS
=
"-Xmx26G"
usage
()
{
echo
"index_db.sh"
echo
"usage: ./index_db.sh -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model"
echo
"Create a database-backed model of DBpedia Spotlight for a specified language."
echo
" "
}
opennlp
=
"None"
eval
=
"false"
blacklist
=
"false"
while
getopts
"eo:b:"
opt
;
do
case
$opt
in
o
)
opennlp
=
"
$OPTARG
"
;;
e
)
eval
=
"true"
;;
b
)
blacklist
=
"
$OPTARG
"
;;
esac
done
shift
$((
OPTIND
-
1
))
if
[
$#
!=
5
]
then
usage
exit
fi
BASE_DIR
=
$(
pwd
)
function
get_path
{
if
[[
"
$1
"
=
/
*
]]
then
echo
"
$1
"
else
echo
"
$BASE_DIR
/
$1
"
fi
}
BASE_WDIR
=
$(
get_path
$1
)
TARGET_DIR
=
$(
get_path
$5
)
STOPWORDS
=
$(
get_path
$3
)
WDIR
=
"
$BASE_WDIR
/
$2
"
if
[[
"
$opennlp
"
!=
"None"
]]
;
then
opennlp
=
$(
get_path
$opennlp
)
fi
if
[[
"
$blacklist
"
!=
"false"
]]
;
then
blacklist
=
$(
get_path
$blacklist
)
fi
LANGUAGE
=
`
echo
$2
|
sed
"s/_.*//g"
`
echo
"Language:
$LANGUAGE
"
echo
"Working directory:
$WDIR
"
mkdir
-p
$WDIR
########################################################################################################
# Preparing the data.
########################################################################################################
echo
"Loading Wikipedia dump..."
if
[
-z
"
$WIKI_MIRROR
"
]
;
then
WIKI_MIRROR
=
"https://dumps.wikimedia.org/"
fi
WP_DOWNLOAD_FILE
=
$WDIR
/dump.xml
echo
Checking
for
wikipedia dump at
$WP_DOWNLOAD_FILE
if
[
-f
"
$WP_DOWNLOAD_FILE
"
]
;
then
echo
File exists.
else
echo
Downloading wikipedia dump.
if
[
"
$eval
"
==
"false"
]
;
then
curl -#
"
$WIKI_MIRROR
/
${
LANGUAGE
}
wiki/latest/
${
LANGUAGE
}
wiki-latest-pages-articles.xml.bz2"
| bzcat
>
$WDIR
/dump.xml
else
curl -#
"
$WIKI_MIRROR
/
${
LANGUAGE
}
wiki/latest/
${
LANGUAGE
}
wiki-latest-pages-articles.xml.bz2"
| bzcat | python
$BASE_DIR
/scripts/split_train_test.py 1200
$WDIR
/heldout.txt
>
$WDIR
/dump.xml
fi
fi
cd
$WDIR
cp
$STOPWORDS
stopwords.
$LANGUAGE
.list
if
[
-e
"
$opennlp
/
$LANGUAGE
-token.bin"
]
;
then
cp
"
$opennlp
/
$LANGUAGE
-token.bin"
"
$LANGUAGE
.tokenizer_model"
||
echo
"tokenizer already exists"
else
touch
"
$LANGUAGE
.tokenizer_model"
fi
########################################################################################################
# DBpedia extraction:
########################################################################################################
#Download:
echo
"Creating DBpedia nt files..."
cd
$BASE_WDIR
if
[
-d
extraction-framework
]
;
then
echo
"Updating DBpedia Spotlight..."
cd
extraction-framework
git reset
--hard
HEAD
git pull
mvn
install
else
echo
"Setting up DEF..."
git clone git://github.com/dbpedia/extraction-framework.git
cd
extraction-framework
mvn
install
fi
cd
dump
dumpdate
=
$(
date
+%Y%m%d
)
dumpdir
=
$WDIR
/
${
LANGUAGE
}
wiki/
${
dumpdate
}
mkdir
-p
$dumpdir
ln
-s
$WDIR
/dump.xml
$dumpdir
/
${
LANGUAGE
}
wiki-
${
dumpdate
}
-dump
.xml
cat
<<
EOF
> dbpedia.properties
base-dir=
$WDIR
wiki=
$LANGUAGE
locale=
$LANGUAGE
source=dump.xml
require-download-complete=false
languages=
$LANGUAGE
ontology=../ontology.xml
mappings=../mappings
uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
format.nt.gz=n-triples;uri-policy.uri
EOF
if
[[
",ga,ar,be,bg,bn,ced,cs,cy,da,eo,et,fa,fi,gl,hi,hr,hu,id,ja,lt,lv,mk,mt,sk,sl,sr,tr,ur,vi,war,zh,"
==
*
",
$LANGUAGE
,"
*
]]
;
then
#Languages with no disambiguation definitions
echo
"extractors=.RedirectExtractor,.MappingExtractor"
>>
dbpedia.properties
else
echo
"extractors=.RedirectExtractor,.DisambiguationExtractor,.MappingExtractor"
>>
dbpedia.properties
fi
../run extraction dbpedia.properties
zcat
$dumpdir
/
${
LANGUAGE
}
wiki-
${
dumpdate
}
-instance-types
*
.nt.gz
>
$WDIR
/instance_types.nt
zcat
$dumpdir
/
${
LANGUAGE
}
wiki-
${
dumpdate
}
-disambiguations-unredirected
.nt.gz
>
$WDIR
/disambiguations.nt
zcat
$dumpdir
/
${
LANGUAGE
}
wiki-
${
dumpdate
}
-redirects
.nt.gz
>
$WDIR
/redirects.nt
rm
-Rf
$dumpdir
########################################################################################################
# Setting up Spotlight:
########################################################################################################
cd
$BASE_WDIR
if
[
-d
dbpedia-spotlight
]
;
then
echo
"Updating DBpedia Spotlight..."
cd
dbpedia-spotlight
git reset
--hard
HEAD
git pull
mvn
-T
1C
-q
clean
install
else
echo
"Setting up DBpedia Spotlight..."
git clone
--depth
1 https://github.com/dbpedia-spotlight/dbpedia-spotlight-model
mv
dbpedia-spotlight-model dbpedia-spotlight
cd
dbpedia-spotlight
mvn
-T
1C
-q
clean
install
fi
########################################################################################################
# Extracting wiki stats:
########################################################################################################
cd
$BASE_WDIR
rm
-Rf
wikistatsextractor
git clone
--depth
1 https://github.com/dbpedia-spotlight/wikistatsextractor
# Stop processing if one step fails
set
-e
#Copy results to local:
cd
$BASE_WDIR
/wikistatsextractor
mvn
install exec
:java
-Dexec
.args
=
"--output_folder
$WDIR
$LANGUAGE
$2
$4Stemmer
$WDIR
/dump.xml
$WDIR
/stopwords.
$LANGUAGE
.list"
if
[
"
$blacklist
"
!=
"false"
]
;
then
echo
"Removing blacklist URLs..."
mv
$WDIR
/uriCounts
$WDIR
/uriCounts_all
grep
-v
-f
$blacklist
$WDIR
/uriCounts_all
>
$WDIR
/uriCounts
fi
echo
"Finished wikistats extraction. Cleaning up..."
rm
-f
$WDIR
/dump.xml
########################################################################################################
# Building Spotlight model:
########################################################################################################
#Create the model:
cd
$BASE_WDIR
/dbpedia-spotlight
mvn
-pl
index
exec
:java
-Dexec
.mainClass
=
org.dbpedia.spotlight.db.CreateSpotlightModel
-Dexec
.args
=
"
$2
$WDIR
$TARGET_DIR
$opennlp
$STOPWORDS
$4Stemmer
"
if
[
"
$eval
"
==
"true"
]
;
then
mvn
-pl
eval exec
:java
-Dexec
.mainClass
=
org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel
-Dexec
.args
=
"
$TARGET_DIR
$WDIR
/heldout.txt"
>
$TARGET_DIR
/evaluation.txt
fi
curl https://raw.githubusercontent.com/dbpedia-spotlight/model-quickstarter/master/model_readme.txt
>
$TARGET_DIR
/README.txt
curl
"
$WIKI_MIRROR
/
${
LANGUAGE
}
wiki/latest/
${
LANGUAGE
}
wiki-latest-pages-articles.xml.bz2-rss.xml"
|
grep link
|
sed
-e
's/^.*<link>//'
-e
's/<[/]link>.*$//'
|
uniq
>>
$TARGET_DIR
/README.txt
echo
"Collecting data..."
cd
$BASE_DIR
mkdir
-p
data/
$LANGUAGE
&&
mv
$WDIR
/
*
Counts data/
$LANGUAGE
gzip
$WDIR
/
*
.nt &
set
+e
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment