Commit 67e2b885, authored 4 years ago by vehnem

Merge branch 'master' of git.informatik.uni-leipzig.de:dbpedia-assoc/marvin-config

Parents: 19072e2a, 20f99236
Showing 2 changed files with 0 additions and 246 deletions:

merge_some_into (+0, −231)
migrate_data_marvin_to_download_server.sh (+0, −15)
merge_some_into  deleted  100644 → 0  (+0, −231)
#!/bin/bash

#+------------------------------------------------------------------------------------------------------------------------------+
#| DBpedia Spotlight - Create database-backed model                                                                              |
#| @author Joachim Daiber                                                                                                        |
#+------------------------------------------------------------------------------------------------------------------------------+

# $1 Working directory
# $2 Locale (en_US)
# $3 Stopwords file
# $4 Analyzer+Stemmer language prefix e.g. Dutch
# $5 Model target folder

export MAVEN_OPTS="-Xmx26G"

usage()
{
     echo "index_db.sh"
     echo "usage: ./index_db.sh -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model"
     echo "Create a database-backed model of DBpedia Spotlight for a specified language."
     echo " "
}

opennlp="None"
eval="false"
blacklist="false"

while getopts "eo:b:" opt; do
  case $opt in
    o) opennlp="$OPTARG";;
    e) eval="true";;
    b) blacklist="$OPTARG";;
  esac
done

shift $((OPTIND - 1))

if [ $# != 5 ]
then
    usage
    exit
fi

BASE_DIR=$(pwd)

function get_path {
  if [[ "$1" = /* ]]
  then
    echo "$1"
  else
    echo "$BASE_DIR/$1"
  fi
}

BASE_WDIR=$(get_path $1)
TARGET_DIR=$(get_path $5)
STOPWORDS=$(get_path $3)
WDIR="$BASE_WDIR/$2"

if [[ "$opennlp" != "None" ]]; then
  opennlp=$(get_path $opennlp)
fi
if [[ "$blacklist" != "false" ]]; then
  blacklist=$(get_path $blacklist)
fi

LANGUAGE=`echo $2 | sed "s/_.*//g"`

echo "Language: $LANGUAGE"
echo "Working directory: $WDIR"

mkdir -p $WDIR

########################################################################################################
# Preparing the data.
########################################################################################################

echo "Loading Wikipedia dump..."

if [ -z "$WIKI_MIRROR" ]; then
  WIKI_MIRROR="https://dumps.wikimedia.org/"
fi

WP_DOWNLOAD_FILE=$WDIR/dump.xml
echo Checking for wikipedia dump at $WP_DOWNLOAD_FILE
if [ -f "$WP_DOWNLOAD_FILE" ]; then
  echo File exists.
else
  echo Downloading wikipedia dump.
  if [ "$eval" == "false" ]; then
    curl -# "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat > $WDIR/dump.xml
  else
    curl -# "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2" | bzcat | python $BASE_DIR/scripts/split_train_test.py 1200 $WDIR/heldout.txt > $WDIR/dump.xml
  fi
fi

cd $WDIR
cp $STOPWORDS stopwords.$LANGUAGE.list

if [ -e "$opennlp/$LANGUAGE-token.bin" ]; then
  cp "$opennlp/$LANGUAGE-token.bin" "$LANGUAGE.tokenizer_model" || echo "tokenizer already exists"
else
  touch "$LANGUAGE.tokenizer_model"
fi

########################################################################################################
# DBpedia extraction:
########################################################################################################

#Download:
echo "Creating DBpedia nt files..."
cd $BASE_WDIR

if [ -d extraction-framework ]; then
    echo "Updating DBpedia Spotlight..."
    cd extraction-framework
    git reset --hard HEAD
    git pull
    mvn install
else
    echo "Setting up DEF..."
    git clone git://github.com/dbpedia/extraction-framework.git
    cd extraction-framework
    mvn install
fi

cd dump

dumpdate=$(date +%Y%m%d)
dumpdir=$WDIR/${LANGUAGE}wiki/${dumpdate}

mkdir -p $dumpdir
ln -s $WDIR/dump.xml $dumpdir/${LANGUAGE}wiki-${dumpdate}-dump.xml

cat << EOF > dbpedia.properties
base-dir=$WDIR
wiki=$LANGUAGE
locale=$LANGUAGE
source=dump.xml
require-download-complete=false
languages=$LANGUAGE
ontology=../ontology.xml
mappings=../mappings
uri-policy.uri=uri:en; generic:en; xml-safe-predicates:*
format.nt.gz=n-triples;uri-policy.uri
EOF

if [[ ",ga,ar,be,bg,bn,ced,cs,cy,da,eo,et,fa,fi,gl,hi,hr,hu,id,ja,lt,lv,mk,mt,sk,sl,sr,tr,ur,vi,war,zh," == *",$LANGUAGE,"* ]]; then
    #Languages with no disambiguation definitions
    echo "extractors=.RedirectExtractor,.MappingExtractor" >> dbpedia.properties
else
    echo "extractors=.RedirectExtractor,.DisambiguationExtractor,.MappingExtractor" >> dbpedia.properties
fi

../run extraction dbpedia.properties

zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-instance-types*.nt.gz > $WDIR/instance_types.nt
zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-disambiguations-unredirected.nt.gz > $WDIR/disambiguations.nt
zcat $dumpdir/${LANGUAGE}wiki-${dumpdate}-redirects.nt.gz > $WDIR/redirects.nt

rm -Rf $dumpdir

########################################################################################################
# Setting up Spotlight:
########################################################################################################

cd $BASE_WDIR

if [ -d dbpedia-spotlight ]; then
    echo "Updating DBpedia Spotlight..."
    cd dbpedia-spotlight
    git reset --hard HEAD
    git pull
    mvn -T 1C -q clean install
else
    echo "Setting up DBpedia Spotlight..."
    git clone --depth 1 https://github.com/dbpedia-spotlight/dbpedia-spotlight-model
    mv dbpedia-spotlight-model dbpedia-spotlight
    cd dbpedia-spotlight
    mvn -T 1C -q clean install
fi

########################################################################################################
# Extracting wiki stats:
########################################################################################################

cd $BASE_WDIR
rm -Rf wikistatsextractor
git clone --depth 1 https://github.com/dbpedia-spotlight/wikistatsextractor

# Stop processing if one step fails
set -e

#Copy results to local:
cd $BASE_WDIR/wikistatsextractor
mvn install exec:java -Dexec.args="--output_folder $WDIR $LANGUAGE $2 $4Stemmer $WDIR/dump.xml $WDIR/stopwords.$LANGUAGE.list"

if [ "$blacklist" != "false" ]; then
  echo "Removing blacklist URLs..."
  mv $WDIR/uriCounts $WDIR/uriCounts_all
  grep -v -f $blacklist $WDIR/uriCounts_all > $WDIR/uriCounts
fi

echo "Finished wikistats extraction. Cleaning up..."
rm -f $WDIR/dump.xml

########################################################################################################
# Building Spotlight model:
########################################################################################################

#Create the model:
cd $BASE_WDIR/dbpedia-spotlight
mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateSpotlightModel -Dexec.args="$2 $WDIR $TARGET_DIR $opennlp $STOPWORDS $4Stemmer"

if [ "$eval" == "true" ]; then
  mvn -pl eval exec:java -Dexec.mainClass=org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel -Dexec.args="$TARGET_DIR $WDIR/heldout.txt" > $TARGET_DIR/evaluation.txt
fi

curl https://raw.githubusercontent.com/dbpedia-spotlight/model-quickstarter/master/model_readme.txt > $TARGET_DIR/README.txt
curl "$WIKI_MIRROR/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2-rss.xml" | grep link | sed -e 's/^.*<link>//' -e 's/<[/]link>.*$//' | uniq >> $TARGET_DIR/README.txt

echo "Collecting data..."
cd $BASE_DIR
mkdir -p data/$LANGUAGE && mv $WDIR/*Counts data/$LANGUAGE
gzip $WDIR/*.nt &

set +e
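
For orientation, the flags handled by getopts (-o OpenNLP model folder, -e held-out evaluation, -b blacklist file) and the five positional arguments documented in the header combine into an invocation like the following sketch. The paths mirror the usage() example and are illustrative only; note the file is stored in this repository as merge_some_into even though its usage text refers to index_db.sh:

# build a Dutch model: working dir, locale, stopwords file, stemmer prefix, target dir
./merge_some_into -e -o /data/spotlight/nl/opennlp \
    wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model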
migrate_data_marvin_to_download_server.sh  deleted  100755 → 0  (+0, −15)
#!/bin/bash

# ./marvin-fetch.sh wikidata 2019.08.01

GROUP=$1
VERSION=$2
SERVER=dbpedia-$1.tib.eu

# get artifacts
ARTIFACTS=`xmlstarlet sel -N my=http://maven.apache.org/POM/4.0.0 -t -v "/my:project/my:modules/my:module" $GROUP/pom.xml`

for ARTIFACT in $ARTIFACTS; do
  echo $ARTIFACT
  #scp -rv marvin-fetch@$SERVER:/data/databus-maven-plugin/dbpedia/$GROUP/$a/$VERSION $GROUP/$a/
  rsync -av -e ssh --ignore-existing marvin-fetch@$SERVER:/data/derive/databus-maven-plugin/dbpedia/$GROUP/$ARTIFACT/$VERSION $GROUP/$ARTIFACT
done
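
The header comment shows the intended call pattern (it refers to the script as marvin-fetch.sh). Spelled out as a sketch with illustrative values, a run like the one below would read the module list from wikidata/pom.xml and pull each artifact's 2019.08.01 release from dbpedia-wikidata.tib.eu via rsync over ssh, skipping files that already exist locally:

./migrate_data_marvin_to_download_server.sh wikidata 2019.08.01
# for a hypothetical artifact named "instance-types", the loop body expands to roughly:
#   rsync -av -e ssh --ignore-existing \
#     marvin-fetch@dbpedia-wikidata.tib.eu:/data/derive/databus-maven-plugin/dbpedia/wikidata/instance-types/2019.08.01 \
#     wikidata/instance-types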