functions.sh 6.79 KB
Newer Older
Sebastian Hellmann's avatar
Sebastian Hellmann committed
1
2
#!/bin/bash

Sebastian Hellmann's avatar
Sebastian Hellmann committed
3
4
5
6
7
8

# Help text describing the single GROUP argument shared by the entry-point
# scripts. The \$ escapes keep "$GROUP" literal in the printed message.
HELP="description:
marvin_extraction_run.sh and databus-release.sh take one argument, which is the extraction group
selects download.\$GROUP.properties and extraction.\$GROUP.properties from extractionConfig dir and uses \$GROUP as a path.

usage: 
./marvin_extraction_run.sh {test|generic|generic.en|mappings|wikidata|text|sparktestgeneric}
"
11
12
13
14
15
16
17

##############
# setup paths
##############

# Directory that contains this file. Resolved through BASH_SOURCE so the
# result is the same whether the file is executed or sourced.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
CONFIGDIR="$ROOT/extractionConfiguration"
# One log directory per calendar day; created on load.
LOGDIR="$ROOT/logs/$(date +%Y-%m-%d)" && mkdir -p "$LOGDIR"
DATABUSDIR="$ROOT/databus-poms"
MARVINEXTRACTIONDIR="$ROOT/marvin-extraction"
DIEFDIR="$MARVINEXTRACTIONDIR/extraction-framework"
# Base directory handed to the post-processing steps; created on load.
# Paths are quoted so a checkout location containing whitespace still works.
EXTRACTIONBASEDIR="$MARVINEXTRACTIONDIR/wikidumps" && mkdir -p "$EXTRACTIONBASEDIR"
23
24
25
26
27

##############
# functions
##############

Sebastian Hellmann's avatar
Sebastian Hellmann committed
28
# extract data
Sebastian Hellmann's avatar
Sebastian Hellmann committed
29
#######################################
# Runs the "extraction" task with the properties file of the current group.
# Globals:   DIEFDIR, CONFIGDIR, GROUP (read)
# Outputs:   stdout of the run script is redirected to stderr
# Returns:   non-zero if $DIEFDIR/dump cannot be entered, otherwise the
#            status of the run script
# Note:      changes the working directory to $DIEFDIR/dump
#######################################
extractDumps() {
    # Abort instead of resolving ../run against an unrelated directory.
    cd "$DIEFDIR/dump" || return
    ../run extraction "$CONFIGDIR/extraction.$GROUP.properties" >&2
}

Sebastian Hellmann's avatar
Sebastian Hellmann committed
34

kurzum's avatar
kurzum committed
35
# post-processing, see http://dev.dbpedia.org/Post-Processing
Sebastian Hellmann's avatar
Sebastian Hellmann committed
36
#######################################
# Runs the post-processing steps for the current group and then moves the
# "*_redirected.ttl.bz2" results over their un-redirected counterparts.
# Globals:   DIEFDIR, EXTRACTIONBASEDIR, GROUP, LOGDIR (read)
#            DATASETS (set for known groups; an inherited value is used
#            for any other group)
# Outputs:   progress markers on stdout, tool output on stderr
# Returns:   non-zero if $DIEFDIR/scripts cannot be entered; failures of
#            the individual ../run steps are not treated as fatal
# Note:      changes the working directory to $DIEFDIR/scripts
#######################################
postProcessing() {
    local redirectedFile unredirectedFile i
    local -a redirectedFiles=()

    # Abort instead of resolving ../run against an unrelated directory.
    cd "$DIEFDIR/scripts" || return

    # ResolveTransitiveLinks for all, affects the 'redirects' dataset
    # TODO ResolveTransitiveLinks can take a wikidata interlanguage link parameter, that helps to sort the redirects
    echo "-- ResolveTransitiveLinks"
    ../run ResolveTransitiveLinks "$EXTRACTIONBASEDIR" redirects redirects_transitive .ttl.bz2 @downloaded >&2

    # Datasets for MapObjectURIs
    case "$GROUP" in
        mappings | test)
            DATASETS="mappingbased-objects-uncleaned"
            ;;
        wikidata)
            DATASETS="mappingbased-objects-uncleaned,raw"
            ;;
        generic | generic.en | sparktestgeneric)
            DATASETS="disambiguations,infobox-properties,page-links,persondata,topical-concepts"
            ;;
    esac
    echo "-- MapObjectURIs"
    if [[ -n "${DATASETS:-}" ]]; then
        ../run MapObjectUris "$EXTRACTIONBASEDIR" redirects_transitive .ttl.bz2 "$DATASETS" _redirected .ttl.bz2 @downloaded >&2
    else
        # An empty dataset list would silently shift the remaining arguments.
        echo "[WARN] no MapObjectURIs datasets defined for group \"$GROUP\", skipping" >&2
    fi

    # Datasets with TypeConsistencyCheck
    case "$GROUP" in
        mappings | test | wikidata | generic | generic.en | sparktestgeneric)
            echo "-- TypeConsistencyCheck"
            ../run TypeConsistencyCheck type.consistency.check.properties >&2
            ;;
    esac

    # Handling of redirects, i.e. copy to log and rename old
    echo "-- copying redirects"
    mkdir -p "$LOGDIR/unredirected/"

    # Collect the complete list first so that renaming cannot interfere with
    # the directory traversal; NUL separation keeps unusual paths intact.
    while IFS= read -r -d '' redirectedFile; do
        redirectedFiles+=("$redirectedFile")
    done < <(find "$EXTRACTIONBASEDIR" -name "*_redirected.ttl.bz2" -print0)

    for ((i = 0; i < ${#redirectedFiles[@]}; i++)); do
        redirectedFile="${redirectedFiles[i]}"
        unredirectedFile="${redirectedFile%_redirected.ttl.bz2}.ttl.bz2"
        # Keep a copy of the un-redirected original (never overwriting an
        # earlier copy) before it is replaced.
        if [[ -f "$unredirectedFile" ]]; then
            cp -vn "$unredirectedFile" "$LOGDIR/unredirected/"
        fi
        # Move onto the computed name so that only the file-name suffix is
        # touched, never a "_redirected" occurring earlier in the path.
        mv -f -v "$redirectedFile" "$unredirectedFile" >&2
    done
}

# compress log files
Sebastian Hellmann's avatar
Sebastian Hellmann committed
76
# log files from same day get overwritten, only latest is kept
Sebastian Hellmann's avatar
Sebastian Hellmann committed
77
#######################################
# Compresses every regular file below $LOGDIR with "lbzip2 -f".
# Globals:   LOGDIR (read)
# Returns:   status of the last lbzip2 call (0 when there are no files)
#######################################
archiveLogFiles() {
    local logFile i
    local -a logFiles=()

    # Collect first so that files created during compression are not picked
    # up by the traversal. Quoting "$LOGDIR" also means an empty value makes
    # find fail instead of scanning the current directory.
    while IFS= read -r -d '' logFile; do
        logFiles+=("$logFile")
    done < <(find "$LOGDIR" -type f -print0)

    for ((i = 0; i < ${#logFiles[@]}; i++)); do
        lbzip2 -f "${logFiles[i]}"
    done
}
Sebastian Hellmann's avatar
Sebastian Hellmann committed
80

Sebastian Hellmann's avatar
Sebastian Hellmann committed
81
82


Sebastian Hellmann's avatar
Sebastian Hellmann committed
83
84
85
86
##########################
# Databus Mapping
##########################

Your Name's avatar
Your Name committed
87
# switch case for some language exceptions
Sebastian Hellmann's avatar
Sebastian Hellmann committed
88
89
90
91
92
93
#######################################
# Maps a dump prefix (e.g. "enwiki") to the language part of a target file
# name.
# Arguments: $1 - dump prefix; every occurrence of "wiki" is removed first,
#                 so "wikidatawiki" becomes "data" and "commonswiki"
#                 becomes "commons"
# Outputs:   "_lang=<code>", "_commons", or an empty line for "data"
#######################################
mapLangToContVar() {
    # local: keep the working variable out of the caller's scope
    local lang="${1//wiki/}"
    case "$lang" in
        "bat_smg") echo "_lang=batsmg" ;;
        "zh_min_nan") echo "_lang=nan" ;;
        "zh_yue") echo "_lang=yue" ;;
        "data") echo "" ;;
        "commons") echo "_commons" ;;
        *) echo "_lang=$lang" ;;
    esac
}


#######################################
# Translates an extraction dataset name into its Databus name. In the
# result, the part before the first "_" is used by mapAndCopy as the
# artifact and the part after it is appended to the file name.
# Arguments: $1 - dataset name as it appears in the extracted file name
# Outputs:   mapped name; names without a mapping are echoed unchanged
#######################################
mapNamesToDatabus() {
    case "${1-}" in
        # generic
        "article-templates-nested") echo "article-templates_nested" ;;
        "citation-data") echo "citations_data" ;;
        "citation-links") echo "citations_links" ;;
        "commons-page-links") echo "commons-sameas-links" ;;
        "page-ids") echo "page_ids" ;;
        "page-length") echo "page_length" ;;
        "page-links") echo "wikilinks" ;;
        "article-categories") echo "categories_articles" ;;
        "category-labels") echo "categories_labels" ;;
        "skos-categories") echo "categories_skos" ;;

        # generic and wikidata (listed once; a repeated arm is unreachable)
        "revision-ids") echo "revisions_ids" ;;
        "revision-uris") echo "revisions_uris" ;;

        # mappings
        "mappingbased-objects-disjoint-domain") echo "mappingbased-objects_disjointDomain" ;;
        "mappingbased-objects-disjoint-range") echo "mappingbased-objects_disjointRange" ;;

        # wikidata
        "alias-nmw") echo "alias_nmw" ;;
        "description-nmw") echo "description_nmw" ;;
        "labels-nmw") echo "labels_nmw" ;;
        "mappingbased-properties-reified-qualifiers") echo "mappingbased-properties-reified_qualifiers" ;;
        "mappingbased-objects-uncleaned-redirected") echo "mappingbased-objects" ;;
        "wikidata-duplicate-iri-split") echo "debug_duplicateirisplit" ;;
        "wikidata-r2r-mapping-errors") echo "debug_r2rmappingerrors" ;;
        "wikidata-type-like-statements") echo "debug_typelikestatements" ;;
        "transitive-redirects") echo "redirects_transitive" ;;

        # both mappings and wikidata
        "instance-types") echo "instance-types_specific" ;;
        "instance-types-transitive") echo "instance-types_transitive" ;;

        # no mapping: pass the name through
        *) printf '%s\n' "${1-}" ;;
    esac
}

kurzum's avatar
kurzum committed
144
# copies one extracted file into the databus dir under its mapped name (the symlink variant is commented out)
Your Name's avatar
Your Name committed
145
#######################################
# Copies one extracted file to
#   $DATABUSDIR/dbpedia/$GROUP/<artifact>/<version>/<artifact><contVars>.<extensions>
# where the parts are derived from a file name of the form
#   <lang>-<YYYYMMDD>-<dataset>.<extensions>
# Globals:   DATABUSDIR, GROUP (read)
# Arguments: $1 - path of the extracted file
# Outputs:   verbose cp output on stdout; a [DEBUG] line on stderr when
#            the artifact directory does not exist
# Returns:   1 when the artifact directory is missing (nothing is copied),
#            otherwise the status of mkdir/cp
#######################################
mapAndCopy() {
    # each individual file
    local path=$1
    # local: none of the working variables leak into the caller's scope
    local file version lang extraction extensions
    local mapped contVars artifact targetFolder targetFile

    # split filename
    # how to use ${string##/*}
    # https://www.tldp.org/LDP/abs/html/string-manipulation.html#Substring%20Removal#Substring Removal
    file="${path##*/}"

    # second dash-separated field, reformatted from YYYYMMDD to YYYY.MM.DD
    version="${file#*-}"
    version="${version%%-*}"
    version="${version:0:4}.${version:4:2}.${version:6:2}"

    # first dash-separated field
    lang="${file%%-*}"

    # everything after the second dash, up to the first dot
    extraction="${file#*-*-}"
    extraction="${extraction%%.*}"
    extraction="${extraction/interlanguage-links-/interlanguage-links_lang=}" # generic exception

    # everything after the first dot
    extensions="${file#*.}"

    # map names and languages
    mapped="$(mapNamesToDatabus "$extraction")"
    contVars="$(mapLangToContVar "$lang")"
    if [[ "$mapped" == *"_"* ]]; then
        contVars="${contVars}_${mapped#*_}"
    fi
    artifact="${mapped%%_*}"
    targetFolder="$DATABUSDIR/dbpedia/$GROUP/$artifact/$version"
    targetFile="$artifact$contVars.$extensions"

    if [[ ! -d "$DATABUSDIR/dbpedia/$GROUP/$artifact" ]]; then
        echo "[DEBUG]\"$artifact\" (artifact not found, might not be in group $GROUP) $path" >&2
        # The version folder is not created in this case, so a copy could
        # only fail; stop here with a non-zero status instead.
        return 1
    fi
    mkdir -p "$targetFolder" || return

    # TODO proper handling of "_redirected"
    # TODO see above, redirected are moved to logdir and overwrite the unredirected
    # concerns only generic:
    # < enwiki/20191001/enwiki-20191001-disambiguations_redirected.ttl.bz2
    # < enwiki/20191001/enwiki-20191001-infobox-properties_redirected.ttl.bz2
    # < enwiki/20191001/enwiki-20191001-page-links_redirected.ttl.bz2
    # < enwiki/20191001/enwiki-20191001-persondata_redirected.ttl.bz2
    # < enwiki/20191001/enwiki-20191001-topical-concepts_redirected.ttl.bz2

    # copy
    # TODO comment  after testing
    cp -vn "$path" "$targetFolder/$targetFile"
    # ln -s "$path" "$targetFolder/$targetFile"
    # echo -e "< $path\n> $targetFolder/$targetFile\n----------------------"
}

vehnem's avatar
vehnem committed
199
#######################################
# Prints the GitHub commit URL for the revision currently checked out in
# $DIEFDIR ("@" is git's shorthand for HEAD).
# Globals:   DIEFDIR (read)
# Outputs:   the commit URL on stdout
# Returns:   non-zero, printing nothing, when $DIEFDIR cannot be entered
#            or git cannot resolve the revision
# Note:      changes the working directory to $DIEFDIR
#######################################
diefCommitLink() {
    local commit
    # Without this check a failed cd would report the commit of whatever
    # repository the caller happens to be in.
    cd "$DIEFDIR" || return
    commit="$(git rev-parse @)" || return
    echo "https://github.com/dbpedia/extraction-framework/commit/$commit"
}