Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
dbpedia-assoc
MARVIN-config
Commits
52796cff
Commit
52796cff
authored
Jul 09, 2020
by
vehnem
Browse files
download-check frontend
parent
0c62008f
Changes
13
Hide whitespace changes
Inline
Side-by-side
dashboard/src/main/scala/org/dbpedia/release/DataApiServlet.scala
View file @
52796cff
...
...
@@ -6,7 +6,7 @@ import java.util.Calendar
import
org.apache.jena.query.
{
QueryException
,
QueryExecutionFactory
}
import
org.dbpedia.release.config.Config
import
org.dbpedia.release.config.Config.versions
import
org.dbpedia.release.handler.
{
CompletenessHandler
,
ReleaseLogHandler
}
import
org.dbpedia.release.handler.
{
CompletenessHandler
,
InputDumpHandler
,
ReleaseLogHandler
}
import
org.dbpedia.release.model.VersionStatus
import
org.json4s.
{
DefaultFormats
,
Formats
}
import
org.scalatra._
...
...
@@ -75,7 +75,10 @@ class DataApiServlet(implicit val swagger: Swagger)
val
group
=
params
(
"group"
)
val
version
=
params
(
"version"
)
InputDumpHandler
.
getDownloadChecks
(
group
,
version
)
match
{
case
Some
(
states
)
=>
states
case
_
=>
"{}"
}
}
get
(
"/release/completeness/:group/:version"
)
{
...
...
dashboard/src/main/scala/org/dbpedia/release/Main.scala
0 → 100644
View file @
52796cff
package
org.dbpedia.release
import
java.io.
{
BufferedInputStream
,
FileInputStream
}
import
java.util.Properties
import
scala.collection.mutable
import
scala.jdk.CollectionConverters._
/**
 * Ad-hoc diagnostic program: loads an extraction `.properties` file, derives the set of
 * artifacts configured per language and prints a few summaries to stdout.
 *
 * The properties file is taken from the first command line argument when one is given,
 * otherwise from the historical hard-coded location.
 */
object Main extends App {

  /**
   * Known artifact names. NOTE(review): the list was presumably generated with the shell
   * snippet below - confirm before relying on it.
   *
   * for f in $(find . -regex '.*2020.04.01.*\.ttl\.bz2')
   * do
   * echo $f | cut -d '/' -f4 | sed 's|\.ttl\.bz2||g' | sed 's|_lang=[a-z]*||g' | sed 's|_commons||g'
   * done
   */
  val databusCVNames: List[String] = List(
    "anchor-text", "article-templates", "article-templates_nested", "categories_articles",
    "categories_labels", "categories_skos", "citations_data", "citations_links",
    "commons-sameas-links", "disambiguations", "external-links", "geo-coordinates", "homepages",
    "infobox-properties", "infobox-property-definitions", "interlanguage-links", "labels",
    "page_ids", "page_length", "persondata", "redirects", "redirects_transitive",
    "revisions_ids", "revisions_uris", "topical-concepts", "wikilinks", "wikipedia-links"
  )

  /**
   * Maps a language code as written in the configuration to the code used as key of
   * [[extractorMap]]; codes without a special case are returned unchanged.
   *
   * @param lang language code from the configuration
   * @return the normalised language code
   */
  def finalLang(lang: String): String = lang match {
    case "bat-smg"    => "batsmg"
    case "zh-yue"     => "yue"
    case "zh-min-nan" => "nan"
    case _            => lang
  }

  /** Artifacts associated with each configured extractor name (empty list: none known). */
  val artifactsByExtractor: Map[String, List[String]] = Map(
    ".AnchorTextExtractor" -> List("anchor-text"),
    ".ExternalLinksExtractor" -> List("external-links"),
    ".LabelExtractor" -> List("labels"),
    ".CitationExtractor" -> List("citations_data", "citations_links"),
    ".DisambiguationExtractor" -> List("disambiguations"),
    ".ArticleTemplatesExtractor" -> List("article-templates", "article-templates_nested"),
    ".HomepageExtractor" -> List("homepages"),
    ".RedirectExtractor" -> List("redirects", "redirects_transitive"), // redirects_transitive are due post-processing
    ".InterLanguageLinksExtractor" -> List("interlanguage-links"),
    ".WikiPageLengthExtractor" -> List("page_length"),
    ".PageLinksExtractor" -> List("wikilinks"),
    ".PageIdExtractor" -> List("page_ids"),
    ".PersondataExtractor" -> List("persondata"),
    ".TopicalConceptsExtractor" -> List("topical-concepts"),
    ".SkosCategoriesExtractor" -> List("categories_skos"),
    ".CategoryLabelExtractor" -> List("categories_labels"),
    ".InfoboxExtractor" -> List("infobox-properties", "infobox-property-definitions"),
    ".ArticleCategoriesExtractor" -> List("categories_articles"),
    ".CommonsResourceExtractor" -> List("commons-sameas-links"),
    ".GeoExtractor" -> List("geo-coordinates"),
    ".ProvenanceExtractor" -> List("revisions_uris"),
    ".RevisionIdExtractor" -> List("revisions_ids"),
    ".ArticlePageExtractor" -> List("wikipedia-links"),
    // Unknown
    ".WikiPageOutDegreeExtractor" -> List(),
    ".GalleryExtractor" -> List(),
    ".ContributorExtractor" -> List(),
    ".TemplateParameterExtractor" -> List(),
    ".ImageAnnotationExtractor" -> List(),
    ".CommonsKMLExtractor" -> List(),
    ".fr.PopulationExtractor" -> List(),
    ".FileTypeExtractor" -> List(),
    ".PndExtractor" -> List(),
    ".DBpediaResourceExtractor" -> List()
  )

  /** Location used when no path is passed on the command line. */
  private val defaultConfigPath =
    "/home/marvin/src/git.informatik.uni-leipzig.de/dbpedia-assoc/marvin-config/extractionConfiguration/extraction.generic.properties"

  // `args` is wrapped in Option so that a missing argument array cannot cause an NPE.
  private val configPath = Option(args).flatMap(_.headOption).getOrElse(defaultConfigPath)

  val properties = new Properties()
  val extractorMap = new mutable.HashMap[String, Set[String]]()

  // Using.resource closes the stream even when loading throws.
  scala.util.Using.resource(new BufferedInputStream(new FileInputStream(configPath))) { in =>
    properties.load(in)
  }
  // TODO en.properties (impl multiple configs loadable/merge)

  /** Splits a comma separated property value into trimmed, non-empty entries (null-safe). */
  private def csv(value: String): Array[String] =
    Option(value).getOrElse("").split(',').map(_.trim).filter(_.nonEmpty)

  /** Resolves a comma separated extractor list to artifacts; unknown names are kept as-is. */
  private def artifactsOf(extractorList: String): Set[String] =
    csv(extractorList).flatMap(e => artifactsByExtractor.getOrElse(e, List(e))).toSet

  val defaultExtractors: Set[String] = artifactsOf(properties.getProperty("extractors"))

  // Every configured language (plus "en") starts out with the default artifacts.
  (csv(properties.getProperty("languages")) :+ "en").foreach { lang =>
    extractorMap(finalLang(lang)) = defaultExtractors
  }

  val LANG = "extractors\\.(.*)".r

  // Keys of the form `extractors.<lang>` add language specific artifacts on top of the defaults.
  properties.asScala.foreach {
    case (LANG(lang), v) => extractorMap(finalLang(lang)) = artifactsOf(v) ++ defaultExtractors
    case _               => ()
  }

  // TODO consider downloads.properties for this
  extractorMap.remove("ced")
  extractorMap.remove("mt")
  extractorMap.remove("commons")

  //  /* find unused but configured extractors */
  //  extractorMap.values.foldRight(Set[String]())( (a,b) => {
  //    a.filter(_.startsWith(".")) ++ b
  //  }).foreach(println)
  //

  // Print, per artifact, in how many languages it is configured.
  extractorMap.values
    .flatMap(_.toList)
    .groupBy(identity)
    .map { case (artifact, occurrences) => artifact -> occurrences.size }
    .foreach(println)

  hr()

  // The original literal repeated most codes; a Set keeps each element only once anyway.
  val pagesLangs: Set[String] = Set(
    "af", "als", "am", "an", "ar", "arz", "ast", "az", "azb", "ba", "bar", "batsmg", "be", "bg",
    "bn", "bpy", "br", "bs", "bug", "ca", "cdo", "ce", "ceb", "ckb", "cs", "cv", "cy", "da", "de",
    "el", "eml", "en", "eo", "es", "et", "eu", "fa", "fi", "fo", "fr", "fy", "ga", "gd", "gl",
    "gu", "he", "hi", "hr", "hsb", "ht", "hu", "hy", "ia", "id", "ilo", "io", "is", "it", "ja",
    "jv", "ka", "kk", "kn", "ko", "ku", "ky", "la", "lb", "li", "lmo", "lt", "lv", "mai", "mg",
    "mhr", "min", "mk", "ml", "mn", "mr", "mrj", "ms", "my", "mzn", "nan", "nap", "nds", "ne",
    "new", "nl", "nn", "no", "oc", "or", "os", "pa", "pl", "pms", "pnb", "pt", "qu", "ro", "ru",
    "sa", "sah", "scn", "sco", "sd", "sh", "si", "simple", "sk", "sl", "sq", "sr", "su", "sv",
    "sw", "ta", "te", "tg", "th", "tl", "tr", "tt", "uk", "ur", "uz", "vec", "vi", "vo", "wa",
    "war", "wuu", "xmf", "yi", "yo", "yue", "zh"
  )

  hr()

  printQuoted(pagesLangs)

  /* check if something is missing */
  //  val pageIdsConfig = extractorMap.filter(_._2.contains("page_ids")).keys.toSet
  //
  //  pageIdsConfig.filterNot(pagesLangs).foreach(println)
  //
  //  hr
  //
  //  pagesLangs.filterNot(pageIdsConfig).foreach(println)

  /** Prints a horizontal separator line. */
  private def hr(): Unit =
    println("--------------------------------------------------------------------------------------")

  /** Prints the given codes in ascending order, each wrapped in double quotes and followed by ", ". */
  private def printQuoted(langs: Set[String]): Unit =
    langs.toList.sorted.foreach(l => print("\"" + l + "\", "))

  configuredButNotExtracted()

  /** Prints the hard-coded list of configured language codes, sorted and quoted. */
  private def configuredButNotExtracted(): Unit = {
    val c = ("af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,bpy,br,bs,bug,ca,cdo,ceb,ce,ckb,cs,cv,cy,da,de,el," +
      "eml,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb," +
      "li,lmo,lt,lv,mai,mg,mhr,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru," +
      "sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,wa,war,wuu,xmf,yi,yo," +
      "zh,zh-min-nan,zh-yue,commons,en")
      .split(",").toSet

    println()
    printQuoted(c)

    //    val d = extractorMap.keys.toSet
    //
    //    d.filterNot(c).foreach(println)
  }
}
dashboard/src/main/scala/org/dbpedia/release/config/Config.scala
View file @
52796cff
...
...
@@ -127,6 +127,38 @@ object Config {
}
}
/** Static description of the extraction inputs: expected wikis per group and download-check URLs. */
object extractionInput {

  // TODO handle batsmg nan yue

  /** Wikis expected for the "mappings" group. */
  private val mappingsWikis: List[String] = List(
    "ar", "az", "be", "bg", "bn", "ca", "commons", "cs", "cy", "de", "el", "en", "eo", "es", "eu",
    "fr", "ga", "gl", "hi", "hr", "hu", "hy", "id", "it", "ja", "ko", "lv", "mk", "nl", "pl", "pt",
    "ro", "ru", "sk", "sl", "sr", "sv", "tr", "uk", "vi"
  )

  /** Wikis expected for the "generic" group. */
  private val genericWikis: List[String] = List(
    "af", "als", "am", "an", "ar", "arz", "ast", "az", "azb", "ba", "bar", "bat_smg", "be", "bg",
    "bn", "bpy", "br", "bs", "bug", "ca", "cdo", "ce", "ceb", "ckb", "commons", "cs", "cv", "cy",
    "da", "de", "el", "eml", "en", "eo", "es", "et", "eu", "fa", "fi", "fo", "fr", "fy", "ga", "gd",
    "gl", "gu", "he", "hi", "hr", "hsb", "ht", "hu", "hy", "ia", "id", "ilo", "io", "is", "it",
    "ja", "jv", "ka", "kk", "kn", "ko", "ku", "ky", "la", "lb", "li", "lmo", "lt", "lv", "mai",
    "mg", "mhr", "min", "mk", "ml", "mn", "mr", "mrj", "ms", "my", "mzn", "nap", "nds", "ne", "new",
    "nl", "nn", "no", "oc", "or", "os", "pa", "pl", "pms", "pnb", "pt", "qu", "ro", "ru", "sa",
    "sah", "scn", "sco", "sd", "sh", "si", "simple", "sk", "sl", "sq", "sr", "su", "sv", "sw", "ta",
    "te", "tg", "th", "tl", "tr", "tt", "uk", "ur", "uz", "vec", "vi", "vo", "wa", "war", "wuu",
    "xmf", "yi", "yo", "zh", "zh_min_nan", "zh_yue"
  )

  /**
   * Lists the wikis expected for an extraction group.
   *
   * @param group "mappings", "generic" or "wikidata"
   * @return the wiki codes of that group, or an empty list for any other group name
   */
  def wikisByGroup(group: String): List[String] =
    if (group == "mappings") mappingsWikis
    else if (group == "generic") genericWikis
    else if (group == "wikidata") List("wikidata")
    else List()

  /** Base URLs under which the download-check listings of each group are published. */
  object downloadCheckUrl {
    val mappings = new URL("http://dbpedia-mappings.tib.eu/logs/download-checks/")
    // NOTE(review): identical to `mappings` although `wikidata` uses its own host - possibly a
    // copy-paste slip; confirm the intended host for the generic group.
    val generic = new URL("http://dbpedia-mappings.tib.eu/logs/download-checks/")
    val wikidata = new URL("http://dbpedia-wikidata.tib.eu/logs/download-checks/")
  }
}
object
extractionLogs
{
object
baseUrl
{
val
mappings
=
new
URL
(
"http://dbpedia-mappings.tib.eu/logs/mappings/"
)
...
...
@@ -143,6 +175,15 @@ object Config {
val
unredirected
=
"unRedirected/"
}
/**
 * Description text for each log name. Two of the descriptions embed an HTML anchor,
 * so the values are markup rather than plain text.
 */
val descriptionsBylogName = List(
  (name.downloadMappings,
    "Download of latest mappings from <a href=\"http://mappings.dbpedia.org\">mappings.dbpedia.org</a>"),
  (name.downloadOntology,
    "Download of latest DBpedia ontology"),
  (name.downloadWikidumps,
    "Download of latest Wiki-Dumps from <a href=\"http://dumps.wikimedia.org\">dumps.wikimedia.org</a>"),
  (name.extraction,
    "DIEF extraction process"),
  (name.postProcess,
    "Post-processing of redirects and more"),
  (name.unredirected,
    "Files with unresolved redirects (pre post-processing)")
).toMap
val
names
:
Array
[
String
]
=
Array
(
name
.
downloadMappings
,
...
...
dashboard/src/main/scala/org/dbpedia/release/handler/InputDumpHandler.scala
View file @
52796cff
package
org.dbpedia.release.handler
import
java.io.
{
BufferedReader
,
InputStreamReader
}
import
java.net.URL
import
java.util.stream.Stream
import
org.dbpedia.release.config.Config
import
org.dbpedia.release.model.
{
DownloadStatus
,
DownloadsStatus
}
import
scala.collection.mutable.ListBuffer
import
scala.util.matching.Regex
object
InputDumpHandler
{
/**
 * Looks up the download-check base URL configured for an extraction group.
 *
 * @param group "mappings", "generic" or "wikidata"
 * @return the configured URL, or `None` for any other group name
 */
private def getDownloadCheckUrl(group: String): Option[URL] = {
  val checkUrls = Config.extractionInput.downloadCheckUrl
  if (group == "mappings") Some(checkUrls.mappings)
  else if (group == "generic") Some(checkUrls.generic)
  else if (group == "wikidata") Some(checkUrls.wikidata)
  else None
}
// Whole-line pattern capturing the target of an `<a href="...">` whose value starts with a
// letter and contains no whitespace.
private val HrefLink: Regex = new Regex(".*<a href=\"([a-zA-Z]\\S*)\">.*")

// File name of the form `<prefix>wiki.FAILED`; captures the prefix.
private val FailedDownload: Regex = new Regex("(.*)wiki\\.FAILED")

// File name of the form `<prefix>wiki.SUCCESS`; captures the prefix.
private val SucceededDownload: Regex = new Regex("(.*)wiki\\.SUCCESS")
/**
 * Reads the remote download-check listing of a group/version and classifies the expected wikis.
 *
 * The listing at `<group base URL>/<version>/` is scanned line by line for links to
 * `<lang>wiki.SUCCESS` / `<lang>wiki.FAILED` marker files. For every expected wiki the maximum
 * of its reported states (per the ordering of the state type) decides whether it is filed
 * under done or failed.
 *
 * @param group   extraction group, see `getDownloadCheckUrl`
 * @param version version directory below the group's download-check URL
 * @return `None` when the group is unknown or the listing could not be read or evaluated;
 *         otherwise a `DownloadsStatus` built from (done wikis, expected wikis with neither
 *         a done nor a failed state, failed wikis)
 */
def getDownloadChecks(group: String, version: String): Option[DownloadsStatus] = {
  import scala.jdk.CollectionConverters._
  import scala.util.Using
  import scala.util.control.NonFatal

  // Turns one line of the listing into a status, if it links to a marker file.
  def parseLine(line: String): Option[DownloadStatus] = line match {
    case HrefLink(fileName) =>
      fileName match {
        case SucceededDownload(lang) => Some(DownloadStatus(lang, DownloadStatus.DONE))
        case FailedDownload(lang)    => Some(DownloadStatus(lang, DownloadStatus.FAIL))
        case other                   => println(other); None // unrelated link: log and ignore
      }
    case _ => None
  }

  try {
    getDownloadCheckUrl(group).map { (downloadCheckUrl: URL) =>
      val done = new ListBuffer[String]
      //val wait = new ListBuffer[String]
      val fail = new ListBuffer[String]

      val expected = Config.extractionInput.wikisByGroup(group).toSet
      val listingUrl = new URL(downloadCheckUrl, version + "/")

      // The listing is materialised inside Using.resource so the connection is closed
      // before the statuses are evaluated, even if reading fails half-way.
      val remoteStates: Array[DownloadStatus] =
        Using.resource(new BufferedReader(new InputStreamReader(listingUrl.openStream()))) { reader =>
          reader.lines().iterator().asScala.flatMap(parseLine).toArray
        }

      remoteStates.groupBy(_.lang).foreach { case (lang, states) =>
        // A state other than DONE/FAIL raises a MatchError, which ends up in the catch below.
        if (expected.contains(lang)) states.map(_.state).max match {
          case DownloadStatus.DONE => done.append(lang)
          //case DownloadStatus.WAIT => wait.append(lang) // TODO if remote tracks waiting
          case DownloadStatus.FAIL => fail.append(lang)
        }
      }

      DownloadsStatus(done.toList, expected.filterNot((done ++ fail).toSet).toList, fail.toList)
    }
  } catch {
    // Best effort: callers treat None as "no data"; fatal errors are left to propagate.
    case NonFatal(e) =>
      System.err.println("download check for " + group + "/" + version + " failed: " + e)
      None
  }
}