Commit 1243216e authored by vehnem's avatar vehnem
Browse files

api: completeness & logs

parent 38f7d54e
package org.dbpedia.release.config
import java.net.URL
/**
* Dashboard config
*/
object Config {
// Settings for the release completeness check: one SPARQL query builder per release group.
object completeness {
object query {
// Query template for the "mappings" group. The template currently contains only blank
// lines and `version` is not interpolated into it.
def mappings(version:String): String =
s"""
|
|""".stripMargin
// Query template for the "generic" group. The template currently contains only blank
// lines and `version` is not interpolated into it.
def generic(version:String): String =
s"""
|
|""".stripMargin
// SPARQL SELECT for the "wikidata" group. For every artifact listed in the VALUES table
// it counts the distinct distributions of datasets whose dct:hasVersion string equals
// `version`, computes delta = actual - expected, and the outer FILTER keeps only rows
// whose delta is non-zero.
// NOTE(review): `version` is spliced into the query text unescaped — confirm callers only
// pass trusted version strings.
def wikidata(version: String): String =
s"""PREFIX dataid: <http://dataid.dbpedia.org/ns/core#>
|PREFIX dct: <http://purl.org/dc/terms/>
|PREFIX dcat: <http://www.w3.org/ns/dcat#>
|
|SELECT ?expected_files ?actual_files ?delta ?artifact {
| {SELECT ?expected_files (COUNT(DISTINCT ?distribution) as ?actual_files) ((?actual_files-?expected_files)AS ?delta) ?artifact {
| VALUES (?artifact ?expected_files) {
|( <https://databus.dbpedia.org/dbpedia/wikidata/alias> 2 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/debug> 3 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/description> 2 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/geo-coordinates> 1 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/images> 1 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/instance-types> 2 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/labels> 2 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/mappingbased-literals> 1 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/mappingbased-objects-uncleaned> 2 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/mappingbased-properties-reified> 2 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/ontology-subclassof> 1 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/page> 2 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/properties> 1 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/redirects> 2 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/references> 1 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/revision> 2 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/sameas-all-wikis> 1 )
|( <https://databus.dbpedia.org/dbpedia/wikidata/sameas-external> 1 )
| }
| ?dataset dataid:artifact ?artifact .
| ?dataset dct:hasVersion ?versionString .
| ?dataset dcat:distribution ?distribution .
| FILTER(str(?versionString) = '$version')
| } GROUP BY ?artifact ?expected_files ?actual_files }
| FILTER(?delta != 0)
|}
|""".stripMargin
}
}
// Settings for locating extraction logs.
object extractionLogs {
// One base URL per release group; each ends in "/logs/" so relative paths resolve beneath it.
object baseUrl {
val mappings = new URL("http://dbpedia-mappings.tib.eu/logs/")
val generic = new URL("http://dbpedia-generic.tib.eu/logs/")
val wikidata = new URL("http://dbpedia-wikidata.tib.eu/logs/")
}
// Canonical names of the individual log entries.
object name {
val downloadMappings = "downloadMappings.log"
val downloadOntology = "downloadOntology.log"
val downloadWikidumps = "downloadWikidumps.log"
val extraction = "extraction.log"
val postProcess = "postProcess.log"
// Ends with a slash, unlike the other names — presumably a directory; TODO confirm.
val unredirected = "unRedirected/"
}
// All canonical log names, in a fixed order.
val names: Array[String] =
Array(
name.downloadMappings,
name.downloadOntology,
name.downloadWikidumps,
name.extraction,
name.postProcess,
name.unredirected
)
}
}
package org.dbpedia.release.handler
import org.apache.jena.query.QueryExecutionFactory
import org.dbpedia.release.config.Config
import org.dbpedia.release.model.VersionStatus
import scala.collection.mutable.ListBuffer
/**
 * Runs the per-group completeness query against the Databus SPARQL endpoint and
 * converts each result row into a [[VersionStatus]].
 */
object CompletenessHandler {

  /**
   * Selects the completeness query for a release group.
   *
   * @param group   release group name ("mappings", "generic" or "wikidata")
   * @param version version string handed to the query builder
   * @return the query text, or `None` when the group is not one of the known names
   */
  private def getQuery(group: String, version: String): Option[String] =
    group match {
      case "mappings" => Some(Config.completeness.query.mappings(version))
      case "generic"  => Some(Config.completeness.query.generic(version))
      case "wikidata" => Some(Config.completeness.query.wikidata(version))
      case _          => None
    }

  /**
   * Executes the completeness query of `group` for `version`.
   *
   * Every result row becomes one [[VersionStatus]]; the artifact name is the last
   * path segment of the `?artifact` URI.
   *
   * NOTE(review): the "mappings" and "generic" query templates are blank in Config,
   * so this call presumably fails in the SPARQL layer for those groups — confirm.
   *
   * @param group   release group name
   * @param version release version to check
   * @return `None` for an unknown group, otherwise the statuses built from the result rows
   */
  def getStatus(group: String, version: String): Option[Array[VersionStatus]] =
    getQuery(group, version).map { query =>
      val statuses = new ListBuffer[VersionStatus]
      val execution = QueryExecutionFactory
        .sparqlService("https://databus.dbpedia.org/repo/sparql", query)
      try {
        // The result set is consumed completely here, before the execution is closed.
        execution
          .execSelect()
          .forEachRemaining(resRow => {
            val expectedFiles = resRow.get("?expected_files").asLiteral().getLexicalForm.toInt
            val actualFiles = resRow.get("?actual_files").asLiteral().getLexicalForm.toInt
            val artifact = resRow.get("?artifact").asResource().getURI.split("/").last
            statuses.append(VersionStatus(group, artifact, version, expectedFiles, actualFiles))
          })
      } finally {
        // Release the HTTP connection held by the query execution, even when a row fails to parse.
        execution.close()
      }
      statuses.toArray
    }

  /** Manual smoke test: prints (artifact, actual, expected) for a fixed wikidata version. */
  def main(args: Array[String]): Unit = {
    getStatus("wikidata", "2020.03.01")
      .foreach(_.foreach(x => println((x.artifact, x.actual, x.expected))))
  }
}
package org.dbpedia.release.handler
/** Handler for input dumps; it declares no members yet. */
object InputDumpHandler
package org.dbpedia.release.handler
import java.io.{BufferedReader, InputStreamReader}
import java.net.URL
import java.util.stream.Stream
import org.dbpedia.release.config.Config
import org.dbpedia.release.model.LogFile
import scala.util.matching.Regex
/**
 * Reads the log directory listing of a release group/version over HTTP and reports
 * one [[LogFile]] per canonical log name.
 */
object ReleaseLogHandler {

  /**
   * Looks up the base log URL of a release group.
   *
   * @param group release group name ("mappings", "generic" or "wikidata")
   * @return the configured base URL, or `None` for any other group name
   */
  private def getLogsUrl(group: String): Option[URL] =
    group match {
      case "mappings" => Some(Config.extractionLogs.baseUrl.mappings)
      case "generic"  => Some(Config.extractionLogs.baseUrl.generic)
      case "wikidata" => Some(Config.extractionLogs.baseUrl.wikidata)
      case _          => None
    }

  /** Matches a whole listing line containing `<a href="...">` whose target starts with a letter. */
  private val HrefPattern: Regex = ".*<a href=\"([a-zA-Z]\\S*)\">.*".r

  /**
   * Fetches the listing at `baseUrl/version` and maps every linked file that
   * [[LogFile.apply(baseUrl:java\.net\.URL,fileName:String)*]] recognises to its canonical log name.
   *
   * The result has exactly one entry per name in `Config.extractionLogs.names`, in
   * that order; names without a matching file get a placeholder with an empty URL
   * and state "WAIT". If several files map to the same name, the last one listed wins.
   *
   * @param group   release group name
   * @param version version directory below the group's base log URL
   * @return `None` for an unknown group or when fetching/parsing fails with a
   *         non-fatal error, otherwise the per-name log files
   */
  def getLogFiles(group: String, version: String): Option[Array[LogFile]] = {
    try {
      getLogsUrl(group).map { baseUrl =>
        // Linked file names are resolved relative to the version directory.
        val versionDir = new URL(baseUrl, version + "/")
        val reader = new BufferedReader(
          new InputStreamReader(new URL(baseUrl, version).openStream())
        )
        val found: Map[String, LogFile] =
          try {
            Iterator
              .continually(reader.readLine())
              .takeWhile(_ != null)
              .collect { case HrefPattern(fileName) => fileName }
              // Keep only files LogFile recognises. The Option is used directly; unwrapping
              // it first and then matching on Some(...) would discard every file.
              .flatMap(fileName => LogFile(versionDir, fileName).toList)
              .map(logFile => logFile.logName -> logFile)
              .toMap // forces the iterator while the reader is still open
          } finally {
            reader.close()
          }
        Config.extractionLogs.names.map { logName =>
          found.getOrElse(logName, LogFile("", logName, "WAIT"))
        }
      }
    } catch {
      // Best effort: any non-fatal failure (e.g. unreachable host) yields None.
      case scala.util.control.NonFatal(_) => None
    }
  }

  /** Manual smoke test: prints (logName, state) for a fixed wikidata version. */
  def main(args: Array[String]): Unit = {
    val group = "wikidata"
    val version = "2020-05-01"
    val logFiles = getLogFiles(group, version)
    logFiles.foreach(_.foreach(x => println((x.logName, x.state))))
  }
}
package org.dbpedia.release.model
import java.net.URL
import org.dbpedia.release.config.Config
/** Factory that classifies a file name from a log listing. */
object LogFile {

  /**
   * Builds a [[LogFile]] for `fileName` when its prefix identifies one of the known logs.
   *
   * The state is "DONE" when the file name ends in ".log.bz2" and "RUN" otherwise.
   *
   * @param baseUrl  URL the file name is resolved against
   * @param fileName file name as it appears in the listing
   * @return the classified log file, or `None` when no known prefix matches
   */
  def apply(baseUrl: URL, fileName: String): Option[LogFile] = {
    val canonicalName: Option[String] =
      if (fileName.startsWith("downloadMappings")) Some(Config.extractionLogs.name.downloadMappings)
      else if (fileName.startsWith("downloadOntology")) Some(Config.extractionLogs.name.downloadOntology)
      else if (fileName.startsWith("downloadWikidumps")) Some(Config.extractionLogs.name.downloadWikidumps)
      else if (fileName.startsWith("extraction")) Some(Config.extractionLogs.name.extraction)
      else if (fileName.startsWith("postProcessing")) Some(Config.extractionLogs.name.postProcess)
      else if (fileName.startsWith("unredirected")) Some(Config.extractionLogs.name.unredirected)
      else None

    canonicalName.map { logName =>
      val state = if (fileName.endsWith(".log.bz2")) "DONE" else "RUN"
      new LogFile(new URL(baseUrl, fileName).toString, logName, state)
    }
  }
}

/**
 * One log entry of a release run.
 *
 * @param url     location of the log as a string
 * @param logName canonical log name
 * @param state   state label of the log
 */
case class LogFile(
  url: String,
  logName: String,
  state: String
)
package org.dbpedia.release.model
/**
 * Completeness figures of one artifact in a release version.
 *
 * @param group    release group the artifact belongs to
 * @param artifact artifact name
 * @param version  release version
 * @param expected expected number of files
 * @param actual   number of files actually found
 */
case class VersionStatus(
  group: String,
  artifact: String,
  version: String,
  expected: Int,
  actual: Int
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment