diff --git a/.gitignore b/.gitignore index fcc1021473653666cbe5419c8b769002aaaa0cd0..347a298fe1a88c03a19f89277565024f8f34b56a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -databus-maven-plugin/ +databus-poms/dbpedia/*/*/*/ extraction-framework/ wikidumps/ logs/ diff --git a/databus-poms/dbpedia/generic/anchor-text/anchor-text.md b/databus-poms/dbpedia/generic/anchor-text/anchor-text.md new file mode 100644 index 0000000000000000000000000000000000000000..e4630b88d23f88ec3d6f372b097c31d15bc94cdd --- /dev/null +++ b/databus-poms/dbpedia/generic/anchor-text/anchor-text.md @@ -0,0 +1,3 @@ +# Anchor text of Wikilinks +Human readable text (anchor text) of Wikilinks to refer to other Wikipedia articles + diff --git a/databus-poms/dbpedia/generic/anchor-text/pom.xml b/databus-poms/dbpedia/generic/anchor-text/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..0b89e5a05271a8ba3c5fb2dec8bb8e41d98779ed --- /dev/null +++ b/databus-poms/dbpedia/generic/anchor-text/pom.xml @@ -0,0 +1,19 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2018.08.15</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>anchor-text</artifactId> + <packaging>jar</packaging> + + <properties> + <databus.codeReference>https://github.com/dbpedia/extraction-framework/blob/master/core/src/org/dbpedia/extraction/mappings/AnchorTextExtractor.scala</databus.codeReference> + </properties> + +</project> diff --git a/databus-poms/dbpedia/generic/article-templates/article-templates.md b/databus-poms/dbpedia/generic/article-templates/article-templates.md new file mode 100644 index 
0000000000000000000000000000000000000000..7eddb568cb357eb4549f3babf842eccbaed03a8e --- /dev/null +++ b/databus-poms/dbpedia/generic/article-templates/article-templates.md @@ -0,0 +1,4 @@ +# Used Wikipedia Templates per Article +Dataset contains identifiers for used templates per Wikipedia article using the wikiPageUsesTemplate property + +Template identifiers (objects) use prefix `http://dbpedia.org/resource/Template:` followed by template name. diff --git a/databus-poms/dbpedia/generic/article-templates/pom.xml b/databus-poms/dbpedia/generic/article-templates/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..630bb6060fb3d893137f4b0aac0cf2fa63549b16 --- /dev/null +++ b/databus-poms/dbpedia/generic/article-templates/pom.xml @@ -0,0 +1,19 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>article-templates</artifactId> + <packaging>jar</packaging> + + <properties> + <databus.codeReference>https://github.com/dbpedia/extraction-framework/blob/master/core/src/org/dbpedia/extraction/mappings/ArticleTemplatesExtractor.scala</databus.codeReference> + </properties> + +</project> diff --git a/databus-poms/dbpedia/generic/categories/categories.md b/databus-poms/dbpedia/generic/categories/categories.md new file mode 100644 index 0000000000000000000000000000000000000000..37ef96e4806ddfe43aeaead191c2a70f6579c348 --- /dev/null +++ b/databus-poms/dbpedia/generic/categories/categories.md @@ -0,0 +1,5 @@ +# Wikipedia Article Categories and categories metadata +Contains the Wikipedia categories per Article and category metadata (hierarchy and labels) + +The 
dataset is split into 3 different types of files. The `_articles` files contain the Wikipedia categories assigned per article connected via `dct:subject`. The `skos` files partially describe the Wikipedia Category system modelled with the [SKOS vocabulary](https://www.w3.org/2004/02/skos/). For each category `skos:prefLabel`, parent categories (using `skos:broader`), and `skos:related` categories are extracted. The `labels` files contain the `rdfs:label` of the category resources. + diff --git a/databus-poms/dbpedia/generic/categories/pom.xml b/databus-poms/dbpedia/generic/categories/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..7450a67e0f66c4e48b8a5a6043b70b436598aebd --- /dev/null +++ b/databus-poms/dbpedia/generic/categories/pom.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>categories</artifactId> + <packaging>jar</packaging> + + +</project> diff --git a/databus-poms/dbpedia/generic/citations/citations.md b/databus-poms/dbpedia/generic/citations/citations.md new file mode 100644 index 0000000000000000000000000000000000000000..01b6fd73b23b2bcd003deb19cf7ed4eeec036696 --- /dev/null +++ b/databus-poms/dbpedia/generic/citations/citations.md @@ -0,0 +1,4 @@ +# Citations and References +Contains citations from articles and bibliographic information of the references + +The `_links` files contain the citations of one article by linking the citation URL via `isCitedBy` property. The `_data` files contain bibliographic metadata (e.g. format, title, author, website, accessdate, etc. if available) for the citations. 
diff --git a/databus-poms/dbpedia/generic/citations/pom.xml b/databus-poms/dbpedia/generic/citations/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..d577de4d2ad9fd38d71d50243c0853c36cf03a27 --- /dev/null +++ b/databus-poms/dbpedia/generic/citations/pom.xml @@ -0,0 +1,18 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>citations</artifactId> + <packaging>jar</packaging> + + <properties> + <databus.codeReference>https://github.com/dbpedia/extraction-framework/blob/master/core/src/org/dbpedia/extraction/mappings/CitationExtractor.scala</databus.codeReference> + </properties> +</project> diff --git a/databus-poms/dbpedia/generic/commons-sameas-links/commons-sameas-links.md b/databus-poms/dbpedia/generic/commons-sameas-links/commons-sameas-links.md new file mode 100644 index 0000000000000000000000000000000000000000..e857ed006076abf52978652864248afaccb041bc --- /dev/null +++ b/databus-poms/dbpedia/generic/commons-sameas-links/commons-sameas-links.md @@ -0,0 +1,5 @@ +# Commons sameAs Links +Links non-commons DBpedia resources to their DBpedia Commons counterparts + +Extracts `owl:sameAs` links to Commons resources based on the `{{Commons}}` template. 
+ \ No newline at end of file diff --git a/databus-poms/dbpedia/generic/commons-sameas-links/pom.xml b/databus-poms/dbpedia/generic/commons-sameas-links/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..086db7de31e15e3f033bee39b79b95320ec44333 --- /dev/null +++ b/databus-poms/dbpedia/generic/commons-sameas-links/pom.xml @@ -0,0 +1,19 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>commons-sameas-links</artifactId> + <packaging>jar</packaging> + + <properties> + <databus.codeReference>https://github.com/dbpedia/extraction-framework/blob/master/core/src/org/dbpedia/extraction/mappings/CommonsResourceExtractor.scala</databus.codeReference> + </properties> + +</project> diff --git a/databus-poms/dbpedia/generic/disambiguations/disambiguations.md b/databus-poms/dbpedia/generic/disambiguations/disambiguations.md new file mode 100644 index 0000000000000000000000000000000000000000..f8c9cfd11da3418e6baf6589a5e44a17631fb884 --- /dev/null +++ b/databus-poms/dbpedia/generic/disambiguations/disambiguations.md @@ -0,0 +1,7 @@ +# Entity Disambiguation Links +Disambiguation Links extracted from Wikipedia disambiguation pages + +The links are extracted from [Wikipedia disambiguation](http://en.wikipedia.org/wiki/Wikipedia:Disambiguation) pages. Since Wikipedia has no syntax to distinguish disambiguation links from ordinary links, heuristics are used. 
+ + + diff --git a/databus-poms/dbpedia/generic/disambiguations/pom.xml b/databus-poms/dbpedia/generic/disambiguations/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..e714829bfa232cdae66bc222012ea6f317ee65a8 --- /dev/null +++ b/databus-poms/dbpedia/generic/disambiguations/pom.xml @@ -0,0 +1,19 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>disambiguations</artifactId> + <packaging>jar</packaging> + + <properties> + <databus.codeReference>https://github.com/dbpedia/extraction-framework/blob/master/core/src/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala</databus.codeReference> + </properties> + +</project> diff --git a/databus-poms/dbpedia/generic/external-links/external-links.md b/databus-poms/dbpedia/generic/external-links/external-links.md new file mode 100644 index 0000000000000000000000000000000000000000..f088e43f13cb41b3a26ce00dcd0121971e24c3ad --- /dev/null +++ b/databus-poms/dbpedia/generic/external-links/external-links.md @@ -0,0 +1,4 @@ +# External links +Links extracted from Wikitext to external web pages + +Every link spotted in the Wikitext of a resource will be added via `wikiPageExternalLink` property. 
\ No newline at end of file diff --git a/databus-poms/dbpedia/generic/external-links/pom.xml b/databus-poms/dbpedia/generic/external-links/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..1e2087413d1ab23c9fb9c96475916ed844b092b8 --- /dev/null +++ b/databus-poms/dbpedia/generic/external-links/pom.xml @@ -0,0 +1,20 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>external-links</artifactId> + <packaging>jar</packaging> + + <properties> + <databus.codeReference>https://github.com/dbpedia/extraction-framework/blob/master/core/src/org/dbpedia/extraction/mappings/ExternalLinksExtractor.scala</databus.codeReference> + </properties> + + +</project> diff --git a/databus-poms/dbpedia/generic/geo-coordinates/geo-coordinates.md b/databus-poms/dbpedia/generic/geo-coordinates/geo-coordinates.md new file mode 100644 index 0000000000000000000000000000000000000000..a58abf1fc9b09072a67d7bfa9e488e37e7b4ae4b --- /dev/null +++ b/databus-poms/dbpedia/generic/geo-coordinates/geo-coordinates.md @@ -0,0 +1,2 @@ +# geo-coordinates dataset +help needed diff --git a/databus-poms/dbpedia/generic/geo-coordinates/pom.xml b/databus-poms/dbpedia/generic/geo-coordinates/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..cfdacbac7ce4d75969a886373964166be24330a9 --- /dev/null +++ b/databus-poms/dbpedia/generic/geo-coordinates/pom.xml @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + 
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>geo-coordinates</artifactId> + <packaging>jar</packaging> + +</project> diff --git a/databus-poms/dbpedia/generic/homepages/homepages.md b/databus-poms/dbpedia/generic/homepages/homepages.md new file mode 100644 index 0000000000000000000000000000000000000000..e5a33d1c60d4c6ca09a6d3d381ee72cfb9c4f0a6 --- /dev/null +++ b/databus-poms/dbpedia/generic/homepages/homepages.md @@ -0,0 +1,2 @@ +# homepages dataset +help needed diff --git a/databus-poms/dbpedia/generic/homepages/pom.xml b/databus-poms/dbpedia/generic/homepages/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..5dab398503385b6fe65813ac7827b79de247ccaa --- /dev/null +++ b/databus-poms/dbpedia/generic/homepages/pom.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>homepages</artifactId> + <packaging>jar</packaging> + + +</project> diff --git a/databus-poms/dbpedia/generic/infobox-properties/infobox-properties.md b/databus-poms/dbpedia/generic/infobox-properties/infobox-properties.md new file mode 100644 index 0000000000000000000000000000000000000000..2c212765552a244d16200ece82e0facd56de47e6 --- /dev/null +++ b/databus-poms/dbpedia/generic/infobox-properties/infobox-properties.md @@ -0,0 +1,2 @@ +# Extracted facts from Wikipedia Infoboxes +Data from Wikipedia's 
infoboxes, as is with some smart automatic (generic) parsing. This dataset has the best fact coverage, but has less consistency than the cleaned mapping infobox dataset. diff --git a/databus-poms/dbpedia/generic/infobox-properties/pom.xml b/databus-poms/dbpedia/generic/infobox-properties/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..ff6667b8dc98a3c08e63c8109ddb15bc88001d55 --- /dev/null +++ b/databus-poms/dbpedia/generic/infobox-properties/pom.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>infobox-properties</artifactId> + <packaging>jar</packaging> + + +</project> diff --git a/databus-poms/dbpedia/generic/infobox-property-definitions/infobox-property-definitions.md b/databus-poms/dbpedia/generic/infobox-property-definitions/infobox-property-definitions.md new file mode 100644 index 0000000000000000000000000000000000000000..7759407e9fabfabd5b2a5af747456a86e7d881f7 --- /dev/null +++ b/databus-poms/dbpedia/generic/infobox-property-definitions/infobox-property-definitions.md @@ -0,0 +1,2 @@ +# infobox-property-definitions dataset +help needed diff --git a/databus-poms/dbpedia/generic/infobox-property-definitions/pom.xml b/databus-poms/dbpedia/generic/infobox-property-definitions/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..a31c7431326466bf0cb86d152c8e94ef8ff0b580 --- /dev/null +++ b/databus-poms/dbpedia/generic/infobox-property-definitions/pom.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>infobox-property-definitions</artifactId> + <packaging>jar</packaging> + + +</project> diff --git a/databus-poms/dbpedia/generic/interlanguage-links/interlanguage-links.md b/databus-poms/dbpedia/generic/interlanguage-links/interlanguage-links.md new file mode 100644 index 0000000000000000000000000000000000000000..54767009e0f00c58f5f866376802a9c858376c74 --- /dev/null +++ b/databus-poms/dbpedia/generic/interlanguage-links/interlanguage-links.md @@ -0,0 +1,2 @@ +# interlanguage-links dataset +help needed diff --git a/databus-poms/dbpedia/generic/interlanguage-links/pom.xml b/databus-poms/dbpedia/generic/interlanguage-links/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..63d33001c74755910fc39ea91b55f56d3a5f4ea9 --- /dev/null +++ b/databus-poms/dbpedia/generic/interlanguage-links/pom.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>interlanguage-links</artifactId> + <packaging>jar</packaging> + + +</project> diff --git a/databus-poms/dbpedia/generic/labels/labels.md b/databus-poms/dbpedia/generic/labels/labels.md new file mode 100644 index 0000000000000000000000000000000000000000..58491ac36b33cbc2428f73e7a975754f4f4fd69f --- /dev/null +++ 
b/databus-poms/dbpedia/generic/labels/labels.md @@ -0,0 +1,4 @@ +# Wikipedia page title as rdfs:label +Contains rdfs:label for every resource based on Wikipedia article title + +The Wikipedia page title is extracted as rdfs:label, using the 2-letter language code of the Wikipedia version as the RDF language tag. Only labels for articles (Main namespace) are contained. Labels for categories are stored in the [categories dataset](https://databus.dbpedia.org/dbpedia/generic/categories). diff --git a/databus-poms/dbpedia/generic/labels/pom.xml b/databus-poms/dbpedia/generic/labels/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..1afb10ff15cf9a3e8543fb69865708ecefd4e6ba --- /dev/null +++ b/databus-poms/dbpedia/generic/labels/pom.xml @@ -0,0 +1,19 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>labels</artifactId> + <packaging>jar</packaging> + + <properties> + <databus.codeReference>https://github.com/dbpedia/extraction-framework/blob/master/core/src/main/scala/org/dbpedia/extraction/mappings/LabelExtractor.scala</databus.codeReference> + </properties> + +</project> diff --git a/databus-poms/dbpedia/generic/page/page.md b/databus-poms/dbpedia/generic/page/page.md new file mode 100644 index 
0000000000000000000000000000000000000000..2221f0d11db5562d88b3523ddb40e85737c0dc62 --- /dev/null +++ b/databus-poms/dbpedia/generic/page/pom.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>page</artifactId> + <packaging>jar</packaging> + + +</project> diff --git a/databus-poms/dbpedia/generic/persondata/persondata.md b/databus-poms/dbpedia/generic/persondata/persondata.md new file mode 100644 index 0000000000000000000000000000000000000000..8856b1f723587581d24aca85f704f86bd3ee6e4b --- /dev/null +++ b/databus-poms/dbpedia/generic/persondata/persondata.md @@ -0,0 +1,2 @@ +# persondata dataset +help needed diff --git a/databus-poms/dbpedia/generic/persondata/pom.xml b/databus-poms/dbpedia/generic/persondata/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..f958fc5e615a4b484e16f4a001163eecb5170e9d --- /dev/null +++ b/databus-poms/dbpedia/generic/persondata/pom.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>persondata</artifactId> + <packaging>jar</packaging> + + +</project> diff --git a/databus-poms/dbpedia/generic/pom.xml b/databus-poms/dbpedia/generic/pom.xml new file mode 100644 index 
0000000000000000000000000000000000000000..362b83ee151c74d536d122f603b598315b220760 --- /dev/null +++ b/databus-poms/dbpedia/generic/pom.xml @@ -0,0 +1,220 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <!-- the super-pom deactivates software compilation and configures the plugin to run in default phases --> + <parent> + <groupId>org.dbpedia.databus</groupId> + <artifactId>super-pom</artifactId> + <version>1.3-SNAPSHOT</version> + </parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <packaging>pom</packaging> + <version>2020.02.01</version> + <modules> + <!--module>anchor-text</module--> + <module>article-templates</module> + <module>categories</module> + <module>citations</module> + <module>commons-sameas-links</module> + <module>disambiguations</module> + <module>external-links</module> + <module>geo-coordinates</module> + <module>homepages</module> + <module>infobox-properties</module> + <module>infobox-property-definitions</module> + <module>interlanguage-links</module> + <module>labels</module> + <module>page</module> + <module>persondata</module> + <module>redirects</module> + <module>revisions</module> + <module>topical-concepts</module> + <module>wikilinks</module> + <module>wikipedia-links</module> + </modules> + <properties> + <databus.codeReference>https://github.com/dbpedia/extraction-framework/blob/master/core/src/main/scala/org/dbpedia/extraction/mappings/</databus.codeReference> + <databus.issueTracker>https://github.com/dbpedia/extraction-framework/issues</databus.issueTracker> + <databus.documentationLocation>https://github.com/dbpedia/databus-maven-plugin/blob/master/dbpedia/${project.groupId}/${project.artifactId}</databus.documentationLocation> + 
<databus.feedbackChannel>https://forum.dbpedia.org/c/data/databus/14</databus.feedbackChannel> + <databus.tryVersionAsIssuedDate>true</databus.tryVersionAsIssuedDate> + <databus.packageDirectory> + /media/bigone/25TB/www/downloads.dbpedia.org/repo/lts/${project.groupId}/${project.artifactId} + </databus.packageDirectory> + <databus.downloadUrlPath> + https://downloads.dbpedia.org/repo/lts/${project.groupId}/${project.artifactId}/${project.version}/ + </databus.downloadUrlPath> + <databus.publisher>https://webid.dbpedia.org/webid.ttl#this</databus.publisher> + <databus.license>http://purl.oclc.org/NET/rdflicense/cc-by3.0</databus.license> + <databus.documentation><![CDATA[ +**2018.08.15 - 2018.12.14** +* were created as new modular releases, some issues remain: +* language normalisation to iso codes, zh-min-nan to nan, zh-yue to yue, bat-smg to batsmg (no iso code available) +* we used rapper 2.0.14 to parse and `LC_ALL=C sort` to sort and ascii2uni -a U to unescape unicode +characters +* link to Wikimedia dump version is missing + ]]></databus.documentation> + <!-- used for derive plugin --> + <databus.deriveversion>2019.10.01</databus.deriveversion> + </properties> + + <!-- currently still needed to find the super-pom, once the super-pom is in maven central, + this can be removed as well --> + <repositories> + <repository> + <id>archiva.internal</id> + <name>Internal Release Repository</name> + <url>http://databus.dbpedia.org:8081/repository/internal</url> + </repository> + <repository> + <id>archiva.snapshots</id> + <name>Internal Snapshot Repository</name> + <url>http://databus.dbpedia.org:8081/repository/snapshots</url> + <snapshots> + <updatePolicy>always</updatePolicy> + </snapshots> + </repository> + </repositories> + + <build> + <plugins> + <plugin> + <groupId>org.dbpedia.databus</groupId> + <artifactId>databus-derive-maven-plugin</artifactId> + <version>1.0-SNAPSHOT</version> + <executions> + <execution> + <id>DeriveFromMarvin</id> + <!--phase>initialize</phase--> 
+ <goals> + <goal>clone</goal> + </goals> + + </execution> + </executions> + <configuration> + <skipParsing>false</skipParsing> + <skipCloning>false</skipCloning> + <!-- query +PREFIX dataid: <http://dataid.dbpedia.org/ns/core#> +SELECT distinct (?derive) WHERE { + + ?dataset dataid:group <https://databus.dbpedia.org/marvin/generic> . + ?dataset dataid:artifact ?artifact . + ?dataset dataid:version ?version . + ?dataset dct:hasVersion "2019.08.30"^^xsd:string + BIND (CONCAT("<version>",?artifact,"/${databus.deriveversion}</version>") as ?derive) +} +order by asc(?derive) +--> + <versions> + <version>https://databus.dbpedia.org/marvin/generic/article-templates/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/categories/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/citations/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/commons-sameas-links/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/disambiguations/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/external-links/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/geo-coordinates/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/homepages/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/infobox-properties/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/infobox-property-definitions/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/interlanguage-links/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/labels/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/page/${databus.deriveversion}</version> + 
+<version>https://databus.dbpedia.org/marvin/generic/persondata/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/redirects/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/revisions/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/topical-concepts/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/wikilinks/${databus.deriveversion}</version> + +<version>https://databus.dbpedia.org/marvin/generic/wikipedia-links/${databus.deriveversion}</version> +</versions> + </configuration> + </plugin> + </plugins> + <extensions> + <extension> + <groupId>org.apache.maven.wagon</groupId> + <artifactId>wagon-webdav-jackrabbit</artifactId> + <version>3.0.0</version> + </extension> + </extensions> + </build> + + <profiles> + <profile> + <id>webdav</id> + <build> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>wagon-maven-plugin</artifactId> + <version>2.0.0</version> + <executions> + <execution> + <id>upload-databus</id> + <phase>install</phase> + <goals> + <goal>upload</goal> + </goals> + <configuration> + <fromDir>${project.build.directory}/databus/repo/${project.groupId}/${project.artifactId}</fromDir> + <url>dav:https://downloads.dbpedia.org/repo/</url> + <toDir>dbpedia/${project.groupId}/${project.artifactId}</toDir> + <serverId>downloads-dbpedia-org</serverId> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> + </profile> + </profiles> +</project> diff --git a/databus-poms/dbpedia/generic/redirects/pom.xml b/databus-poms/dbpedia/generic/redirects/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..c8152006609816eef2614efbf313ed6131913186 --- /dev/null +++ b/databus-poms/dbpedia/generic/redirects/pom.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>redirects</artifactId> + <packaging>jar</packaging> + + +</project> diff --git a/databus-poms/dbpedia/generic/redirects/redirects.md b/databus-poms/dbpedia/generic/redirects/redirects.md new file mode 100644 index 0000000000000000000000000000000000000000..aac8e5682e3d851a27b41e355ba17f254ee7202e --- /dev/null +++ b/databus-poms/dbpedia/generic/redirects/redirects.md @@ -0,0 +1,2 @@ +# redirects dataset +help needed diff --git a/databus-poms/dbpedia/generic/revisions/pom.xml b/databus-poms/dbpedia/generic/revisions/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..55a745d1d03660899ce0878dea4af814510767a0 --- /dev/null +++ b/databus-poms/dbpedia/generic/revisions/pom.xml @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>revisions</artifactId> + <packaging>jar</packaging> + +</project> diff --git a/databus-poms/dbpedia/generic/revisions/revisions.md b/databus-poms/dbpedia/generic/revisions/revisions.md new file mode 100644 index 0000000000000000000000000000000000000000..085959df6309cd84d788faaf64e7ce20a6eb5a62 --- /dev/null +++ b/databus-poms/dbpedia/generic/revisions/revisions.md @@ -0,0 +1,2 @@ +# revisions dataset +help needed diff --git 
a/databus-poms/dbpedia/generic/topical-concepts/pom.xml b/databus-poms/dbpedia/generic/topical-concepts/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..2711eb6bd8320f6eb74514498b60ac1d8b12c42b --- /dev/null +++ b/databus-poms/dbpedia/generic/topical-concepts/pom.xml @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>topical-concepts</artifactId> + <packaging>jar</packaging> + +</project> diff --git a/databus-poms/dbpedia/generic/topical-concepts/topical-concepts.md b/databus-poms/dbpedia/generic/topical-concepts/topical-concepts.md new file mode 100644 index 0000000000000000000000000000000000000000..5889b344e4d119e102a8b05b0430cc153ca5d8c9 --- /dev/null +++ b/databus-poms/dbpedia/generic/topical-concepts/topical-concepts.md @@ -0,0 +1,2 @@ +# topical-concepts dataset +help needed diff --git a/databus-poms/dbpedia/generic/wikilinks/pom.xml b/databus-poms/dbpedia/generic/wikilinks/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..34511afca5cfcf6ab5c8dbe81f94512552d76c34 --- /dev/null +++ b/databus-poms/dbpedia/generic/wikilinks/pom.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + 
<artifactId>wikilinks</artifactId> + <packaging>jar</packaging> + + +</project> diff --git a/databus-poms/dbpedia/generic/wikilinks/wikilinks.md b/databus-poms/dbpedia/generic/wikilinks/wikilinks.md new file mode 100644 index 0000000000000000000000000000000000000000..93de4388b22aee49a19b6a33be14dcac3c510242 --- /dev/null +++ b/databus-poms/dbpedia/generic/wikilinks/wikilinks.md @@ -0,0 +1,2 @@ +# wikilinks dataset +help needed diff --git a/databus-poms/dbpedia/generic/wikipedia-links/pom.xml b/databus-poms/dbpedia/generic/wikipedia-links/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..9d1c731d6a99389e1680098c6de34f24d65f44b7 --- /dev/null +++ b/databus-poms/dbpedia/generic/wikipedia-links/pom.xml @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <groupId>generic</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>generic</groupId> + <artifactId>wikipedia-links</artifactId> + <packaging>jar</packaging> + +</project> diff --git a/databus-poms/dbpedia/generic/wikipedia-links/wikipedia-links.md b/databus-poms/dbpedia/generic/wikipedia-links/wikipedia-links.md new file mode 100644 index 0000000000000000000000000000000000000000..ab324eb1910247fb0257838e920527fbcee4c50a --- /dev/null +++ b/databus-poms/dbpedia/generic/wikipedia-links/wikipedia-links.md @@ -0,0 +1,2 @@ +# wikipedia-links dataset +help needed diff --git a/databus-poms/dbpedia/mappings/geo-coordinates-mappingbased/geo-coordinates-mappingbased.md b/databus-poms/dbpedia/mappings/geo-coordinates-mappingbased/geo-coordinates-mappingbased.md new file mode 100644 index 0000000000000000000000000000000000000000..7902b936e6a3e8d0a85dcc6d0d14ff0fcc8b59a7 --- /dev/null +++ 
b/databus-poms/dbpedia/mappings/geo-coordinates-mappingbased/geo-coordinates-mappingbased.md @@ -0,0 +1,23 @@ +# Geo-coordinates extracted with mappings +Contains geographic coordinates from the Wikipedia Infoboxes refined by the mapping-based extraction. + +The dataset contains all triples extracted with the help of the [Geocoordinates Mappings](http://mappings.dbpedia.org/index.php/Template:GeocoordinatesMapping). Whereas [generic geo coordinates datasets](https://databus.dbpedia.org/dbpedia/generic/geo-coordinates) spot any geocoordinate in an infobox without contextualizing it, the mappings allow to describe which kind of location the coordinates are describing. This can be the coordinates of the actual location of the resource itself + + + <http://dbpedia.org/resource/Atlantic_Ocean> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2003/01/geo/wgs84_pos#SpatialThing> . + <http://dbpedia.org/resource/Atlantic_Ocean> <http://www.w3.org/2003/01/geo/wgs84_pos#lat> "0.0"^^<http://www.w3.org/2001/XMLSchema#float> . + <http://dbpedia.org/resource/Atlantic_Ocean> <http://www.w3.org/2003/01/geo/wgs84_pos#long> "-25.0"^^<http://www.w3.org/2001/XMLSchema#float> . + <http://dbpedia.org/resource/Atlantic_Ocean> <http://www.georss.org/georss/point> "0.0 -25.0" . + + +but also coordinates of locations associated with the resource (e.g. the resting place of Alfred Nobel) + + + <http://dbpedia.org/resource/Alfred_Nobel> <http://dbpedia.org/ontology/restingPlacePosition> <http://dbpedia.org/resource/Alfred_Nobel__restingPlacePosition__1> . + <http://dbpedia.org/resource/Alfred_Nobel__restingPlacePosition__1> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2003/01/geo/wgs84_pos#SpatialThing> . + <http://dbpedia.org/resource/Alfred_Nobel__restingPlacePosition__1> <http://www.w3.org/2003/01/geo/wgs84_pos#lat> "59.356811111111114"^^<http://www.w3.org/2001/XMLSchema#float> . 
+ <http://dbpedia.org/resource/Alfred_Nobel__restingPlacePosition__1> <http://www.w3.org/2003/01/geo/wgs84_pos#long> "18.01928611111111"^^<http://www.w3.org/2001/XMLSchema#float> . + <http://dbpedia.org/resource/Alfred_Nobel__restingPlacePosition__1> <http://www.georss.org/georss/point> "59.356811111111114 18.01928611111111" . + + +You can have a look at the mappings used for [Alfred Nobel (Person)](http://mappings.dbpedia.org/index.php/Mapping_en:Infobox_person) and [Atlantic Ocean (body of water)](http://mappings.dbpedia.org/index.php/Mapping_en:Infobox_body_of_water) . diff --git a/databus-poms/dbpedia/mappings/geo-coordinates-mappingbased/pom.xml b/databus-poms/dbpedia/mappings/geo-coordinates-mappingbased/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..9d465209fc465c0d5bb7164e5e06dcbca90111dc --- /dev/null +++ b/databus-poms/dbpedia/mappings/geo-coordinates-mappingbased/pom.xml @@ -0,0 +1,19 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>group-metadata</artifactId> + <groupId>mappings</groupId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <artifactId>geo-coordinates-mappingbased</artifactId> + <groupId>mappings</groupId> + <packaging>jar</packaging> + + <properties> + <databus.codeReference>https://github.com/dbpedia/extraction-framework/blob/master/core/src/main/scala/org/dbpedia/extraction/mappings/GeoCoordinatesMapping.scala</databus.codeReference> + </properties> + +</project> diff --git a/databus-poms/dbpedia/mappings/instance-types/instance-types.md b/databus-poms/dbpedia/mappings/instance-types/instance-types.md new file mode 100644 index 0000000000000000000000000000000000000000..f30018009e4a5a6a9e74982807f5bb6b89418c3a --- /dev/null +++ 
b/databus-poms/dbpedia/mappings/instance-types/instance-types.md @@ -0,0 +1,8 @@ +# DBpedia Ontology instance types +Classification of instances with the DBpedia Ontology. Contains triples of the form `<$resource> rdf:type <$dbpedia_ontology_class>` generated by the mappings extraction. + +## Most specific vs. transitive files +The dataset contains a file with just the types as classified with the mapping extractor. These types are normally the most specific class, they originate directly from the `map to class` template on [mappings.dbpedia.org](http://mappings.dbpedia.org). In addition, a file with the `_transitive` tag is generated by the [DIEF Transitive Closure Class](https://github.com/dbpedia/extraction-framework/blob/master/core/src/main/scala/org/dbpedia/extraction/util/TransitiveClosure.scala) on release, containing the pre-calculated forward-inferences (transitive closure), i.e. with all superclasses. If your application/store does not support reasoning, additionally use the `_transitive` file. + +## Entities with double underscore +In addition to the `map to class` types, entities ending with `__[number]` are not duplicates but conform to the knowledge graph modelling choices and are generated by the [DIEF Node class](https://github.com/dbpedia/extraction-framework/blob/master/core/src/main/scala/org/dbpedia/extraction/wikiparser/Node.scala). Most of them are instances of `dbo:CareerStation`(see [examples](https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=select+%3Fs+%7B+%3Fs+a+dbo%3ACareerStation+%7D+limit+100&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+)), which represent time periods, therefore provide statements that hold true only within a determined time span. 
In the case of [Ada Lovelace](http://dbpedia.org/resource/Ada_Lovelace), potential other metadata can be added to the person function "Countess of Lovelace" instead of adding predicates directly to the entity dbr:Ada_Lovelace. (see [Duplicate resource names with underscore](https://forum.dbpedia.org/t/duplicate-resource-names-with-underscore/104)) diff --git a/databus-poms/dbpedia/mappings/instance-types/pom.xml b/databus-poms/dbpedia/mappings/instance-types/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..d23f3fc1356187da227afaf5d0a3508eaabfa463 --- /dev/null +++ b/databus-poms/dbpedia/mappings/instance-types/pom.xml @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>group-metadata</artifactId> + <groupId>mappings</groupId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <artifactId>instance-types</artifactId> + <groupId>mappings</groupId> + <packaging>jar</packaging> + +</project> diff --git a/databus-poms/dbpedia/mappings/mappingbased-literals/mappingbased-literals.md b/databus-poms/dbpedia/mappings/mappingbased-literals/mappingbased-literals.md new file mode 100644 index 0000000000000000000000000000000000000000..cb84a20901ecfc1c45ec71b592e0c947c4f15be4 --- /dev/null +++ b/databus-poms/dbpedia/mappings/mappingbased-literals/mappingbased-literals.md @@ -0,0 +1,9 @@ +# Literals extracted with mappings +High-quality literal (datatyped) properties (numeric data and text) refined by the mappings extraction. + +Contains strings and typed literal values (dates, numbers, currency, etc.) extracted from infoboxes refined by community-written mappings that help the parser to unify extracted values. 
The extracted triples are based on the parts of the mappings making use of [datatype properties](http://mappings.dbpedia.org/index.php/Template:DatatypeProperty) from the DBpedia ontology. Therefore values across different languages are comparable since they use the same property identifiers and units of measurement are normalized to standard units (e.g. inches to meters). + +The `rdfs:range` of the datatype property defines the datatype expected as outcome of the mapped value. As a consequence, the appropriate [data parser](https://github.com/dbpedia/extraction-framework/tree/master/core/src/main/scala/org/dbpedia/extraction/dataparser) will be picked to extract the data from the infobox entry text and try its best to extract this type of value (e.g. a date for [dbo:birthDate](http://mappings.dbpedia.org/index.php/OntologyProperty:BirthDate)) or nothing. For datatype properties with (convertible) units of measurement the [unit parser](https://github.com/dbpedia/extraction-framework/blob/master/core/src/main/scala/org/dbpedia/extraction/dataparser/UnitValueParser.scala) will check if one of the known units for the unit dimension (see e.g. [units for Length](https://github.com/dbpedia/extraction-framework/blob/68e95bcab1d859d47690cc0c1536eaace7b01d3b/core/src/main/scala/org/dbpedia/extraction/ontology/OntologyDatatypes.scala#L371)) matches and will convert it into the standard/base unit for the dimension. Moreover, the parser will try to interpret quantity modifiers (e.g. Mio, millions); however, they need to be defined and maintained by the community for every language individually in a [config file](https://github.com/dbpedia/extraction-framework/blob/master/core/src/main/scala/org/dbpedia/extraction/config/dataparser/ParserUtilsConfig.scala). 
Available unit dimensions and their standard units, units and their abbreviations can be found in [code](https://github.com/dbpedia/extraction-framework/blob/master/core/src/main/scala/org/dbpedia/extraction/ontology/OntologyDatatypes.scala). + + + diff --git a/databus-poms/dbpedia/mappings/mappingbased-literals/pom.xml b/databus-poms/dbpedia/mappings/mappingbased-literals/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..92c52542f076173836a2fb6b0e92a511f1f8b5ff --- /dev/null +++ b/databus-poms/dbpedia/mappings/mappingbased-literals/pom.xml @@ -0,0 +1,13 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>group-metadata</artifactId> + <groupId>mappings</groupId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <artifactId>mappingbased-literals</artifactId> + <groupId>mappings</groupId> + <packaging>jar</packaging> +</project> diff --git a/databus-poms/dbpedia/mappings/mappingbased-objects-uncleaned/mappingbased-objects-uncleaned.md b/databus-poms/dbpedia/mappings/mappingbased-objects-uncleaned/mappingbased-objects-uncleaned.md new file mode 100644 index 0000000000000000000000000000000000000000..0f4c6b15a0fd0d4232eb887f72a6897cd9a303e2 --- /dev/null +++ b/databus-poms/dbpedia/mappings/mappingbased-objects-uncleaned/mappingbased-objects-uncleaned.md @@ -0,0 +1,11 @@ +# (Uncleaned) Object properties extracted with mappings +Uncleaned high-quality statements with IRI object values extracted by the mappings extraction from Wikipedia Infoboxes. 
+ +Offers complementary statements (Entity-to-Entity relations) from Wikipedia Infoboxes to [mappingbased-literals](https://databus.dbpedia.org/dbpedia/${project.groupId}/mappingbased-literals/${project.version}) + +NOTE: There also is a [cleaned version](https://databus.dbpedia.org/dbpedia/${project.groupId}/mappingbased-objects/${project.version}) of this dataset available: + + +Uncleaned means that two post-processing steps are *not* performed on this dataset: +* type consistency check, i.e. type of object matches the range of property +* redirecting of objects, i.e. http://dbpedia.org/resource/Billl_Clinton to Bill Clinton diff --git a/databus-poms/dbpedia/mappings/mappingbased-objects-uncleaned/pom.xml b/databus-poms/dbpedia/mappings/mappingbased-objects-uncleaned/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..ec7b04afc4bf2fab8c0c76b45e24155d416e0b0f --- /dev/null +++ b/databus-poms/dbpedia/mappings/mappingbased-objects-uncleaned/pom.xml @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>group-metadata</artifactId> + <groupId>mappings</groupId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <artifactId>mappingbased-objects-uncleaned</artifactId> + <groupId>mappings</groupId> + <packaging>jar</packaging> + +</project> diff --git a/databus-poms/dbpedia/mappings/mappingbased-objects/mappingbased-objects.md b/databus-poms/dbpedia/mappings/mappingbased-objects/mappingbased-objects.md new file mode 100644 index 0000000000000000000000000000000000000000..649119987a8a85b2d1e8e1ea25a12994337a3aa8 --- /dev/null +++ b/databus-poms/dbpedia/mappings/mappingbased-objects/mappingbased-objects.md @@ -0,0 +1,7 @@ +# Cleaned object properties extracted with mappings +Cleaned 
version of high quality statements with IRI object values extracted by the mappings extraction from Wikipedia Infoboxes. + +Statements are based on input from [mappingbased-objects-uncleaned](https://databus.dbpedia.org/dbpedia/${project.groupId}/mappingbased-objects-uncleaned/${project.version}) after applying **post processing steps**: + + 1. Canonicalization of all object values replacing them by their (transitive) redirects, i.e. `http://dbpedia.org/resource/Barack_Obama_Jr` will be replaced by `http://dbpedia.org/resource/Barack_Obama` . The `_transitive` file of the corresponding language chapter from [redirects dataset](https://databus.dbpedia.org/dbpedia/generic/redirects/${project.version}) will be used to resolve the transitive redirects. See [code](https://github.com/dbpedia/extraction-framework/blob/master/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala) for more details. + 2. Type consistency filtering: extracted `rdf:type` statements from [instance-types](https://databus.dbpedia.org/dbpedia/${project.groupId}/instance-types/${project.version}) are used to check domain and range according to the definition of the properties in the [DBpedia ontology](https://databus.dbpedia.org/dbpedia/ontology/dbo-snapshots). Statements with predicate *p* for which the subject resource is from a different type than specified in `rdfs:domain` of *p* are passed to `_disjointDomain` files, whereas statements with an object resource disjoint from `rdfs:range` will be passed `_disjointRange` files. Statements where the types match or are subtypes of the expected ones are passed to the regular dataset files (without content variant). See [code](https://github.com/dbpedia/extraction-framework/blob/master/scripts/src/main/scala/org/dbpedia/extraction/scripts/TypeConsistencyCheck.scala) for more details. We keep the `disjoint*` files since they can contain also false positives due to incomplete type information (e.g. 
no infobox exists for a specific resource or infobox class mapping is incomplete). If you union all 3 files, the result is the same as applying only step 1. diff --git a/databus-poms/dbpedia/mappings/mappingbased-objects/pom.xml b/databus-poms/dbpedia/mappings/mappingbased-objects/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..b19c38f7ac868df35a58148f58ca26b523dbe1b5 --- /dev/null +++ b/databus-poms/dbpedia/mappings/mappingbased-objects/pom.xml @@ -0,0 +1,13 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>group-metadata</artifactId> + <groupId>mappings</groupId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <artifactId>mappingbased-objects</artifactId> + <groupId>mappings</groupId> + <packaging>jar</packaging> + +</project> diff --git a/databus-poms/dbpedia/mappings/pom.xml b/databus-poms/dbpedia/mappings/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..867221b5bfab918cb8e573f79d208fd3d8e85263 --- /dev/null +++ b/databus-poms/dbpedia/mappings/pom.xml @@ -0,0 +1,152 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.dbpedia.databus</groupId> + <artifactId>super-pom</artifactId> + <version>1.3-SNAPSHOT</version> + </parent> + <groupId>mappings</groupId> + <artifactId>group-metadata</artifactId> + <version>2020.02.01</version> + <packaging>pom</packaging> + <modules> + <module>mappingbased-literals</module> + <module>specific-mappingbased-properties</module> + 
<module>geo-coordinates-mappingbased</module> + <module>instance-types</module> + <module>mappingbased-objects</module> + <module>mappingbased-objects-uncleaned</module> + </modules> + <properties> + <databus.documentation>## Attribution fulfilled by +* (when deriving another dataset and releasing to the Databus) adding the Databus link to the provenance https://databus.dbpedia.org/dbpedia/${project.groupId}/${project.artifactId}/${project.artifactId}/${project.version} +* on your website: + * include the DBpedia logo and mention the usage of DBpedia with this link: https://databus.dbpedia.org/dbpedia + * include backlinks from your pages to the individual entities, e.g. http://dbpedia.org/resource/Berlin +* in academic publications cite: DBpedia - A Large-scale, Multilingual Knowledge Base Extracted from Wikipedia, J. Lehmann, R. Isele, M. Jakob, A. Jentzsch, D. Kontokostas, P. Mendes, S. Hellmann, M. Morsey, P. van Kleef, S. Auer, and C. Bizer. Semantic Web Journal 6 (2): 167--195 (2015) + +## How to contribute +DBpedia is a community project, help us with: +* editing the mappings at http://mappings.dbpedia.org +* improve this documentation at https://github.com/dbpedia/databus-maven-plugin/tree/master/dbpedia/mappings/${project.artifactId}/${project.artifactId}.md +* help with the software relevant for extraction: +** https://github.com/dbpedia/extraction-framework/tree/master/core/src/main/scala/org/dbpedia/extraction/mappings +** in particular https://github.com/dbpedia/extraction-framework/blob/master/core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxMappingsExtractor.scala + +## Debug +Parselogs are currently kept here: http://downloads.dbpedia.org/temporary/parselogs/ + +## Origin +This dataset was extracted using the wikipedia-dumps available on https://dumps.wikimedia.org/ +using the DBpedia Extraction-Framework available at https://github.com/dbpedia/extraction-framework +For more technical information on how these datasets were generated, 
please visit http://dev.dbpedia.org + +## Changelog +* 2019.10.16 + * fixed encoding issue + * fixed https://github.com/dbpedia/extraction-framework/issues/595 where a nullpointer caused some instance extractions to crash +* 2018.09.12 + * were created as new modular releases, some issues remain + * we used rapper 2.0.14 to parse and `LC_ALL=C sort` to sort and ascii2uni -a U to unescape unicode characters + * parsing removed 250k triples total, debugging pending + * object-uncleaned was not transformed into objects-cleaned and is missing + * link to Wikimedia dump version is missing +* 2016.10.01 + * was taken from the previous BIG DBpedia releases under http://downloads.dbpedia.org/2016-10/ and included for completeness</databus.documentation> + <databus.license>http://purl.oclc.org/NET/rdflicense/cc-by3.0</databus.license> + <databus.codeReference>https://github.com/dbpedia/extraction-framework/blob/master/core/src/main/scala/org/dbpedia/extraction/mappings/MappingExtractor.scala</databus.codeReference> + <databus.issueTracker>https://github.com/dbpedia/extraction-framework/issues</databus.issueTracker> + <databus.documentationLocation>https://github.com/dbpedia/databus-maven-plugin/blob/master/dbpedia/${project.groupId}/${project.artifactId}</databus.documentationLocation> + <databus.downloadUrlPath>https://downloads.dbpedia.org/repo/lts/${project.groupId}/${project.artifactId}/${project.version}/</databus.downloadUrlPath> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <databus.packageDirectory>/media/bigone/25TB/www/downloads.dbpedia.org/repo/lts/${project.groupId}/${project.artifactId}</databus.packageDirectory> + <databus.tryVersionAsIssuedDate>true</databus.tryVersionAsIssuedDate> + <databus.publisher>https://webid.dbpedia.org/webid.ttl#this</databus.publisher> + <databus.feedbackChannel>https://forum.dbpedia.org/c/data/databus/14</databus.feedbackChannel> + <!-- used for derive plugin --> + 
<databus.deriveversion>2019.09.01</databus.deriveversion> + </properties> + <repositories> + <repository> + <id>archiva.internal</id> + <name>Internal Release Repository</name> + <url>http://databus.dbpedia.org:8081/repository/internal</url> + </repository> + <repository> + <snapshots> + <updatePolicy>always</updatePolicy> + </snapshots> + <id>archiva.snapshots</id> + <name>Internal Snapshot Repository</name> + <url>http://databus.dbpedia.org:8081/repository/snapshots</url> + </repository> + </repositories> + <build> + <plugins> + <plugin> + <groupId>org.dbpedia.databus</groupId> + <artifactId>databus-derive-maven-plugin</artifactId> + <version>1.0-SNAPSHOT</version> + <executions> + <execution> + <id>DeriveFromMarvin</id> + <goals> + <goal>clone</goal> + </goals> + </execution> + </executions> + <configuration> + <skipParsing>false</skipParsing> + <skipCloning>false</skipCloning> + <versions> + <version>https://databus.dbpedia.org/marvin/mappings/geo-coordinates-mappingbased/${databus.deriveversion}</version> + <version>https://databus.dbpedia.org/marvin/mappings/instance-types/${databus.deriveversion}</version> + <version>https://databus.dbpedia.org/marvin/mappings/mappingbased-literals/${databus.deriveversion}</version> + <version>https://databus.dbpedia.org/marvin/mappings/mappingbased-objects-uncleaned/${databus.deriveversion}</version> + <version>https://databus.dbpedia.org/marvin/mappings/mappingbased-objects/${databus.deriveversion}</version> + <version>https://databus.dbpedia.org/marvin/mappings/specific-mappingbased-properties/${databus.deriveversion}</version> + </versions> + </configuration> + </plugin> + </plugins> + <extensions> + <extension> + <groupId>org.apache.maven.wagon</groupId> + <artifactId>wagon-webdav-jackrabbit</artifactId> + <version>3.0.0</version> + </extension> + </extensions> + </build> + + <profiles> + <profile> + <id>webdav</id> + <build> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + 
<artifactId>wagon-maven-plugin</artifactId> + <version>2.0.0</version> + <executions> + <execution> + <id>upload-databus</id> + <phase>install</phase> + <goals> + <goal>upload</goal> + </goals> + <configuration> + <fromDir>${project.build.directory}/databus/repo/${project.groupId}/${project.artifactId}</fromDir> + <url>dav:https://downloads.dbpedia.org/repo/</url> + <toDir>dbpedia/${project.groupId}/${project.artifactId}</toDir> + <serverId>downloads-dbpedia-org</serverId> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> + </profile> + </profiles> +</project> diff --git a/databus-poms/dbpedia/mappings/specific-mappingbased-properties/pom.xml b/databus-poms/dbpedia/mappings/specific-mappingbased-properties/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..0bfa20ca98eda4e00f0a9e087e59ab1c427efd78 --- /dev/null +++ b/databus-poms/dbpedia/mappings/specific-mappingbased-properties/pom.xml @@ -0,0 +1,13 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>group-metadata</artifactId> + <groupId>mappings</groupId> + <version>2020.02.01</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <artifactId>specific-mappingbased-properties</artifactId> + <groupId>mappings</groupId> + <packaging>jar</packaging> + +</project> diff --git a/databus-poms/dbpedia/mappings/specific-mappingbased-properties/specific-mappingbased-properties.md b/databus-poms/dbpedia/mappings/specific-mappingbased-properties/specific-mappingbased-properties.md new file mode 100644 index 0000000000000000000000000000000000000000..12a84ef2f7b460a9040ee34323a241ecfd016bcc --- /dev/null +++ b/databus-poms/dbpedia/mappings/specific-mappingbased-properties/specific-mappingbased-properties.md @@ -0,0 +1,5 @@ +# Numeric Literals 
converted to designated units with class-specific property mappings +Infobox numerical data from the mappings extraction using units of measurement more convenient for the resource class/type. + +The triples in [mappingbased-literals](https://databus.dbpedia.org/dbpedia/${project.groupId}/mappingbased-literals) use normalized values according to the base unit for the property (see [docu](https://databus.dbpedia.org/dbpedia/${project.groupId}/mappingbased-literals/${project.version}) for more details). However, this dataset contains triples where the values are converted to a specific unit of measurement more convenient for the resource class (e.g. square kilometres instead of square metres for the area of a city or runtime of a movie in minutes instead of seconds). To distinguish between values from [mappingbased-literals](https://databus.dbpedia.org/dbpedia/${project.groupId}/mappingbased-literals) which are normalized to base units, specific properties use the namespace of the form `http://dbpedia.org/ontology/$className/$propertyName`. The target conversion unit can be defined via [SpecificProperty mapping](http://mappings.dbpedia.org/index.php/Template:SpecificProperty) in the corresponding class (see e.g. [Work mapping](http://mappings.dbpedia.org/index.php/OntologyClass:Work)) and can also be retrieved via `rdfs:range` of the specific property (see e.g. [runtime specific property](http://dbpedia.org/ontology/Work/runtime)) from the [DBpedia Ontology](https://databus.dbpedia.org/dbpedia/ontology/dbo-snapshots). 
Moreover, the datatype IRI of the typed literal also denotes the converted unit via [DBpedia Datatypes](http://mappings.dbpedia.org/index.php?title=Special:AllPages&namespace=206), + diff --git a/databus-release.sh b/databus-release.sh index d0447ad3fa51c50694a2f9dea005d75b7149546a..bb50e98ed5a22516f4fc93d3caa04c7fb6e2ece7 100755 --- a/databus-release.sh +++ b/databus-release.sh @@ -15,6 +15,19 @@ then exit 1 fi +################## +# Server DownloadURL +################## +DOMAIN=$GROUP + +if [ "$GROUP" = "generic" ] || [ "$GROUP" = "mappings" ] +then + DOMAIN="mappings" +elif [ "$GROUP" = "text" ] +then + DOMAIN="generic" +fi + ################## # Setup and clone pom files @@ -22,33 +35,44 @@ fi #/data/extraction/wikidumps/enwiki/20191001 -git clone "https://github.com/dbpedia/databus-maven-plugin.git" $DATABUSDIR &>/dev/null -cd $DATABUSDIR -git pull +# git clone "https://github.com/dbpedia/databus-maven-plugin.git" $DATABUSDIR &>/dev/null +# cd $DATABUSDIR +# git pull -# copy +# creates links in databus dir # iterate all .ttl.bz2 files # uncomment for testing for path in $(find "$EXTRACTIONBASEDIR" -name "*.ttl.bz2" | sort); do - mapAndCopy $path + mapAndLink $path done + # deploy cd $DATABUSDIR/dbpedia/$GROUP; mvn versions:set -DnewVersion=$(ls * | grep '^[0-9]\{4\}.[0-9]\{2\}.[0-9]\{2\}$' | sort -u | tail -1); # get git commit link -GITSHORTHASH=${git log | head -1 | cut -f2 -d ' ' | grep -o "^......." } -GITHUBLINK=${git log | head -1 | cut -f2 -d ' ' | sed 's|^|https://github.com/dbpedia/extraction-framework/commit/|'} - +GITHUBLINK="$(diefCommitLink)" PUBLISHER="https://vehnem.github.io/webid.ttl#this"; -# TODO marvin: shouldn't this be the web dir directly? 
-PACKAGEDIR="/data/extraction/release/\${project.groupId}/\${project.artifactId}"; -DOWNLOADURL="http://dbpedia-$GROUP.tib.eu/release/\${project.groupId}/\${project.artifactId}/\${project.version}/"; +PACKAGEDIR="/var/www/dbpedia-$DOMAIN.tib.eu/databus-repo/marvin/\${project.groupId}/\${project.artifactId}"; +DOWNLOADURL="http://dbpedia-$DOMAIN.tib.eu/databus-repo/marvin/\${project.groupId}/\${project.artifactId}/\${project.version}/"; LABELPREFIX="(pre-release) "; -# todo replace with markdown or html when supported by upload client -COMMENTPREFIX="(MARVIN is the DBpedia bot for monthly raw releases (unparsed, unsorted) for debugging the DIEF software. After its releases, data is cleaned and persisted under the dbpedia account. Commit: $GITHUBLINK) " ; +COMMENTPREFIX="(MARVIN is the DBpedia bot for monthly raw releases (unparsed, unsorted) for debugging the DIEF software, commit: $GITHUBLINK . After its releases, data is cleaned and persisted under the DBpedia account. ) " ; -mvn clean deploy -Ddatabus.publisher="$PUBLISHER" -Ddatabus.packageDirectory="$PACKAGEDIR" -Ddatabus.downloadUrlPath="$DOWNLOADURL" -Ddatabus.labelPrefix="$LABELPREFIX" -Ddatabus.commentPrefix="$COMMENTPREFIX"; +echo "VARS: +GITHUBLINK: $GITHUBLINK +PUBLISHER: $PUBLISHER +PACKAGEDIR: $PACKAGEDIR +DOWNLOADURL: $DOWNLOADURL +LABELPREFIX: $LABELPREFIX +COMMENTPREFIX:$COMMENTPREFIX +" +# TODO workaround for the read time out exception +for i in `ls` ; +do + cd $i ; + mvn clean deploy -Ddatabus.publisher="$PUBLISHER" -Ddatabus.packageDirectory="$PACKAGEDIR" -Ddatabus.downloadUrlPath="$DOWNLOADURL" -Ddatabus.labelPrefix="$LABELPREFIX" -Ddatabus.commentPrefix="$COMMENTPREFIX"; + cd .. 
+done diff --git a/extractionConfiguration/download.generic.properties b/extractionConfiguration/download.generic.properties index 8d0fb170951e54f5a666de65c389e09d9d4a4703..27fdec695ae1b2c411296527ad92488c0cb5e957 100644 --- a/extractionConfiguration/download.generic.properties +++ b/extractionConfiguration/download.generic.properties @@ -18,4 +18,4 @@ retry-max=5 retry-millis=10000 #for specific dump dates (e.g. 20170101) if empty: the most recent dump-date is used -dump-date= +dump-date=20200201 diff --git a/extractionConfiguration/download.mappings.properties b/extractionConfiguration/download.mappings.properties index dff4b47ae850946a5378e47edd290b418c3fb204..ed4240da0c8e0c335a8c66c58ede7218e5a69543 100644 --- a/extractionConfiguration/download.mappings.properties +++ b/extractionConfiguration/download.mappings.properties @@ -16,4 +16,4 @@ retry-max=5 retry-millis=10000 #for specific dump dates (e.g. 20170101) if empty: the most recent dump-date is used -dump-date= +dump-date=20200201 diff --git a/extractionConfiguration/extraction.generic.en.properties b/extractionConfiguration/extraction.generic.en.properties index e3b7cac4fdbb501ce50f6e20fc97eafadfff6f4a..2070b68d08920126d9ff3df263bb61e1feb9d6de 100644 --- a/extractionConfiguration/extraction.generic.en.properties +++ b/extractionConfiguration/extraction.generic.en.properties @@ -23,7 +23,7 @@ languages=en extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\ .ExternalLinksExtractor,.GeoExtractor,.InfoboxExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\ .PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\ -.WikiPageLengthExtractor,.WikiPageOutDegreeExtractor +.WikiPageLengthExtractor,.WikiPageOutDegreeExtractor,.ImageExtractorNew 
extractors.en=.CitationExtractor,.DisambiguationExtractor,.HomepageExtractor,.PersondataExtractor,.PndExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor diff --git a/extractionConfiguration/extraction.mappings.properties b/extractionConfiguration/extraction.mappings.properties index 5983b57fe2b7857a1fe9df7db97ab466feb23a80..b30a0c336b645829543fe7c2034b4412d40fa0cf 100644 --- a/extractionConfiguration/extraction.mappings.properties +++ b/extractionConfiguration/extraction.mappings.properties @@ -16,7 +16,7 @@ languages=@mappings # extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings" -extractors=.MappingExtractor +extractors=.MappingExtractor,.RedirectExtractor #extractors.ar=.MappingExtractor,.TopicalConceptsExtractor # diff --git a/functions.sh b/functions.sh index 2dcd03548de0d2d1deba46c3b375cde6442d7be1..1a77f9ca4656f52c6f4fe3bf394484d9963b0170 100755 --- a/functions.sh +++ b/functions.sh @@ -18,7 +18,7 @@ CONFIGDIR="$ROOT/extractionConfiguration" DIEFDIR="$ROOT/marvin-extraction/extraction-framework" LOGDIR="$ROOT/marvin-extraction/logs/$(date +%Y-%m-%d)" && mkdir -p $LOGDIR EXTRACTIONBASEDIR="$ROOT/marvin-extraction/wikidumps" && mkdir -p $EXTRACTIONBASEDIR -DATABUSDIR="$ROOT/marvin-extraction/databus-maven-plugin" +DATABUSDIR="$ROOT/databus-poms" ############## # functions @@ -148,7 +148,9 @@ mapNamesToDatabus() { esac } -mapAndCopy() { +# creates links in databus dir +mapAndLink() { + # each individual file path=$1 # split filename @@ -181,12 +183,12 @@ mapAndCopy() { if [ -d "$DATABUSDIR/dbpedia/$GROUP/$artifact" ]; then mkdir -p $targetFolder else - echo "\"$artifact\" (artifact not found) $path" >&2; + echo "[DEBUG]\"$artifact\" (artifact not found, might not be in group $GROUP) $path" >&2; fi # TODO proper handling of "_redirected" # TODO see above, redirected are moved to logdir and overwrite the unredirected - # concerns onlyy generic: + # concerns only generic: # < 
enwiki/20191001/enwiki-20191001-disambiguations_redirected.ttl.bz2 # < enwiki/20191001/enwiki-20191001-infobox-properties_redirected.ttl.bz2 # < enwiki/20191001/enwiki-20191001-page-links_redirected.ttl.bz2 @@ -195,13 +197,14 @@ mapAndCopy() { # copy # TODO enable after testing - cp -n "$path" "$targetFolder/$targetFile" + #cp -n "$path" "$targetFolder/$targetFile" + ln -s "$path" "$targetFolder/$targetFile" echo -e "< $path\n> $targetFolder/$targetFile\n----------------------" } +diefCommitLink() { - - - - + cd $DIEFDIR + echo "https://github.com/dbpedia/extraction-framework/commit/$(git rev-parse @)" +}