Commit 4bc115fc authored by Janos Borst's avatar Janos Borst

Merge branch '10-documentation' into 'dev'

Resolve "Documentation"

See merge request !21
parents ee44e3c6 ed7405c0
Pipeline #45361 failed in 16 minutes and 8 seconds
......@@ -83,9 +83,21 @@ class MultiLabelDataset(Dataset):
self.target_dtype = target_dtype
def __len__(self):
"""
Returns the length of the dataset. The length is determined by the size
of the list containing the input text.
:return: Length of the dataset
"""
return len(self.x)
def __getitem__(self, idx):
"""
Retrieves a single entry from the dataset.
:param idx: Index of the entry
:return: Dictionary containing the text and labels of the entry
"""
if self.one_hot:
labels = [self.classes[tag] for tag in self.y[idx]]
labels = torch.nn.functional.one_hot(torch.LongTensor(labels), len(self.classes)).sum(0)
......@@ -125,6 +137,12 @@ class MultiLabelDataset(Dataset):
return {"x": self.x, "y": self.y, "classes": list(self.classes.keys())}
def __add__(self, o):
"""
Merges dataset with another dataset.
:param o: Another dataset
:return: MultiLabelDataset containing x, y and classes of both datasets
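Example (an illustrative sketch; the texts and labels below are made up):
```
d1 = mlmc.data.MultiLabelDataset(x=["a text about science"], y=[["science"]], classes={"science": 0})
d2 = mlmc.data.MultiLabelDataset(x=["a text about politics"], y=[["politics"]], classes={"politics": 0})
merged = d1 + d2
merged.classes   # {'politics': 0, 'science': 1} -- the union of both class sets, sorted and re-indexed
```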
"""
new_classes = list(set(list(self.classes.keys()) + list(o.classes.keys())))
new_classes.sort()
new_classes = dict(zip(new_classes, range(len(new_classes))))
......@@ -236,16 +254,77 @@ class MultiLabelDataset(Dataset):
class SingleLabelDataset(MultiLabelDataset):
def __init__(self, *args, **kwargs):
"""
Class constructor. Creates an instance of SingleLabelDataset.
:param classes: A class mapping from label strings to successive indices
:param x: A list of the input text
:param y: A list of corresponding labels (each entry must contain exactly one label)
:param target_dtype: The final cast applied to the label output. Some of torch's loss functions expect other data types, so this
argument defines a function that is applied to the final label tensors (default: torch._cast_Float)
:param kwargs: Any additional information that is given by named keywords will be saved as metadata
Example:
```
x = ["This is a text about science",
"This is another text about philosophy"]
y = [['science'],
['philosophy']]
classes = {
"science": 0,
"philosophy": 1,
}
dataset = mlmc.data.SingleLabelDataset(x=x, y=y, classes=classes)
dataset[0]
```
"""
super(SingleLabelDataset, self).__init__(*args, **kwargs)
assert all(
[len(x) == 1 for x in self.y]), "This is not a single label dataset. Some entries contain more than one label."
def __getitem__(self, idx):
"""
Retrieves a single entry from the dataset.
:param idx: Index of the entry
:return: Dictionary containing the text and labels of the entry
"""
return {'text': self.x[idx], 'labels': torch.tensor(self.classes[self.y[idx][0]])}
class MultiOutputMultiLabelDataset(Dataset):
def __init__(self, classes, x, y, target_dtype=torch._cast_Float, **kwargs):
"""
Class constructor. Creates an instance of MultiOutputMultiLabelDataset.
:param classes: A class mapping from label strings to successive indices
:param x: A list of the input text
:param y: A list of corresponding label sets
:param target_dtype: The final cast applied to the label output. Some of torch's loss functions expect other data types, so this
argument defines a function that is applied to the final label tensors (default: torch._cast_Float)
:param kwargs: Any additional information that is given by named keywords will be saved as metadata
Example:
```
x = ["Text sample 1", "Text sample 2"]
y = [[["label0", "label1"], ["label2"]],
[["label1"], ["label1", "label2"]]]
classes = [{
"label0": 0,
"label1": 1
}, {
"label1": 0,
"label2": 1
}]
dataset = mlmc.data.MultiOutputMultiLabelDataset(x=x, y=y, classes=classes)
dataset[0]
```
"""
super(MultiOutputMultiLabelDataset, self).__init__(**kwargs)
if isinstance(classes, dict):
self.classes = [classes.copy() for _ in range(len(y[0]))]
......@@ -263,9 +342,21 @@ class MultiOutputMultiLabelDataset(Dataset):
self.y = y
def __len__(self):
"""
Returns the length of the dataset. The length is determined by the size
of the list containing the input text.
:return: Length of the dataset
"""
return len(self.x)
def __getitem__(self, item):
"""
Retrieves a single entry from the dataset.
:param item: Index of the entry
:return: Dictionary containing the text and labels of the entry
"""
result = {"text": self.x[item]}
label_one_hot = [
torch.stack([torch.nn.functional.one_hot(torch.tensor(x[label]), len(x)) for label in labelset], 0) for
......@@ -273,9 +364,33 @@ class MultiOutputMultiLabelDataset(Dataset):
result.update({f"labels_{i}": v.sum(0) for i, v in enumerate(label_one_hot)})
return result
class MultiOutputSingleLabelDataset(Dataset):
def __init__(self, classes, x, y=None, **kwargs):
"""
Class constructor. Creates an instance of MultiOutputSingleLabelDataset.
:param classes: A class mapping from label strings to successive indices
:param x: A list of the input text
:param y: A list of corresponding label sets
:param kwargs: Any additional information that is given by named keywords will be saved as metadata
Example:
```
x = ["Text sample 1", "Text sample 2"]
y = [[["label0"], ["label2"]],
[["label1"], ["label2"]]]
classes = [{
"label0": 0,
"label1": 1
}, {
"label2": 0
}]
dataset = mlmc.data.MultiOutputSingleLabelDataset(x=x, y=y, classes=classes)
dataset[0]
```
"""
super(MultiOutputSingleLabelDataset, self).__init__(**kwargs)
if y is not None:
if isinstance(classes, dict):
......@@ -293,15 +408,36 @@ class MultiOutputSingleLabelDataset(Dataset):
self.y = y
def __getitem__(self, item):
"""
Retrieves a single entry from the dataset.
:param item: Index of the entry
:return: Dictionary containing the text and labels of the entry
"""
if self.y is None:
return {'text': self.x[item]}
else:
return {'text': self.x[item], 'labels': torch.tensor([d[y[0]] for d, y in zip(self.classes, self.y[item])])}
def __len__(self):
"""
Returns the length of the dataset. The length is determined by the size
of the list containing the input text.
:return: Length of the dataset
"""
return len(self.x)
def reduce(self, subset):
"""
Reduces the dataset to a subset of the classes.
The resulting dataset only contains instances with at least one label that appears in the subset argument.
The subset also provides the new mapping from label names to indices.
All labels not in the subset are removed; instances left with an empty label set are removed as well.
:param subset: A list of class-to-index mappings, one per output
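Example (a sketch, reusing the classes from the constructor example above):
```
dataset.reduce([{"label0": 0}, {"label2": 0}])
dataset.classes   # [{'label0': 0}, {'label2': 0}]
```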
"""
assert len(subset) == len(self.classes), "Subset and existing classes differ in the number of outputs"
assert all([all([x in c.keys() for x in s.keys()]) for s, c in
zip(subset, self.classes)]), "Subset contains classes not present in dataset"
......@@ -312,6 +448,12 @@ class MultiOutputSingleLabelDataset(Dataset):
self.classes = subset
def __add__(self, o):
"""
Merges dataset with another dataset.
:param o: Another dataset
:return: MultiOutputSingleLabelDataset containing x, y and classes of both datasets
"""
new_classes = [list(set(list(c1.keys()) + list(c2.keys()))) for c1, c2 in zip(self.classes, o.classes)]
new_classes = [dict(zip(c, range(len(c)))) for c in new_classes]
......@@ -404,6 +546,12 @@ def get_singlelabel_dataset(name):
return get_dataset(name, type=SingleLabelDataset, ensure_valid=False, target_dtype=torch._cast_Float)
def get(name):
"""
Universal get function for datasets.
:param name: Name of the dataset
:return: A dataset if the name exists
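Example ("agnews" stands for any registered dataset name; see register.keys()):
```
dataset = get("agnews")
```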
"""
try:
try:
return get_singlelabel_dataset(name)
......@@ -414,6 +562,12 @@ def get(name):
print(register.keys())
def is_multilabel(x):
"""
Checks if input is a multilabel dataset.
:param x: A dataset
:return: True if multilabel, else False.
"""
return type(x) in (MultiLabelDataset, MultiOutputMultiLabelDataset)
## Sampler import
......
......@@ -22,6 +22,12 @@ CACHE = Path.home() / ".mlmc" / "datasets"
URL = "https://aspra29.informatik.uni-leipzig.de:9090/"
def _load_from_tmp(dataset):
"""
Loads a dataset from cache.
:param dataset: Name of the dataset
:return: Tuple of form (data, classes) if dataset exists in cache, else None
"""
if not Path.exists(CACHE):
Path.mkdir(CACHE)
if Path.is_file(CACHE / dataset):
......@@ -33,6 +39,13 @@ def _load_from_tmp(dataset):
return None
def _save_to_tmp(dataset, data):
"""
Saves a dataset to cache.
:param dataset: Name of the dataset
:param data: Tuple of form (data, classes)
:return: Path to the saved dataset if dataset didn't exist in cache, else False.
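The loader functions below use these helpers in the following pattern (a sketch; "mydataset" is illustrative):
```
data = _load_from_tmp("mydataset")
if data is None:
    data = (splits, classes)            # build the (data, classes) tuple from the raw source
    _save_to_tmp("mydataset", data)
```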
"""
if not Path.exists(CACHE):
Path.mkdir(CACHE)
if not Path.is_file(CACHE / dataset):
......@@ -48,6 +61,12 @@ def _save_to_tmp(dataset, data):
def load_aapd():
"""
Loads AAPD (Arxiv Academic Paper Dataset) from cache. If it doesn't exist in cache the dataset will be downloaded.
Task: Multilabel Classification
:return: Tuple of form (data, classes)
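Example:
```
data, classes = load_aapd()   # downloads on the first call, afterwards served from cache
```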
"""
data = _load_from_tmp("aapd")
if data is not None:
return data
......@@ -76,6 +95,14 @@ def load_aapd():
def load_rcv1(path=None):
"""
Loads RCV1 (Reuters Corpus Volume I) from cache. This dataset can't be downloaded automatically, so a path to the
archive has to be provided on the first call.
Task: Multilabel Classification
:param path: Path to unprocessed corpus
:return: Tuple of form (data, classes)
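Example (the path is illustrative; it is only needed on the first call):
```
data, classes = load_rcv1(path="/path/to/rcv1")   # first call: processes and caches the corpus
data, classes = load_rcv1()                       # later calls are served from cache
```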
"""
data = _load_from_tmp("rcv1")
if data is not None:
return data
......@@ -163,6 +190,14 @@ def load_rcv1(path=None):
def load_wiki30k(path="/disk1/users/jborst/Data/Test/MultiLabel/wiki30k"):
"""
Loads wiki30k from cache. This dataset can't be downloaded automatically, so a path to the archive has to be provided
on the first call.
Task: Multilabel Classification
:param path: Path to pickled dataset
:return: Tuple of form (data, classes)
"""
import pickle
with open(Path(path) / "wiki30k_raw_text.p", "rb") as f:
content = pickle.load(f)
......@@ -175,6 +210,12 @@ def load_wiki30k(path="/disk1/users/jborst/Data/Test/MultiLabel/wiki30k"):
def load_eurlex():
"""
Loads EUR-Lex from cache. If it doesn't exist in cache the dataset will be downloaded.
Task: Multilabel Classification
:return: Tuple of form (data, classes)
"""
data = _load_from_tmp("eurlex")
if data is not None:
return data
......@@ -206,6 +247,13 @@ def load_eurlex():
def load_huffpost(test_split=0.25):
"""
Loads Huffington Post dataset from cache. If it doesn't exist in cache the dataset will be downloaded.
Task: Multilabel Classification
:param test_split: Fraction of the data used for the test split
:return: Tuple of form (data, classes)
"""
data = _load_from_tmp("huffpost")
if data is not None:
return data
......@@ -237,6 +285,13 @@ def load_huffpost(test_split=0.25):
def load_moviesummaries(test_split=0.25):
"""
Loads Movie Summaries dataset from cache. If it doesn't exist in cache the dataset will be downloaded.
Task: Multilabel Classification
:param test_split: Fraction of the data used for the test split
:return: Tuple of form (data, classes)
"""
data = _load_from_tmp("moviesummaries")
if data is not None:
return data
......@@ -285,6 +340,12 @@ def load_moviesummaries(test_split=0.25):
# ----------------------------------------------
def load_blurbgenrecollection():
"""
Loads Blurb Genre Collection dataset from cache. If it doesn't exist in cache the dataset will be downloaded.
Task: Multilabel Classification
:return: Tuple of form (data, classes)
"""
url = "https://fiona.uni-hamburg.de/ca89b3cf/blurbgenrecollectionen.zip"
data = _load_from_tmp("blurbgenrecollection")
if data is not None:
......@@ -319,6 +380,13 @@ def load_blurbgenrecollection():
def load_blurbgenrecollection_de():
"""
Loads German version of Blurb Genre Collection dataset from cache. If it doesn't exist in cache the dataset will be
downloaded.
Task: Multilabel Classification
:return: Tuple of form (data, classes)
"""
url = "https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc/germeval2019t1-public-data-final.zip"
data = _load_from_tmp("blurbgenrecollection_de")
if data is not None:
......@@ -369,6 +437,12 @@ def load_webofscience():
# zipfile = ZipFile(BytesIO(resp.read()))
def load_20newsgroup():
"""
Loads 20newsgroup dataset from cache. If it doesn't exist in cache the dataset will be downloaded.
Task: Singlelabel Classification
:return: Tuple of form (data, classes)
"""
url = "http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz"
data = _load_from_tmp("20newsgroup")
if data is not None:
......@@ -419,6 +493,12 @@ def load_20newsgroup():
return data, classes
def load_agnews():
"""
Loads AG News dataset from cache. If it doesn't exist in cache the dataset will be downloaded.
Task: Singlelabel Classification
:return: Tuple of form (data, classes)
"""
url = "https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz"
data = _load_from_tmp("agnews")
if data is not None:
......@@ -460,6 +540,12 @@ def load_agnews():
return data, classes
def load_dbpedia():
"""
Loads DBpedia dataset from cache. If it doesn't exist in cache the dataset will be downloaded.
Task: Singlelabel Classification
:return: Tuple of form (data, classes)
"""
url = "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz"
data = _load_from_tmp("dbpedia")
if data is not None:
......@@ -501,6 +587,12 @@ def load_dbpedia():
return data, classes
def load_ohsumed():
"""
Loads Ohsumed dataset from cache. If it doesn't exist in cache the dataset will be downloaded.
Task: Multilabel Classification
:return: Tuple of form (data, classes)
"""
url = "http://disi.unitn.eu/moschitti/corpora/ohsumed-first-20000-docs.tar.gz"
url_classes = "http://disi.unitn.eu/moschitti/corpora/First-Level-Categories-of-Cardiovascular-Disease.txt"
data = _load_from_tmp("ohsumed")
......@@ -589,6 +681,13 @@ def load_ohsumed():
return data, classes
def export(data, classes, path=Path("./export")):
"""
Exports the data and class dictionaries of a dataset to text files.
:param data: Data dictionary containing training, validation and test splits
:param classes: Class dictionary mapping labels to indices
:param path: Path to write the text files to
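Example (a sketch; the dataset and target directory are illustrative):
```
data, classes = load_aapd()
export(data, classes, path="./aapd_export")
```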
"""
path = Path(path)
if not path.exists():
path.mkdir()
......@@ -607,6 +706,12 @@ def export(data, classes, path=Path("./export")):
o.writelines([x + "\n" for x in classes.keys()])
def load_yahoo_answers():
"""
Loads Yahoo Answers dataset from cache. If it doesn't exist in cache the dataset will be downloaded.
Task: Singlelabel Classification
:return: Tuple of form (data, classes)
"""
url = (URL+"/yahoo_answers_csv.tar.gz").replace("https","http")
data = _load_from_tmp("yahoo_answers")
if data is not None:
......
......@@ -5,8 +5,8 @@ import torch
def clean(x):
"""
Removes every character from a string that is not an ASCII letter, punctuation, or a space.
:param x: String
:return: Cleaned string
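Example:
```
clean("Héllo, wörld!")   # returns "Hllo, wrld!" -- non-ASCII characters are dropped
```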
"""
import string
return "".join([c for c in x if c in string.ascii_letters + string.punctuation + " "])
......
"""
Provides functions for loading predefined graphs
"""
from .helpers import cooc_matrix
from .embeddings import get_nmf, get_node2vec, get_random_projection
from .graph_loaders import load_wordnet, load_wordnet_sample, load_NELL,load_elsevier,load_conceptNet, load_stw, load_nasa, \
......@@ -18,6 +21,12 @@ register = {
def get_graph(name: str):
"""
Loads a graph.
:param name: Name of the graph (see register.keys())
:return: The loaded graph (the result of calling the registered loader function)
"""
fct = register.get(name)
if fct is None:
raise FileNotFoundError
......@@ -26,8 +35,12 @@ def get_graph(name: str):
import networkx as nx
def get(name: [list, str]):
"""
Loads a graph. If multiple names are provided, the union of the graphs is returned.
:param name: Name(s) of the graph(s) to compose (see register.keys())
:return: Merged graph
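Example (the graph names are illustrative; the available keys are listed in register.keys()):
```
g = get(["wordnet", "stw"])   # union of the two graphs as a single networkx graph
```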
"""
if isinstance(name, str):
name = [name]
return nx.compose_all([get_graph(x) for x in name])
......@@ -7,6 +7,14 @@ from sklearn.decomposition import NMF
def subgraph_extract(X, graph, subnodelist):
"""
Extracts a subset of node embeddings from a graph.
:param X: Node embeddings of graph
:param graph: A networkx graph
:param subnodelist: Dictionary of nodes for which the embedding will be returned
:return: Embeddings of all nodes in subnodelist
"""
new = np.zeros_like(X)
for i, nm in enumerate(graph.nodes):
if nm in subnodelist.keys():
......
......@@ -6,9 +6,25 @@ import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
def ngrams(x, k):
"""
Splits a string into character n-grams.
:param x: A string
:param k: Size of each n-gram
:return: List of n-grams
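Example:
```
ngrams("label", 3)   # ['lab', 'abe', 'bel']
```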
"""
return [x[i:(i + k)] for i in range(len(x) - k + 1)]
def _mh(x, k, num_perm, wk):
"""
Calculates MinHash of a string for estimating Jaccard similarity using shingling. Multiple shingling sizes may be
specified.
:param x: A string
:param k: Shingling size(s)
:param num_perm: Number of permutation functions
:return: MinHash
"""
from datasketch import MinHash, MinHashLSH
x = x.upper()
k = k if isinstance(k, (tuple, list)) else [k]
......@@ -22,6 +38,16 @@ def _mh(x, k, num_perm, wk):
return m1
def _subwordsplits_mh(x, k,num_perm, wk):
"""
Calculates the MinHash of a string for estimating Jaccard similarity using shingling. The initial shingles are further
split into substrings. Multiple shingling sizes may be specified.
:param x: A string
:param k: Shingling size(s)
:param num_perm: Number of permutation functions
:param wk: Shingling size(s) of the substrings
:return: MinHash
"""
x = x.upper()
from datasketch import MinHash, MinHashLSH
k = k if isinstance(k, (tuple, list)) else [k]
......@@ -35,6 +61,19 @@ def _subwordsplits_mh(x, k,num_perm, wk):
return m1
def edges(l1, l2,_mh, num_perm=48, n=(2, 3), threshold=0.65, wk=(1,2,3)):
"""
Compares two lists using Jaccard similarity and shingling. Multiple shingling sizes may be specified.
:param l1: List of nodes
:param l2: List of nodes
:param _mh: MinHash function (_mh or _subwordsplits_mh)
:param num_perm: Number of permutation functions
:param n: Shingling size(s)
:param threshold: Jaccard similarity threshold
:param wk: Shingling size(s) of the substrings
:return: Dictionary mapping each element of l1 to the list of elements of l2 whose estimated similarity is above the
specified threshold
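Example (a sketch; the node lists are illustrative):
```
l1 = ["Machine Learning", "Deep Learning"]
l2 = ["machine_learning", "reinforcement_learning"]
mapping = edges(l1, l2, _mh, num_perm=48, n=(2, 3), threshold=0.65)
# mapping: every element of l1 mapped to the list of elements of l2 above the threshold
```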
"""
# helper for fuzzy matching between the two lists
from datasketch import MinHash, MinHashLSH
......@@ -48,6 +87,7 @@ def edges(l1, l2,_mh, num_perm=48, n=(2, 3), threshold=0.65, wk=(1,2,3)):
def graph_insert_labels(data, kb, explanations):
# TODO: Documentation
l1 = list(data["classes"].keys())
l2 = list(kb.nodes())
......
......@@ -14,6 +14,14 @@ from ..data.data_loaders import _save_to_tmp, _load_from_tmp
def transform(x, rg, lang="en"):
"""
Transforms rdflib terms into plain text.
:param x: A term (see rdflib.term)
:param rg: An RDF Graph
:param lang: Language of the Wikidata article
:return: String representation of the term
"""
if isinstance(x, rdflib.term.URIRef):
from rdflib.namespace import SKOS
if str(SKOS) in str(x): return [x.split("#")[-1]]
......@@ -30,12 +38,25 @@ def transform(x, rg, lang="en"):
def transform_triples(rg,lang="en"):
"""
Transforms rdflib triples into plain text.
:param rg: An RDF Graph
:param lang: Language of the Wikidata article
:return: List containing transformed triples
"""
new_list = [[transform(x, rg, lang) for x in t] for t in tqdm(rg)]
new_list = [x for x in new_list
if (not (any([r==[] for r in x]) or any([r is None for r in x]))) and ( "prefLabel" not in x[1])]
return new_list
def get_wikidata_desc(x):
"""
Queries Wikidata with a list of IDs to retrieve descriptions.
:param x: List of IDs
:return: Dictionary mapping each input ID to its Wikidata description
"""
import requests
output = []
......@@ -61,6 +82,11 @@ def get_wikidata_desc(x):
return descriptions
def load_mesh():
"""
Loads the MeSH thesaurus graph as a networkx.DiGraph.