Commit 0ec67d9a authored by Janos Borst's avatar Janos Borst
Browse files

saVE

parent 20fb99b9
Pipeline #50094 passed with stage
in 9 minutes and 56 seconds
\begin{tabular}{lllll}
\toprule
& & Accuracy & Micro F1 & Duration \\
\midrule
AGNews & BART-large & 0.681579 & NaN & 00:06 \\
& RoBERTa & 0.415263 & NaN & 00:06 \\
BGC & BART-large & NaN & 0.221105 & 16:00 \\
& BART-large & NaN & 0.194714 & 15:59 \\
& RoBERTa & NaN & 0.119461 & 09:22 \\
DBpedia & BART-large & 0.547257 & NaN & 02:55 \\
& RoBERTa & 0.7181 & NaN & 02:30 \\
RCV1 & BART-large & NaN & 0.127119 & 07:19 \\
& BART-large & NaN & 0.144038 & 07:18 \\
& RoBERTa & NaN & 0.09477 & 03:34 \\
\bottomrule
\end{tabular}
artifact_location: file.///data/mlflow/zeroshot/0
experiment_id: '0'
lifecycle_stage: active
name: Default
artifact_location: file.///data/mlflow/zeroshot/1
experiment_id: '1'
lifecycle_stage: active
name: 7 - Fewshot
......@@ -251,6 +251,21 @@ class MultiLabelDataset(Dataset):
"""
return sum([len(x) for x in self.y]) / len(self.y)
@staticmethod
def from_pandas(df, x, y, sep=" ", classes=None):
    """
    Builds a MultiLabelDataset from a pandas DataFrame.

    :param df: DataFrame holding the text column(s) and the label column
    :param x: Name of a text column, or a list of column names. The values of each row are converted
        with str() and joined with ``sep``.
    :param y: Name of the label column, or a sequence whose first element is that name.
        Each cell is assumed to hold a list of labels (TODO confirm against callers).
    :param sep: Separator placed between the text columns of a row
    :param classes: Optional mapping from label to index. If None, the mapping is derived from the
        sorted set of all labels found in the label column.
    :return: MultiLabelDataset created from the joined texts, the label lists and the class mapping
    """
    from itertools import chain

    label_column = y if isinstance(y, str) else y[0]
    # A single column name is wrapped so that the selection below is always a DataFrame.
    text_columns = [x] if isinstance(x, str) else list(x)

    labels = df[label_column].to_list()
    if classes is None:
        # chain.from_iterable flattens in linear time; sum(labels, []) re-copies the
        # accumulated list for every row.
        classes = {cls: i for i, cls in enumerate(sorted(set(chain.from_iterable(labels))))}

    # Rows are joined in plain Python so this does not depend on DataFrame.applymap,
    # which is deprecated in recent pandas releases.
    texts = [
        sep.join(map(str, row))
        for row in df[text_columns].itertuples(index=False, name=None)
    ]
    return MultiLabelDataset(x=texts, y=labels, classes=classes)
def to_pandas(self):
    """
    Exports the dataset as a pandas DataFrame.

    :return: DataFrame with a column "x" built from self.x and a column "y" built from self.y
    """
    # Imported lazily so pandas is only needed when this method is called.
    import pandas

    columns = {"x": self.x, "y": self.y}
    return pandas.DataFrame.from_dict(columns)
class SingleLabelDataset(MultiLabelDataset):
def __init__(self, *args, **kwargs):
......@@ -285,6 +300,19 @@ class SingleLabelDataset(MultiLabelDataset):
assert all(
[len(x) == 1 for x in self.y]), "This is not a single label dataset. Some labels contain multiple labels."
@staticmethod
def from_pandas(df, x, y, sep=" ", classes=None):
    """
    Builds a SingleLabelDataset from a pandas DataFrame.

    :param df: DataFrame holding the text column(s) and the label column
    :param x: Name of a text column, or a list of column names. The values of each row are converted
        with str() and joined with ``sep``.
    :param y: Name of the label column, or a sequence whose first element is that name.
        Every cell is converted with str() and used as the single label of its row.
    :param sep: Separator placed between the text columns of a row
    :param classes: Optional mapping from label to index. If None, the mapping is derived from the
        sorted set of stringified labels found in the label column.
    :return: SingleLabelDataset created from the joined texts, one-element label lists and the class mapping
    """
    label_column = y if isinstance(y, str) else y[0]
    # A single column name is wrapped so that the selection below is always a DataFrame.
    text_columns = [x] if isinstance(x, str) else list(x)

    # Labels are stringified once; the same strings feed both the targets and the class
    # mapping, so every target is guaranteed to be a key of the derived mapping.
    labels = [str(label) for label in df[label_column].tolist()]
    if classes is None:
        classes = {cls: i for i, cls in enumerate(sorted(set(labels)))}

    # Rows are joined in plain Python so this does not depend on DataFrame.applymap,
    # which is deprecated in recent pandas releases.
    texts = [
        sep.join(map(str, row))
        for row in df[text_columns].itertuples(index=False, name=None)
    ]
    return SingleLabelDataset(x=texts, y=[[label] for label in labels], classes=classes)
def to_pandas(self):
    """
    Converts the dataset into a pandas DataFrame.

    :return: DataFrame whose "x" column holds self.x and whose "y" column holds self.y
    """
    # pandas is imported on demand, only when a DataFrame is actually requested.
    import pandas as pandas_module

    data = {"x": self.x, "y": self.y}
    frame = pandas_module.DataFrame.from_dict(data)
    return frame
def __getitem__(self, idx):
    """
    Retrieves a single entry from the dataset.

    :param idx: Index of the entry
    :return: Dictionary containing the text and labels of the entry
    """
    # Only the first label of the entry is used; it is mapped to its class index.
    # Lookup errors (bad index, unknown label) propagate unchanged instead of being
    # hidden behind a bare except with a debug print.
    return {'text': self.x[idx], 'labels': torch.tensor(self.classes[self.y[idx][0]])}
def to_csv(self, filename):
with open(filename, "w") as f:
......
......@@ -11,6 +11,7 @@ class MaskNoiseWrapper(torch.nn.Module):
def forward(self, x, y):
    """
    Applies the wrapped loss and reduces it with a mask and a weighting.

    :param x: First argument passed on to self.loss
    :param y: Second argument passed on to self.loss
    :return: Mean over the masked and weighted loss values, scaled by the size of the first loss dimension
    """
    losses = self.loss(x, y)
    # Mask and weights are built without gradient tracking, so they act as constants.
    with torch.no_grad():
        # NOTE: this seems to be counterproductive according to newer research
        # (well-classified examples).
        upper_bound = losses.mean() + 1 * losses.std()
        # Only the upper bound is applied; the lower bound (mean - 1 * std) is disabled.
        keep = losses < upper_bound
        importance = (1 - losses).softmax(-1)
    scaled = losses * keep * importance * losses.shape[0]
    return scaled.mean()
......@@ -66,6 +66,7 @@ class TextClassificationAbstract(torch.nn.Module):
self._config = {
"classes": classes,
"n_classes": len(classes),
"target": target,
"representation": representation,
"activation": activation, "loss": loss,
......
......@@ -23,7 +23,7 @@ class SimpleEncoder(EncoderAbstract, TextClassificationAbstractZeroShot):
e = e.reshape((int(x["input_ids"].shape[0] / self._config["n_classes"]), self._config["n_classes"]))
elif self._config["target"] == "multi":
e = torch.log(e[:, [0, 2]].softmax(-1)[:, -1])
e = e.squeeze(.1)
e = e.reshape((int(x["input_ids"].shape[0] / len(self._config["classes"])), len(self._config["classes"])))
else:
assert not self._config["target"], f"Target {self._config['target']} not defined"
......
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment