Commit 73cb8273 authored by Janos Borst

Auto stash before rebase of "origin/dev"

parent 24b72fbc
Pipeline #49960 passed in 9 minutes and 47 seconds
from .cb_save import CallbackSaveAndRestore
\ No newline at end of file
from .cb_save import CallbackSaveAndRestore
from .cb_nla import CallbackNLA, CallbackLNA
\ No newline at end of file
@@ -8,3 +8,7 @@ class Callback:
        pass
    def on_epoch_start(self, model):
        pass
    def on_batch_start(self, model):
        pass
    def on_batch_end(self, model):
        pass
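These hooks complete the no-op Callback interface, so a concrete callback only overrides the events it needs. A minimal, hypothetical subclass for illustration (the name and the print are made up):

class VerboseCallback(Callback):
    # Only the overridden hook does anything; every other event stays a no-op.
    def on_epoch_end(self, model):
        print("finished an epoch of", type(model).__name__)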
import torch
from collections import Counter
from .abstract import Callback


class CallbackNLA(Callback):
    def __init__(self, threshold=0.9, target_threshold=0.5, difference=False, quiet=False, epochs=None):
        super(CallbackNLA, self).__init__()
        self.name = "NLA"
        self.threshold = threshold
        self.difference = difference
        self.target_threshold = target_threshold
        self.quiet = quiet
        self.epochs = epochs

    def on_epoch_end(self, model):
        # Drop training examples that the model predicts with high confidence
        # (score above the current threshold) but with a label that disagrees
        # with the annotation.
        labels, scores, _ = model.predict_dataset(model.train_data_set, return_scores=True, pbar=False)
        scores = scores.max(-1)[0].cpu()
        keep = [not (x != y and scores[i].item() > self.threshold)
                for i, (x, y) in enumerate(zip(labels, model.train_data_set.y))]
        new_data = type(model.train_data_set)(x=[x for x, k in zip(model.train_data_set.x, keep) if k],
                                              y=[y for y, k in zip(model.train_data_set.y, keep) if k],
                                              classes=model.train_data_set.classes)
        if not self.quiet:
            class_distr = Counter([y[0] for y, k in zip(model.train_data_set.y, keep) if not k])
            print(f"NLA removed {len(model.train_data_set) - len(new_data)} examples")
            print(class_distr)
        model.train_data_set = new_data

        # Anneal the confidence threshold towards target_threshold.
        if self.epochs is not None:
            self.threshold = self.threshold - (self.threshold - self.target_threshold) / self.epochs
        else:
            if self.threshold > self.target_threshold:
                self.threshold = self.threshold - (self.threshold - self.target_threshold) / 10

    def on_train_end(self, model):
        pass

    def on_epoch_start(self, model):
        pass


class CallbackLNA(Callback):
    def __init__(self, threshold=0.9, target_threshold=0.5, difference=False, quiet=False, epochs=None):
        super(CallbackLNA, self).__init__()
        self.name = "LNA"
        self.threshold = threshold
        self.difference = difference
        self.target_threshold = target_threshold
        self.quiet = quiet
        self.epochs = epochs

    def on_epoch_end(self, model):
        # Drop training examples whose per-example loss lies more than one
        # standard deviation above the mean loss of the training set.
        labels, scores, _ = model.predict_dataset(model.train_data_set, return_scores=True, pbar=False)
        with torch.no_grad():
            l = model.loss.loss(scores, torch.tensor([x["labels"] for x in model.train_data_set]).to(scores.device)).cpu()
            mask = (l < (l.mean() + 1 * l.std()))  # & (l > l.mean() - 1 * l.std())
            mask = mask.cpu().tolist()
        new_data = type(model.train_data_set)(x=[x for x, k in zip(model.train_data_set.x, mask) if k],
                                              y=[y for y, k in zip(model.train_data_set.y, mask) if k],
                                              classes=model.train_data_set.classes)
        if not self.quiet:
            class_distr = Counter([y[0] for y, k in zip(model.train_data_set.y, mask) if not k])
            print(f"LNA removed {len(model.train_data_set) - len(new_data)} examples")
            print(class_distr)
        model.train_data_set = new_data

    def on_train_end(self, model):
        pass

    def on_epoch_start(self, model):
        pass
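The annealing step at the end of CallbackNLA.on_epoch_end moves the cutoff by a fixed fraction of the remaining gap towards target_threshold each epoch. A minimal standalone sketch of that schedule, with purely illustrative values:

threshold, target_threshold, epochs = 0.9, 0.5, 4   # illustrative values only
for epoch in range(epochs):
    threshold = threshold - (threshold - target_threshold) / epochs
    print(epoch, threshold)
# approximately 0.8, 0.725, 0.669, 0.627 -- the cutoff approaches but never reaches 0.5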
@@ -99,4 +99,17 @@ SFORMATTER = {"agnews": lambda x: f"The topic of this is {label_dicts['agnews'].
"trec50": lambda x: f"This is a {label_dicts['trec50'][x]}",
"yelpfull": lambda x: f"This is a {label_dicts['yelpfull'][x]}",
"amazonfull": lambda x: f"This is a {label_dicts['amazonfull'][x]}",
}
SFORMATTER_TARS = {"agnews": lambda x: f"label topic {label_dicts['agnews'].get(x,x)}",
"yahoo_answers": lambda x: f"label topic {x}",
"rcv1": lambda x: f"label topics {x}",
"blurbgenrecollection": lambda x: f"label topics {x}",
"movies_summaries": lambda x: f"label genre {x}",
"movie_reviews": lambda x: f"{x} sentiment",
"dbpedia": lambda x: f"label topic {label_dicts['dbpedia'].get(x,x)}",
"trec6": lambda x: f"question {label_dicts['trec6'][x]}",
"trec50": lambda x: f"question {label_dicts['trec50'][x]}",
"yelpfull": lambda x: f"sentiment {label_dicts['yelpfull'][x]}",
"amazonfull": lambda x: f"sentiment {label_dicts['amazonfull'][x]}",
}
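For illustration only (the real label names live in label_dicts, defined elsewhere in this module), the TARS-style formatters turn a label into a short hypothesis-like string:

SFORMATTER_TARS["movie_reviews"]("positive")   # -> "positive sentiment"
SFORMATTER_TARS["agnews"]("Sports")            # -> "label topic Sports" (.get(x, x) falls back to the key itself)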
\ No newline at end of file
import torch.cuda
import transformers
import tqdm
from transformers import pipeline
import random
import mlmc
import mlflow

dataset = "agnews"


def prompt(cls, example):
    # Build a few-shot prompt: example texts followed by the target category.
    if dataset == "dbpedia":
        cls = mlmc.data.label_dicts[dataset].get(cls, cls)
        s = "Wikipedia article:\nText: {example}\n\n"
        example = [example] if isinstance(example, str) else example
        s = "\n\n".join([s.replace("{example}", x) for x in example])
        return s + f"Wikipedia Article\n Category: {cls}\nText: "
    elif dataset == "agnews":
        cls = mlmc.data.label_dicts[dataset].get(cls, cls).replace("World", "World, General")
        s = "News article:\nText: {example}\n\n"
        example = [example] if isinstance(example, str) else example
        s = "\n\n".join([s.replace("{example}", x)[:512] for x in example])
        return s + f"news Article\n Category: {cls}\nText: "


def prompt_ctrl(cls, example):
    # CTRL-style prompt: control code, category and the first word of an example.
    cls = mlmc.data.label_dicts[dataset].get(cls, cls)
    if dataset == "dbpedia":
        return f"Wikipedia {cls} {example[0].split(' ')[0]}"
    elif dataset == "agnews":
        return f"News {cls} {example[0].split(' ')[0]}"


def create_synth_dataset(model="microsoft/DialoGPT-large", classes={}, examples=[], k=32, n=4, s=8):
    # Generate roughly k synthetic examples per class with a text-generation pipeline.
    tokenizer = transformers.AutoTokenizer.from_pretrained(model)
    model = transformers.AutoModelWithLMHead.from_pretrained(model, pad_token_id=tokenizer.eos_token_id)
    text_generation = pipeline("text-generation", model=model, tokenizer=tokenizer, device=1)
    x = []
    y = []
    for cls in classes.keys():
        for _ in tqdm.trange(int(k / n / s)):
            prefix_text = prompt(cls, random.choices(examples, k=2))
            if n > 1:
                generated_text = sum(text_generation([prefix_text] * n, max_length=512, do_sample=True,
                                                     return_full_text=False, num_return_sequences=s), [])
            else:
                generated_text = text_generation(prefix_text, max_length=512, do_sample=True,
                                                 return_full_text=False, num_return_sequences=s)
            x.extend([g["generated_text"] for g in generated_text])
            y.extend([[cls]] * n * s)
    del text_generation
    del model
    torch.cuda.empty_cache()
    return mlmc.data.SingleLabelDataset(x=x, y=y, classes=classes)
gen_model="t5-large"
mlflow.set_tracking_uri("file:///home/jborst/generation")
id = mlflow.set_experiment("data")
with mlflow.start_run(run_name=f"{gen_model}_example_prompt"):
mlflow.log_param("dataset", dataset)
d = mlmc.data.get(dataset)
u = mlmc.data.sampler(d["train"], absolute=32)
usample = u.x[:32]
lsample = u.y[:32]
k=2#048
s=32
synth = create_synth_dataset(model=gen_model, classes = d["classes"], examples=usample, k=k, n=1, s=s)
mlflow.log_param("K", k)
mlflow.log_param("s", s)
mlflow.log_param("gen_model", gen_model)
import pickle
with open("synth.data", "wb") as f: pickle.dump(synth, f)
mlflow.log_artifact("synth.data")
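For orientation, the Hugging Face text-generation pipeline used above returns a list of dicts (one per returned sequence, each with a "generated_text" key) for a single prompt, and a list of such lists when given a list of prompts, which is why the n > 1 branch flattens with sum(..., []). A sketch with made-up contents (text_generation stands for the pipeline object built inside create_synth_dataset):

out = text_generation(["prompt A", "prompt B"], num_return_sequences=2, do_sample=True, return_full_text=False)
# out ~ [[{"generated_text": "..."}, {"generated_text": "..."}],
#        [{"generated_text": "..."}, {"generated_text": "..."}]]
flat = sum(out, [])  # a single list of 4 dicts, as consumed by x.extend(...)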
import torch.cuda
import transformers
import tqdm
from transformers import pipeline
import random
import mlmc
import mlflow

dataset = "agnews"
device = "cuda:1"


def prompt(cls, example):
    # Build a few-shot prompt: example texts followed by the target category.
    if dataset == "dbpedia":
        cls = mlmc.data.label_dicts[dataset].get(cls, cls)
        s = "Wikipedia article:\nText: {example}\n\n"
        example = [example] if isinstance(example, str) else example
        s = "\n\n".join([s.replace("{example}", x) for x in example])
        return s + f"Wikipedia Article\n Category: {cls}\nText: "
    elif dataset == "agnews":
        cls = mlmc.data.label_dicts[dataset].get(cls, cls).replace("World", "World, General")
        s = "News article:\nText: {example}\n\n"
        example = [example] if isinstance(example, str) else example
        s = "\n\n".join([s.replace("{example}", x)[:512] for x in example])
        return s + f"news Article\n Category: {cls}\nText: "


def prompt_ctrl(cls, example):
    # CTRL-style prompt: control code, category and the first word of an example.
    cls = mlmc.data.label_dicts[dataset].get(cls, cls)
    if dataset == "dbpedia":
        return f"Wikipedia {cls} {example[0].split(' ')[0]}"
    elif dataset == "agnews":
        return f"News {cls} {example[0].split(' ')[0]}"


def create_synth_dataset(model="microsoft/DialoGPT-large", classes={}, examples=[], k=32, n=4, s=8):
    # Generate synthetic examples per class by decoding from a seq2seq model directly.
    tokenizer = transformers.AutoTokenizer.from_pretrained(model)
    model = transformers.T5ForConditionalGeneration.from_pretrained(model, pad_token_id=tokenizer.eos_token_id).to(device)
    x = []
    y = []
    for cls in classes.keys():
        for _ in tqdm.trange(int(k / n / s)):
            prefix_text = prompt(cls, random.choices(examples, k=2))
            input_ids = tokenizer.encode("Generate News Article Sports:", return_tensors='pt')
            greedy_output = model.generate(input_ids.to(device), num_beams=7, no_repeat_ngram_size=2,
                                           min_length=50, max_length=100)
            generated_text = [tokenizer.decode(o, skip_special_tokens=True) for o in greedy_output]
            print(generated_text[0])
            x.extend(generated_text)
            y.extend([[cls]] * len(generated_text))
    del model
    torch.cuda.empty_cache()
    return mlmc.data.SingleLabelDataset(x=x, y=y, classes=classes)


gen_model = "t5-large"
mlflow.set_tracking_uri("file:///home/jborst/generation")
id = mlflow.set_experiment("data")

with mlflow.start_run(run_name=f"{gen_model}_example_prompt"):
    mlflow.log_param("dataset", dataset)
    d = mlmc.data.get(dataset)
    u = mlmc.data.sampler(d["train"], absolute=32)
    usample = u.x[:32]
    lsample = u.y[:32]
    k = 2  # 048
    s = 32
    synth = create_synth_dataset(model=gen_model, classes=d["classes"], examples=usample, k=k, n=1, s=s)
    mlflow.log_param("K", k)
    mlflow.log_param("s", s)
    mlflow.log_param("gen_model", gen_model)

    import pickle
    with open("synth.data", "wb") as f:
        pickle.dump(synth, f)
    mlflow.log_artifact("synth.data")
from .loss_labelwise_ranking import RelativeRankingLoss
\ No newline at end of file
from .loss_labelwise_ranking import RelativeRankingLoss
from .loss_mask_noise_wrapper import MaskNoiseWrapper
\ No newline at end of file
import torch


class MaskNoiseWrapper(torch.nn.Module):
    def __init__(self, loss, warm_up=0):
        super(MaskNoiseWrapper, self).__init__()
        self.loss = loss
        self.average = 0
        self.warm_up = warm_up
        self.step = 0

    def forward(self, x, y):
        # The wrapped loss must return per-example values (no reduction).
        l = self.loss(x, y)
        with torch.no_grad():
            # Mask out entries whose loss lies more than one standard deviation
            # above the mean (likely label noise) and down-weight the remainder.
            mask = (l < l.mean() + 1 * l.std())  # & (l > l.mean() - 1 * l.std())
            weight = (1 - l).softmax(-1)
        return (l * mask * weight * l.shape[0]).mean()
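MaskNoiseWrapper assumes the wrapped loss returns one value per element rather than a scalar. A minimal usage sketch with BCELoss in "none" reduction as an assumed inner loss (all tensors are dummies):

inner = torch.nn.BCELoss(reduction="none")      # elementwise losses, no averaging
criterion = MaskNoiseWrapper(inner)
scores = torch.rand(8, 5, requires_grad=True)   # fake model outputs in [0, 1]
targets = torch.randint(0, 2, (8, 5)).float()   # fake binary targets
loss = criterion(scores, targets)               # high-loss (likely noisy) entries are masked
loss.backward()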
@@ -34,4 +34,5 @@ class EncoderAbstract(LabelEmbeddingAbstract):
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
        self.embedding = AutoModelForSequenceClassification.from_pretrained(self.representation, num_labels=3)
        self.tokenizer = AutoTokenizer.from_pretrained(self.representation)
        self.embeddings_dim = self.embedding(self.tokenizer("test"))
        for param in self.embedding.parameters(): param.requires_grad = self.finetune
@@ -4,4 +4,5 @@ from .embedding.embedding_weighted import EmbeddingBasedWeighted
from .embedding.embedding_entailment import EmbeddingBasedEntailment
from .embedding.embedding_entailment_fft import EmbeddingBasedEntailmentFFT
from .embedding.embedding_entailment_cps import EmbeddingBasedEntailmentCPS
from .embedding.embedding_weighted_euclidean import EmbeddingBasedWeightedEuclidean
from .graph.graph import GraphBased
\ No newline at end of file
from .embedding_weighted import EmbeddingBasedWeighted
from .embedding_entailment import EmbeddingBasedEntailment
from .embedding_entailment_fft import EmbeddingBasedEntailmentFFT
from .embedding_entailment_cps import EmbeddingBasedEntailmentCPS
\ No newline at end of file
from .embedding_entailment_cps import EmbeddingBasedEntailmentCPS
from .embedding_weighted_euclidean import EmbeddingBasedWeightedEuclidean
\ No newline at end of file
import torch
from ...abstracts.abstracts_zeroshot import TextClassificationAbstractZeroShot
from ...abstracts.abstract_sentence import SentenceTextClassificationAbstract


class EmbeddingBasedWeightedEuclidean(SentenceTextClassificationAbstract, TextClassificationAbstractZeroShot):
    """
    Zeroshot model based on the euclidean distance of embedding vectors.
    """
    def __init__(self, mode="vanilla", entailment_output=3, *args, **kwargs):
        """
        Zeroshot model based on the euclidean distance of embedding vectors.
        This changes the default activation to the identity function (lambda x: x)
        Args:
            mode: one of ("vanilla", "max", "mean", "max_mean"). Determines how the sequence is weighted to build the input representation
            entailment_output: the format of the entailment output if NLI pretraining is used. (experimental)
            *args:
            **kwargs:
        """
        if "act" not in kwargs:
            kwargs["activation"] = lambda x: x
        super(EmbeddingBasedWeightedEuclidean, self).__init__(*args, **kwargs)

        self.modes = ("vanilla", "max", "mean", "max_mean")
        assert mode in self.modes, f"Unknown mode: '{mode}'!"
        self.set_mode(mode=mode)

        self.create_labels(self.classes)
        self.parameter = torch.nn.Linear(self.embeddings_dim, 256)
        self.entailment_projection = torch.nn.Linear(3 * self.embeddings_dim, entailment_output)
        self.build()
    def set_mode(self, mode):
        """Set the weighting mode"""
        self.mode = mode.split("_")
        self._config["mode"] = mode

    def forward(self, x, *args, **kwargs):
        input_embedding = self.embedding(**x)[0]
        label_embedding = self.embedding(**self.label_dict)[0]

        if "mean" in self.mode:
            # Center the label embeddings.
            label_embedding = label_embedding - label_embedding.mean(0)

        if "max" in self.mode:
            # Token-level cosine similarities, reduced by a max over both sequences.
            input_embedding2 = input_embedding / input_embedding.norm(p=2, dim=-1, keepdim=True)
            label_embedding2 = label_embedding / label_embedding.norm(p=2, dim=-1, keepdim=True)
            word_scores = 0.5*(1+torch.einsum("ijk,lnk->iljn", input_embedding2, label_embedding2)).max(-1)[0].max(-1)[0].log_softmax(-1)

        input_embedding = self._mean_pooling(input_embedding, x["attention_mask"])
        label_embedding = self._mean_pooling(label_embedding, self.label_dict["attention_mask"])

        # Negative euclidean distance between every input and every label embedding.
        r = -(input_embedding[:, None] - label_embedding[None]).norm(p=2, dim=-1)
        if "max" in self.mode:
            r = r + word_scores
        return r
    def embed(self, x):
        """
        Method to return input embeddings.
        ToDo: Modularize the forward to avoid code copying.
        Args:
            x: list of input texts

        Returns: a tuple of:
            A tensor of embeddings of shape (b, e), where b is the number of input texts and e the embedding dimension
            A tensor of embeddings of shape (l, e), where l is the number of labels and e the embedding dimension
        """
        x = self.transform(x)
        input_embedding = self.embedding(**x)[0]
        label_embedding = self.embedding(**self.label_dict)[0]

        if "mean" in self.mode:
            label_embedding = label_embedding - label_embedding.mean(0)

        if "attention" in self.mode or "max" in self.mode:
            input_embedding2 = input_embedding / input_embedding.norm(p=2, dim=-1, keepdim=True)
            label_embedding2 = label_embedding / label_embedding.norm(p=2, dim=-1, keepdim=True)
            word_scores = torch.einsum("ijk,lnk->iljn", input_embedding2, label_embedding2)

        if "attention" in self.mode:
            attentions = torch.relu(word_scores.mean(-1))
            input_embedding = self._mean_pooling((attentions[..., None] * input_embedding[:, None]).transpose(1, 2),
                                                 x["attention_mask"][:, :, None])
            label_embedding = self._mean_pooling(label_embedding, self.label_dict["attention_mask"])
            input_embedding = input_embedding / (input_embedding.norm(p=2, dim=-1, keepdim=True) + 1e-25)
            label_embedding = label_embedding / (label_embedding.norm(p=2, dim=-1, keepdim=True) + 1e-25)
        else:
            input_embedding = self._mean_pooling(input_embedding, x["attention_mask"])
            label_embedding = self._mean_pooling(label_embedding, self.label_dict["attention_mask"])
            input_embedding = input_embedding / (input_embedding.norm(p=2, dim=-1, keepdim=True) + 1e-25)
            label_embedding = label_embedding / (label_embedding.norm(p=2, dim=-1, keepdim=True) + 1e-25)

        return input_embedding, label_embedding
    def scores(self, x):
        """
        Returns a 2D score tensor of shape (N, L), where N is the number of inputs and L the number of labels.
        Args:
            x: list of input texts

        Returns:
            Score tensor of shape (N, L)
        """
        self.eval()
        assert not (self._config["target"] == "single" and self._config["threshold"] != "max"), \
            "You are running in single target mode but not predicting with the max threshold."
        if not hasattr(self, "classes_rev"):
            self.classes_rev = {v: k for k, v in self.classes.items()}
        x = self.transform(x)
        with torch.no_grad():
            output = self.act(self(x))
            if self._loss_name == "ranking":
                output = 0.5 * (output + 1)
        self.train()
        return output
    def single(self, loss="ranking"):
        """Helper function to set the model into single label mode"""
        from ....loss import RelativeRankingLoss
        self._config["target"] = "single"
        self.set_threshold("max")
        self.set_activation(lambda x: x)
        self._loss_name = loss
        if loss == "ranking":
            self.set_loss(RelativeRankingLoss(0.5))
        else:
            self.set_loss(torch.nn.CrossEntropyLoss())
        self._all_compare = True
        self.build()

    def multi(self, loss="ranking"):
        """Helper function to set the model into multi label mode"""
        from ....loss import RelativeRankingLoss
        self._config["target"] = "multi"
        self.set_threshold("mcut")
        self.set_activation(lambda x: x)
        self._loss_name = loss
        if loss == "ranking":
            self.set_loss(RelativeRankingLoss(0.5))
        else:
            self.set_loss(torch.nn.BCELoss())
        self._all_compare = True
        self.build()

    def sts(self):
        """Helper function to set the model into STS mode"""
        from ....loss import RelativeRankingLoss
        self._config["target"] = "multi"
        self._loss_name = "ranking"
        self.set_threshold("hard")
        self.set_activation(lambda x: x)
        self.set_loss(RelativeRankingLoss(0.5))
        self.build()

    def entailment(self):
        """Helper function to set the model into entailment mode"""
        self._config["target"] = "single"
        self._config["entailment_output"] = 3
        self.target = "single"
        self.set_sformatter(lambda x: x)
        self.set_threshold("max")
        self.set_activation(torch.softmax)
        self.set_loss(torch.nn.CrossEntropyLoss())
        self._all_compare = False
        self.build()
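The core scoring in forward above is the negative pairwise euclidean distance between pooled input and label embeddings. A tiny self-contained sketch of that single operation on dummy tensors (shapes are arbitrary):

import torch

input_embedding = torch.randn(4, 768)    # (batch, embedding dim), dummy values
label_embedding = torch.randn(10, 768)   # (labels, embedding dim), dummy values
# Broadcast to (batch, labels, dim) and take the negative L2 norm per pair.
r = -(input_embedding[:, None] - label_embedding[None]).norm(p=2, dim=-1)
print(r.shape)  # torch.Size([4, 10]); a larger (less negative) score means the label is closer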