Fabian Ziegner / ZeroshotEvaluation / Commits / b8a23e24

Commit b8a23e24, authored Sep 06, 2021 by Fabian Ziegner
Added scripts
parent: d70e9a88
Changes: 4 files
transform.py (new file, 0 → 100644)
import pandas as pd
import mdbh

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

db_name = ""
uri = mdbh.get_mongodb(".mongo.conf", db_name)

df = mdbh.get_dataframe(uri)
# df = pd.read_pickle("zeroshot_results.pkl")
df = df[df["id"] > 50]
df = df[["experiment.name", 'metrics.test_accuracy',
         'metrics.test_multilabel_report_micro avg_f1-score',
         'metrics.test_multilabel_report_micro avg_precision',
         'metrics.test_multilabel_report_micro avg_recall',
         'metrics.test_multilabel_report_macro avg_f1-score',
         'metrics.test_multilabel_report_macro avg_precision',
         'metrics.test_multilabel_report_macro avg_recall',
         "metrics.test_p@1", "metrics.test_p@3", "metrics.test_p@5",
         "start_time", "stop_time", "config.batch_size", "config.dataset",
         "config.representation", "config.threshold", "config.target",
         "config.formatted", "config.method", "config.whole_dataset",
         "config.cut_sample", "config.dataset_size"]]

mapping = {"experiment.name": "name",
           "config.batch_size": "batch_size",
           "config.dataset": "dataset",
           "config.threshold": "threshold",
           "config.target": "target",
           "config.formatted": "formatted",
           "config.method": "method",
           "config.whole_dataset": "whole_dataset",
           "config.cut_sample": "cut_sample",
           "config.dataset_size": "dataset_size",
           "metrics.test_accuracy": "accuracy",
           "metrics.test_multilabel_report_micro avg_f1-score": "micro_f1",
           "metrics.test_multilabel_report_micro avg_precision": "micro_precision",
           "metrics.test_multilabel_report_micro avg_recall": "micro_recall",
           "metrics.test_multilabel_report_macro avg_f1-score": "macro_f1",
           "metrics.test_multilabel_report_macro avg_precision": "macro_precision",
           "metrics.test_multilabel_report_macro avg_recall": "macro_recall",
           "metrics.test_p@1": "precision@1",
           "metrics.test_p@3": "precision@3",
           "metrics.test_p@5": "precision@5",
           "config.representation": "representation"}
df = df.rename(columns=mapping)

# Calculate duration of experiments
time_list = []
for index, row in df.iterrows():
    time = row['stop_time'] - row['start_time']
    time_list.append(time)
df["duration"] = time_list
df = df.drop(columns=["stop_time", "start_time"])
df = df.sort_values('name')
# Keep only the HH:MM:SS part of the timedelta string
df["duration"] = df["duration"].map(lambda x: str(x).split(" ")[-1][:8])
df.loc[df["cut_sample"] != True, "cut_sample"] = False
df.to_pickle("./zeroshot_results_formatted.pkl")
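Note: the duration formatting above relies on the string form of a pandas Timedelta. A minimal sketch of what the map does, assuming stop_time and start_time are parsed as pandas timestamps (the values here are hypothetical):

import pandas as pd

# str() of a Timedelta looks like "0 days 01:04:56(.123456)"; taking the last
# whitespace-separated token and its first 8 characters keeps only HH:MM:SS.
delta = pd.Timestamp("2021-09-06 12:34:56") - pd.Timestamp("2021-09-06 11:30:00")
print(str(delta))                     # 0 days 01:04:56
print(str(delta).split(" ")[-1][:8])  # 01:04:56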
zeroshot.py (new file, 0 → 100644)
import flair
import mlmc
import torch
from flair.data import Sentence
from flair.models.text_classification_model import TARSClassifier
from sacred import Experiment, SETTINGS
from sacred.observers import MongoObserver
from sacred.utils import apply_backspaces_and_linefeeds
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

name = ""
user = ""
host = ""
database = ""
auth = ""
pw = ""

SETTINGS.CAPTURE_MODE = "sys"
ex = Experiment(name)
ex.observers.append(MongoObserver(url="localhost:27017", db_name=database))
ex.captured_out_filter = apply_backspaces_and_linefeeds


class ZeroshotClassification:
    """
    Class to store model information and run methods. If Huggingface is used, the class attribute self.classifier
    has to be called; in case of Flair, self.model.predict_zero_shot.
    """

    def __init__(self, classes, target, threshold, representation="facebook/bart-large-mnli", formatted=True,
                 device=0):
        """
        :param classes: A dictionary mapping class label to ID.
        :param target: "single" if single-label, "multi" if multi-label.
        :param threshold: Score threshold to use. (see mlmc.thresholds.thresholds_dict.keys())
        :param representation: A huggingface model. (see https://huggingface.co/models)
        :param formatted: If formatting is set to True, each class label is replaced by a more descriptive label.
        Furthermore, if the huggingface method is used, the hypothesis is replaced as well.
        :param device: GPU to use.
        """
        if representation == "tars-base":
            self.model = TARSClassifier.load('tars-base')
        else:
            self.model = AutoModelForSequenceClassification.from_pretrained(representation)
            self.tokenizer = AutoTokenizer.from_pretrained(representation)
            self.classifier = pipeline("zero-shot-classification", model=self.model, tokenizer=self.tokenizer,
                                       device=device)
        self.config = {
            "representation": representation,
            "classes": classes,
            "threshold": threshold,
            "target": target,
            "format": formatted,
        }

    def init_metrics(self, metrics="default_singlelabel"):
        """
        Initializes the metrics to be used. If no metrics are specified, the default metrics for the given target
        are used. (see mlmc.metrics.metrics_config.items())
        :param metrics: Name of the metrics (see mlmc.metrics.metrics_dict.keys() and mlmc.metrics.metrics_config.keys())
        :return: A dictionary containing the initialized metrics
        """
        metrics = mlmc.metrics.MetricsDict(metrics)
        metrics.init(self.config)
        metrics.reset()
        return metrics


@ex.config
def ex_config():
    """
    :param device: GPU to use.
    :param batch_size: Batch size.
    :param representation: A huggingface model. (see https://huggingface.co/models)
    :param dataset: Dataset to use. (see mlmc.data.register.keys())
    :param target: "single" if single-label, "multi" if multi-label.
    :param threshold: Score threshold to use. (see mlmc.thresholds.thresholds_dict.keys())
    :param formatted: If formatting is set to True, each class label is replaced by a more descriptive label.
    Furthermore, if the huggingface method is used, the hypothesis is replaced as well.
    :param cut_sample: Trims the input text to the maximum input size of the language model.
    :param method: "huggingface" or "flair"
    :param whole_dataset: If True, the entire dataset is used for classification.
    """
    device = 0
    batch_size = 1
    representation = "tals/albert-base-mnli"
    dataset = "trec6"
    target = "single"
    threshold = "max"
    if target == "multi":
        threshold = "mcut"
    formatted = True
    cut_sample = False
    if target == "multi":
        cut_sample = True
    method = "huggingface"
    if method == "flair":
        flair.device = torch.device(f'cuda:{device}')
    whole_dataset = True
    if dataset == "rcv1":
        whole_dataset = False
    dataset_size = 10000


@ex.automain
def run(_run, dataset, batch_size, representation, threshold, target, formatted, method, cut_sample):
    """
    Sacred run method. Parameters are automatically retrieved from the configuration.
    """
    data = mlmc.data.get(dataset)
    hypothesis = "This example is {}."
    if formatted:
        # replace hypothesis to be more task specific
        hypothesis = mlmc.data.dataset_formatter.SFORMATTER[dataset]("{}")
        if dataset in ["trec6", "trec50", "dbpedia", "agnews", "yelpfull", "amazonfull"]:
            formatted_classes = {}
            for i, c in enumerate(data["classes"]):
                # replace class label to be more descriptive
                formatted_class = mlmc.data.dataset_formatter.label_dicts[dataset].get(c, c)
                formatted_classes[formatted_class] = i
            data["classes"] = formatted_classes
    classes_dict = data["classes"]
    classes_list = [x for x in data["classes"].keys()]

    # cut rcv1 to a smaller size, as classifying the whole set would take too long
    if dataset == "rcv1":
        data["test"] = mlmc.data.sampler(data["test"], absolute=10000)
    test_dataloader = DataLoader(data["test"], batch_size=batch_size, shuffle=False)

    zc = ZeroshotClassification(representation=representation, classes=data["classes"], threshold=threshold,
                                target=target, formatted=formatted, device=0)
    if target == "multi":
        initialized_metrics = zc.init_metrics(metrics="default_multilabel")
        multi_class = True
    else:
        initialized_metrics = zc.init_metrics()
        multi_class = False
    threshold_ = mlmc.thresholds.get(threshold)

    for sample in tqdm(test_dataloader):
        results, truth_l, pred_l = [], [], []
        if method == "huggingface":
            if cut_sample:
                with torch.no_grad():
                    results.append(zc.classifier(sample["text"][0][:zc.model.config.max_position_embeddings],
                                                 classes_list, multi_class=multi_class,
                                                 hypothesis_template=hypothesis))
            else:
                with torch.no_grad():
                    results.append(zc.classifier(sample["text"][0], classes_list, multi_class=multi_class,
                                                 hypothesis_template=hypothesis))
        elif method == "flair":
            sentence = Sentence(sample["text"][0])
            with torch.no_grad():
                zc.model.predict_zero_shot(sentence, classes_list, multi_label=multi_class)
            results_dict = {"labels": [], "scores": []}
            r = [sentence.get_labels()]
            for result in r:
                for s in result:
                    # clean the output to get the correct input format for threshold application
                    results_dict["labels"].append(str(s).split(" (")[0])
                    results_dict["scores"].append(float(str(s).split(" (")[1].split(")")[0]))
            results.append(results_dict)
        # order the scores for each prediction according to the order of the class dictionary
        for results_dict in results:
            scores_list = [x for _, x in sorted(zip(results_dict["labels"], results_dict["scores"]),
                                                key=lambda y: classes_dict.get(y[0]))]
            scores = torch.tensor([scores_list])
            truth_l.append(torch.squeeze(sample["labels"]))
            pred_l.append(torch.squeeze(threshold_(scores)))
            initialized_metrics.update_metrics((scores, torch.stack(truth_l), torch.stack(pred_l)))

    initialized_metrics.compute()
    initialized_metrics.log_sacred(_run, 1, "test")
    metrics = initialized_metrics.print()
    print(metrics)
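Note: the re-ordering step above exists because the huggingface pipeline returns labels sorted by descending score, not in class-dictionary order. A minimal sketch of the output shape the script consumes (the checkpoint is the class default; the input text and labels here are made-up examples):

from transformers import pipeline

clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
out = clf("The GPU market saw record sales this quarter.",
          ["technology", "sports", "politics"],
          hypothesis_template="This example is {}.")
# out is a dict {"sequence": ..., "labels": [...], "scores": [...]} with
# labels sorted by descending entailment score.
print(out["labels"][0], out["scores"][0])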
zeroshot_sacred.py (new file, 0 → 100644)
from zeroshot import ex

device = "0"
representation = "tars-base"
method = "flair"
target = "single"
threshold = "max"

datasets = ["agnews", "dbpedia", "trec6", "trec50", "yahoo_answers", "amazonfull", "yelpfull"]
for dataset in datasets:
    experiment = ex.run(options={'--name': f'{dataset}-TARS_base'},
                        config_updates={'representation': representation,
                                        'target': target,
                                        'dataset': dataset,
                                        'threshold': threshold,
                                        'device': device,
                                        'method': method})

target = "multi"
threshold = "mcut"
datasets = ["blurbgenrecollection", "rcv1"]
for dataset in datasets:
    experiment = ex.run(options={'--name': f'{dataset}-TARS_base'},
                        config_updates={'representation': representation,
                                        'target': target,
                                        'dataset': dataset,
                                        'threshold': threshold,
                                        'device': device,
                                        'method': method})
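Note: the same pattern works for a single one-off run outside the loops. A sketch, with the run name and checkpoint chosen as illustrative examples (not configurations used above), exercising the huggingface code path instead of TARS/flair:

from zeroshot import ex

ex.run(options={'--name': 'agnews-bart_mnli'},
       config_updates={'representation': 'facebook/bart-large-mnli',
                       'target': 'single',
                       'dataset': 'agnews',
                       'threshold': 'max',
                       'device': "0",
                       'method': 'huggingface'})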
zeroshot_uqa.py (new file, 0 → 100644)
from string import ascii_uppercase

import mlmc
import torch
from sacred import Experiment, SETTINGS
from sacred.observers import MongoObserver
from sacred.utils import apply_backspaces_and_linefeeds
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, T5ForConditionalGeneration

name = ""
user = ""
host = ""
database = ""
auth = ""
pw = ""

SETTINGS.CAPTURE_MODE = "sys"
ex = Experiment(name)
ex.observers.append(MongoObserver(url="localhost:27017", db_name=database))
ex.captured_out_filter = apply_backspaces_and_linefeeds


class ZeroshotClassification:
    def __init__(self, representation, classes, target, format_):
        self.model = T5ForConditionalGeneration.from_pretrained(representation)
        self.tokenizer = AutoTokenizer.from_pretrained(representation)
        self.config = {
            "representation": representation,
            "classes": classes,
            "target": target,
            "format": format_,
        }

    def init_metrics(self, metrics="default_singlelabel"):
        """
        Initializes the metrics to be used. If no metrics are specified, the default metrics for the given target
        are used. (see mlmc.metrics.metrics_config.items())
        :param metrics: Name of the metrics (see mlmc.metrics.metrics_dict.keys() and mlmc.metrics.metrics_config.keys())
        :return: A dictionary containing the initialized metrics
        """
        metrics = mlmc.metrics.MetricsDict(metrics)
        metrics.init(self.config)
        metrics.reset()
        return metrics

    def run_model(self, input_string, **generator_args):
        input_ids = self.tokenizer.encode(input_string, return_tensors="pt")
        res = self.model.generate(input_ids, **generator_args)
        return self.tokenizer.batch_decode(res, skip_special_tokens=True)


@ex.config
def ex_config():
    device = 0
    batch_size = 1
    representation = "tals/albert-base-mnli"
    dataset = "agnews"
    target = "single"
    threshold = "max"
    if target == "multi":
        threshold = "mcut"
    formatted = True
    cut_sample = False
    if target == "multi":
        cut_sample = True
    method = "huggingface"
    whole_dataset = True
    if dataset == "rcv1":
        whole_dataset = False
    dataset_size = 10000


@ex.automain
def run(_run, dataset, formatted):
    data = mlmc.data.get(dataset)
    if formatted:
        if dataset in ["trec6", "trec50", "dbpedia", "agnews", "yelpfull", "amazonfull"]:
            formatted_classes = {}
            for i, c in enumerate(data["classes"]):
                # replace class label to be more descriptive
                formatted_class = mlmc.data.dataset_formatter.label_dicts[dataset].get(c, c)
                formatted_classes[formatted_class] = i
            data["classes"] = formatted_classes
    classes = data["classes"]
    if dataset == "rcv1":
        data["test"] = mlmc.data.sampler(data["test"], absolute=10000)
    test_dataloader = DataLoader(data["test"], batch_size=1, shuffle=False)

    zc = ZeroshotClassification("allenai/unifiedqa-t5-small", classes=classes, target="single", format_=formatted)
    initialized_metrics = zc.init_metrics()
    threshold_ = mlmc.thresholds.get("max")

    question = "What is this question about?"
    choices = ""
    """
    class_counter = 0
    for char1 in ascii_uppercase:
        for char2, class_ in zip(ascii_uppercase, classes.keys()):
            if class_counter < len(classes.keys()):
                choices += "(" + char1 + char2 + ") " + class_ + " "
                class_counter += 1
    """
    for char1, class_ in zip(ascii_uppercase, classes.keys()):
        choices += "(" + char1 + ") " + class_ + " "

    for sample in tqdm(test_dataloader):
        truth_l, pred_l = [], []
        text = " ".join(sample["text"][0].replace("\n", "").split())
        encoded_input = question + " \\n " + choices + " \\n " + text
        # num_return_sequences = 5
        # output = zc.run_model(encoded_input, num_beams=20, num_return_sequences=num_return_sequences, do_sample=True)
        output = zc.run_model(encoded_input)
        predicted_class = ""  # guard against generations that match no class label
        for class_ in output:
            if class_ in classes:
                predicted_class = class_
                break
        scores_list = [1 if predicted_class == class_ else 0 for class_ in classes]
        scores = torch.tensor([scores_list])
        truth_l.append(torch.squeeze(sample["labels"]))
        pred_l.append(torch.squeeze(threshold_(scores)))
        initialized_metrics.update_metrics((scores, torch.stack(truth_l), torch.stack(pred_l)))

    initialized_metrics.compute()
    initialized_metrics.log_sacred(_run, 1, "test")
    metrics = initialized_metrics.print()
    print(metrics)
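Note: unlike the NLI pipelines, the generated answer here is mapped to a hard one-hot score vector before the metrics update, so with the "max" threshold the vector is the prediction itself. A toy illustration with hypothetical class labels:

classes = {"Business": 0, "Sci/Tech": 1, "Sports": 2, "World": 3}
predicted_class = "Sports"
scores_list = [1 if predicted_class == class_ else 0 for class_ in classes]
print(scores_list)  # [0, 0, 1, 0]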