asv-ml / mlmc · Commits

Commit 8171ddf4, authored Oct 16, 2021 by Janos Borst

    finetune stuff

Parent: eb6fe095
Pipeline #49996 passed in 9 minutes and 57 seconds
Changes: 2 · Pipelines: 1
mlmc/models/abstracts/abstract_textclassification.py
import torch
import transformers
from ignite.metrics import Average
from tqdm import tqdm
...
@@ -713,3 +714,15 @@ class TextClassificationAbstract(torch.nn.Module):
        self.set_loss(loss)
        self._config.update(**kwargs)

    def finetune_lm(self, file, epochs=1, batch_size=8, valid=0.1):
        import subprocess, pathlib, tempfile, os, sys
        # Hide all GPUs from the subprocess when running on CPU; otherwise
        # expose only this model's device index (e.g. "cuda:1" -> "1").
        my_env = os.environ.copy()
        my_env["CUDA_VISIBLE_DEVICES"] = "" if self.device == "cpu" else str(self.device).split(":")[-1]
        cmd = pathlib.Path(__file__).parents[0] / "pretrain-language-model.py"
        with tempfile.TemporaryDirectory() as f:
            # Run the masked-LM finetuning script on the raw text file ...
            subprocess.call([sys.executable, cmd,
                             "--model", self.representation,
                             "--file", str(file),
                             "--output", str(f),
                             "--epochs", str(epochs),
                             "--batch_size", str(batch_size),
                             "--valid_fraction", str(valid)],
                            env=my_env)
            # ... then reload the finetuned weights as this model's embedding
            # while the temporary output directory still exists.
            self.embedding = transformers.AutoModel.from_pretrained(f + "/model")
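For context, a minimal usage sketch of the new method. The classifier class and its constructor arguments below are assumptions for illustration (any subclass of TextClassificationAbstract whose representation attribute holds a Hugging Face model name should behave the same way); the corpus file name is hypothetical.

import mlmc

# Assumed constructor signature, not confirmed by this diff:
clf = mlmc.models.KimCNN(
    classes={"negative": 0, "positive": 1},
    representation="bert-base-uncased",
)

# Runs pretrain-language-model.py in a subprocess on the raw text file,
# then reloads the finetuned weights into clf.embedding.
clf.finetune_lm("domain_corpus.txt", epochs=1, batch_size=8, valid=0.1)

Because CUDA_VISIBLE_DEVICES is derived from self.device before the subprocess starts, the finetuning runs on the same device as the classifier itself.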
mlmc/models/abstracts/pretrain-language-model.py (new file, mode 100644)
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, LineByLineTextDataset
import argparse
import pathlib
from transformers import Trainer, TrainingArguments
import numpy as np
from copy import deepcopy

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Basic version of finetuning a language model on domain-specific data.')
    parser.add_argument('--model', metavar='model', type=str, help='Model name')
    parser.add_argument('--file', dest='file', help='Raw text input file')
    parser.add_argument('--output', dest='output', help='Output directory for checkpoints and the final model')
    parser.add_argument('--epochs', dest='epochs', type=int, help='Number of training epochs')
    parser.add_argument('--batch_size', dest='batch_size', type=int, help='Training batch size')
    parser.add_argument('--valid_fraction', dest='valid', type=float, help='Fraction of data to be used as validation data')
    args = parser.parse_args()

    repr = args.model
    print("Language Model Pretraining.")

    # Load the masked-language-modelling variant of the model plus its tokenizer.
    model = AutoModelForMaskedLM.from_pretrained(repr)
    tok = AutoTokenizer.from_pretrained(repr)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=True, mlm_probability=0.15)
    # One training example per non-empty line of the input file.
    dataset = LineByLineTextDataset(tokenizer=tok, file_path=args.file, block_size=512)

    # Random train/validation split over the line-level examples.
    fraction = args.valid
    ind = list(range(len(dataset)))
    np.random.shuffle(ind)
    n_samples = int((1 - fraction) * len(dataset))
    train = deepcopy(dataset)
    test = deepcopy(dataset)
    train.examples = [dataset.examples[i] for i in ind[:n_samples]]
    # Hold out the remaining fraction of shuffled indices for validation.
    test.examples = [dataset.examples[i] for i in ind[n_samples:]]

    training_args = TrainingArguments(
        output_dir=args.output,
        overwrite_output_dir=True,
        num_train_epochs=args.epochs,
        per_gpu_train_batch_size=args.batch_size,
        save_steps=10_000,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train,
        eval_dataset=test
    )
    trainer.train()
    trainer.save_model(str(pathlib.Path(args.output) / "model"))
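Since the script builds its dataset with LineByLineTextDataset, every non-empty line of --file becomes one training example (truncated to block_size=512 tokens). A minimal sketch of preparing such a corpus file; the document list and file name are hypothetical:

# Hypothetical preparation step: write one document per line so that
# LineByLineTextDataset picks up each document as a single example.
docs = [
    "First domain-specific document ...",
    "Second document\nwith internal line breaks",
]

with open("domain_corpus.txt", "w", encoding="utf-8") as fh:
    for doc in docs:
        # Collapse internal whitespace and newlines so each document
        # stays on a single line.
        fh.write(" ".join(doc.split()) + "\n")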