@@ -1,20 +1,28 @@
 #!/usr/bin/env python
 # coding: utf-8
 """
-python bert_imdb_finetune_cpu_mindnlp_trainer.py
+unset MULTI_NPU && python bert_imdb_finetune_cpu_mindnlp_trainer_npus_same.py
 bash bert_imdb_finetune_npu_mindnlp_trainer.sh
 """
 
+import mindspore
+from mindspore.dataset import transforms
+from mindnlp.engine import Trainer
+from mindnlp.dataset import load_dataset
+
+from mindnlp.accelerate.utils.constants import accelerate_distributed_type
+from mindnlp.accelerate.utils.dataclasses import DistributedType
+
 def main():
-    import mindspore
-    from mindspore.dataset import transforms
-    from mindnlp.engine import Trainer
-    from mindnlp.dataset import load_dataset
+    """demo
 
+    Returns:
+        desc: _description_
+    """
     imdb_ds = load_dataset('imdb', split=['train', 'test'])
     imdb_train = imdb_ds['train']
-    imdb_test = imdb_ds['test']
     imdb_train.get_dataset_size()
+
     from mindnlp.transformers import AutoTokenizer
     # tokenizer
     tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
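Note: the two accelerate imports hoisted to module level above are not referenced in any hunk shown in this diff; presumably code elsewhere in the file branches on the detected distribution mode. A minimal sketch of that pattern, assuming DistributedType exposes a MULTI_NPU member (inferred from the `unset MULTI_NPU` usage line in the docstring, not confirmed by this diff):

    from mindnlp.accelerate.utils.constants import accelerate_distributed_type
    from mindnlp.accelerate.utils.dataclasses import DistributedType

    # Hypothetical branch; the MULTI_NPU member name is an assumption.
    if accelerate_distributed_type == DistributedType.MULTI_NPU:
        print("launched for data-parallel fine-tuning across NPUs")
    else:
        print("launched for single-device fine-tuning")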
@@ -41,15 +49,10 @@ def tokenize(text):
         dataset = dataset.padded_batch(batch_size, pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                              'token_type_ids': (None, 0),
                                                              'attention_mask': (None, 0)})
-
         return dataset
 
-    # split train dataset into train and valid datasets
-    imdb_train, imdb_val = imdb_train.split([0.7, 0.3])
 
     dataset_train = process_dataset(imdb_train, tokenizer, shuffle=True)
-    dataset_val = process_dataset(imdb_val, tokenizer)
-    dataset_test = process_dataset(imdb_test, tokenizer)
 
     next(dataset_train.create_tuple_iterator())
 
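The padded_batch call above pads dynamically: a pad_info entry of (None, value) tells MindSpore to pad that column to the length of the longest sample in the current batch, using value as filler (pad_token_id for input_ids, 0 for the masks). A quick way to see the resulting per-batch shapes (a minimal sketch; variable names follow the diff):

    # Each batch is padded only to its own longest sequence, so the second
    # dimension can differ from batch to batch.
    for batch in dataset_train.create_dict_iterator(num_epochs=1):
        print(batch["input_ids"].shape)  # (batch_size, longest_in_this_batch)
        break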
@@ -62,36 +65,21 @@ def tokenize(text):
 
     training_args = TrainingArguments(
         output_dir="bert_imdb_finetune_cpu",
-        evaluation_strategy="epoch",
         save_strategy="epoch",
         logging_strategy="epoch",
-        load_best_model_at_end=True,
         num_train_epochs=2.0,
         learning_rate=2e-5
     )
-    training_args = training_args.set_optimizer(name="adamw", beta1=0.8)  # OptimizerNames.SGD
-
-    from mindnlp import evaluate
-    import numpy as np
-    metric = evaluate.load("accuracy")
-    def compute_metrics(eval_pred):
-        logits, labels = eval_pred
-        predictions = np.argmax(logits, axis=-1)
-        return metric.compute(predictions=predictions, references=labels)
-
+    training_args = training_args.set_optimizer(name="adamw", beta1=0.8)  # manually specify the optimizer; OptimizerNames.SGD
+
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=dataset_train,
-        eval_dataset=dataset_val,
-        compute_metrics=compute_metrics
     )
     print("Start training")
     trainer.train()
 
-    print("Start checking the test set")
-    trainer.evaluate(dataset_test)
-
 if __name__ == '__main__':
     main()
 
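The surviving set_optimizer(name="adamw", beta1=0.8) call uses the transformers-style TrainingArguments.set_optimizer helper that mindnlp mirrors; the inline comment points at OptimizerNames.SGD as the alternative. A hedged sketch of that swap (parameter names follow the transformers API and are assumed to carry over to mindnlp):

    # Swap AdamW for SGD by name; unspecified fields keep their defaults.
    training_args = training_args.set_optimizer(name="sgd", learning_rate=2e-5)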