
Commit e12436e

feat: add imdb distributed demo for mindnlp Trainer API
1 parent bbfdeb4 commit e12436e

File tree

2 files changed: +122 -0 lines changed
bert_imdb_finetune_cpu_mindnlp_trainer.py: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
#!/usr/bin/env python
# coding: utf-8
"""IMDB sentiment classification fine-tuning demo for the mindnlp Trainer API.

Run on CPU:
    python bert_imdb_finetune_cpu_mindnlp_trainer.py

Run distributed on NPUs:
    bash bert_imdb_finetune_npu_mindnlp_trainer.sh
"""

def main():
    import mindspore
    from mindspore.dataset import transforms
    from mindnlp.engine import Trainer
    from mindnlp.dataset import load_dataset

    imdb_ds = load_dataset('imdb', split=['train', 'test'])
    imdb_train = imdb_ds['train']
    imdb_test = imdb_ds['test']
    imdb_train.get_dataset_size()

    # tokenizer
    from mindnlp.transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

    def process_dataset(dataset, tokenizer, max_seq_len=256, batch_size=32, shuffle=False):
        # Ascend kernels prefer static shapes, so pad every sample to max_seq_len
        # there; on CPU/GPU, padding is deferred to padded_batch below.
        is_ascend = mindspore.get_context('device_target') == 'Ascend'

        def tokenize(text):
            if is_ascend:
                tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_len)
            else:
                tokenized = tokenizer(text, truncation=True, max_length=max_seq_len)
            return tokenized['input_ids'], tokenized['token_type_ids'], tokenized['attention_mask']

        if shuffle:
            dataset = dataset.shuffle(batch_size)

        # map dataset
        dataset = dataset.map(operations=[tokenize], input_columns="text",
                              output_columns=['input_ids', 'token_type_ids', 'attention_mask'])
        dataset = dataset.map(operations=transforms.TypeCast(mindspore.int32),
                              input_columns="label", output_columns="labels")
        # batch dataset
        if is_ascend:
            dataset = dataset.batch(batch_size)
        else:
            dataset = dataset.padded_batch(batch_size,
                                           pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                     'token_type_ids': (None, 0),
                                                     'attention_mask': (None, 0)})

        return dataset

    # split train dataset into train and valid datasets
    imdb_train, imdb_val = imdb_train.split([0.7, 0.3])

    dataset_train = process_dataset(imdb_train, tokenizer, shuffle=True)
    dataset_val = process_dataset(imdb_val, tokenizer)
    dataset_test = process_dataset(imdb_test, tokenizer)

    # sanity check: pull one batch before training
    next(dataset_train.create_tuple_iterator())

    from mindnlp.transformers import AutoModelForSequenceClassification

    # load pretrained BERT with a two-class classification head
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

    from mindnlp.engine import TrainingArguments

    training_args = TrainingArguments(
        output_dir="bert_imdb_finetune_cpu",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=2.0,
        learning_rate=2e-5
    )
    training_args = training_args.set_optimizer(name="adamw", beta1=0.8)  # alternative: OptimizerNames.SGD

    from mindnlp import evaluate
    import numpy as np

    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_train,
        eval_dataset=dataset_val,
        compute_metrics=compute_metrics
    )
    print("Start training")
    trainer.train()

    print("Start checking the test set")
    trainer.evaluate(dataset_test)

if __name__ == '__main__':
    main()
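
For reference, `compute_metrics` above reduces per-example logits to class ids and scores them against the gold labels. A minimal standalone sketch of that argmax-plus-accuracy logic with made-up toy values (plain numpy; no mindnlp or `evaluate` needed):

import numpy as np

# Toy eval_pred: logits for 4 examples over 2 classes, plus gold labels
# (hypothetical values for illustration only).
logits = np.array([[0.2, 1.3],
                   [2.1, -0.4],
                   [0.0, 0.9],
                   [1.5, 1.4]])
labels = np.array([1, 0, 1, 0])

predictions = np.argmax(logits, axis=-1)          # -> array([1, 0, 1, 0])
accuracy = float((predictions == labels).mean())  # -> 1.0
print({"accuracy": accuracy})                     # same shape of result the accuracy metric returns

Note also the trade-off in `process_dataset`: the Ascend branch pays for static shapes by padding every sample to `max_seq_len`, while the CPU/GPU branch defers padding to `padded_batch`, so each batch is only as wide as its longest sample.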
bert_imdb_finetune_npu_mindnlp_trainer.sh: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
#!/bin/bash

echo "=========================================="
echo "Please run the script as: "
echo "bash bert_imdb_finetune_npu_mindnlp_trainer.sh"
echo "=========================================="

EXEC_PATH=$(pwd)
# fetch and unpack the dataset archive once
if [ ! -d "${EXEC_PATH}/data" ]; then
    if [ ! -f "${EXEC_PATH}/emotion_detection.tar.gz" ]; then
        wget https://baidu-nlp.bj.bcebos.com/emotion_detection-dataset-1.0.0.tar.gz -O emotion_detection.tar.gz
    fi
    tar xvf emotion_detection.tar.gz
fi
export DATA_PATH=${EXEC_PATH}/data/

# fresh log directory for this run
rm -rf bert_imdb_finetune_cpu_mindnlp_trainer_npus_same
mkdir bert_imdb_finetune_cpu_mindnlp_trainer_npus_same
echo "start training"

# set MULTI_NPU for the distributed run; unset it for a single-device run
export MULTI_NPU="true"
# unset MULTI_NPU
msrun --worker_num=2 --local_worker_num=2 --master_port=8119 \
    --log_dir=bert_imdb_finetune_cpu_mindnlp_trainer_npus_same --join=True \
    --cluster_time_out=30 bert_imdb_finetune_cpu_mindnlp_trainer_npus_same.py
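
The `MULTI_NPU` environment flag is what flips the demo between single- and multi-device runs (see the `# unset MULTI_NPU` line above). A hypothetical sketch of branching on that flag from Python; the flag name comes from the script, but how mindnlp actually consumes it is not shown in this diff:

import os

# Illustrative only: this MULTI_NPU handling is an assumption,
# not mindnlp's actual internals.
if os.environ.get("MULTI_NPU", "false").lower() == "true":
    print("MULTI_NPU set: msrun supplies the 2-worker distributed context")
else:
    print("MULTI_NPU unset: single-device run")

msrun is MindSpore's distributed launcher: here it starts `--worker_num=2` processes on the local host (`--local_worker_num=2`), writes their logs under `--log_dir`, and with `--join=True` waits for the workers to finish.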
