Commit c801e6d

Merge pull request #2 from Tridu33/openmind
feat: add imdb distributed demo for mindnlp Trainer API
2 parents 9a4b724 + e12436e commit c801e6d

File tree: 9 files changed, +172 −44 lines
Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@

#!/usr/bin/env python
# coding: utf-8
"""Usage:
    CPU / single device:        python bert_imdb_finetune_cpu_mindnlp_trainer.py
    Multi-NPU (data parallel):  bash bert_imdb_finetune_npu_mindnlp_trainer.sh
"""

def main():
    import mindspore
    from mindspore.dataset import transforms
    from mindnlp.engine import Trainer
    from mindnlp.dataset import load_dataset

    imdb_ds = load_dataset('imdb', split=['train', 'test'])
    imdb_train = imdb_ds['train']
    imdb_test = imdb_ds['test']
    imdb_train.get_dataset_size()

    from mindnlp.transformers import AutoTokenizer
    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

    def process_dataset(dataset, tokenizer, max_seq_len=256, batch_size=32, shuffle=False):
        is_ascend = mindspore.get_context('device_target') == 'Ascend'

        def tokenize(text):
            # On Ascend, pad every sample to max_seq_len so batches have static shapes;
            # otherwise truncate only and pad per batch below.
            if is_ascend:
                tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_len)
            else:
                tokenized = tokenizer(text, truncation=True, max_length=max_seq_len)
            return tokenized['input_ids'], tokenized['token_type_ids'], tokenized['attention_mask']

        if shuffle:
            dataset = dataset.shuffle(batch_size)

        # map dataset
        dataset = dataset.map(operations=[tokenize], input_columns="text",
                              output_columns=['input_ids', 'token_type_ids', 'attention_mask'])
        dataset = dataset.map(operations=transforms.TypeCast(mindspore.int32),
                              input_columns="label", output_columns="labels")
        # batch dataset
        if is_ascend:
            dataset = dataset.batch(batch_size)
        else:
            dataset = dataset.padded_batch(batch_size, pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                                 'token_type_ids': (None, 0),
                                                                 'attention_mask': (None, 0)})

        return dataset

    # split train dataset into train and valid datasets
    imdb_train, imdb_val = imdb_train.split([0.7, 0.3])

    dataset_train = process_dataset(imdb_train, tokenizer, shuffle=True)
    dataset_val = process_dataset(imdb_val, tokenizer)
    dataset_test = process_dataset(imdb_test, tokenizer)

    # peek at one batch to verify the pipeline
    next(dataset_train.create_tuple_iterator())

    from mindnlp.transformers import AutoModelForSequenceClassification

    # set bert config and define parameters for training
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

    from mindnlp.engine import TrainingArguments

    training_args = TrainingArguments(
        output_dir="bert_imdb_finetune_cpu",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=2.0,
        learning_rate=2e-5
    )
    training_args = training_args.set_optimizer(name="adamw", beta1=0.8)  # e.g. OptimizerNames.SGD is another option

    from mindnlp import evaluate
    import numpy as np
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_train,
        eval_dataset=dataset_val,
        compute_metrics=compute_metrics
    )
    print("Start training")
    trainer.train()

    print("Start checking the test set")
    trainer.evaluate(dataset_test)

if __name__ == '__main__':
    main()
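
For clarity on what the `compute_metrics` hook above receives: as the unpacking in the demo shows, the evaluation loop hands it a `(logits, labels)` pair of array-like values. A minimal standalone check of the same accuracy logic, using plain NumPy instead of the `evaluate` metric purely for illustration:

import numpy as np

# Fake eval outputs for a 2-class head, mirroring the shapes compute_metrics sees.
logits = np.array([[0.2, 0.8], [0.9, 0.1], [0.4, 0.6]])
labels = np.array([1, 0, 0])

predictions = np.argmax(logits, axis=-1)          # -> [1, 0, 1]
accuracy = float((predictions == labels).mean())  # 2 of 3 correct
print(f"accuracy: {accuracy:.3f}")                # accuracy: 0.667
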
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@

#!/bin/bash

echo "=========================================="
echo "Please run the script as:"
echo "bash bert_imdb_finetune_npu_mindnlp_trainer.sh"
echo "=========================================="

EXEC_PATH=$(pwd)
if [ ! -d "${EXEC_PATH}/data" ]; then
    if [ ! -f "${EXEC_PATH}/emotion_detection.tar.gz" ]; then
        wget https://baidu-nlp.bj.bcebos.com/emotion_detection-dataset-1.0.0.tar.gz -O emotion_detection.tar.gz
    fi
    tar xvf emotion_detection.tar.gz
fi
export DATA_PATH=${EXEC_PATH}/data/

rm -rf bert_imdb_finetune_cpu_mindnlp_trainer_npus_same
mkdir bert_imdb_finetune_cpu_mindnlp_trainer_npus_same
echo "start training"

export MULTI_NPU="true"
# unset MULTI_NPU to fall back to single-process training
msrun --worker_num=2 --local_worker_num=2 --master_port=8119 \
    --log_dir=bert_imdb_finetune_cpu_mindnlp_trainer_npus_same --join=True \
    --cluster_time_out=30 bert_imdb_finetune_cpu_mindnlp_trainer_npus_same.py
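
Each msrun worker above runs the same Python entry point. As a hedged sketch, a worker can confirm its distributed context once the MULTI_NPU path has initialized communication (`get_rank` and `get_group_size` are standard mindspore.communication helpers; this only works inside a launcher-started worker after `communication.init()` has run):

# Only meaningful inside an msrun-launched worker after communication.init(),
# which the MULTI_NPU detection path added in this commit performs.
from mindspore.communication import get_rank, get_group_size

print(f"worker {get_rank()} of {get_group_size()}")
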

mindnlp/accelerate/accelerator.py

Lines changed: 3 additions & 10 deletions
@@ -105,20 +105,13 @@ def prepare(self, *args):
         """
         result = []
-        # Only support mindformers and MULTI_NPU_DP now
+        # Only support mindformers and MULTI_NPU now
         if self.distributed_type == DistributedType.MINDFORMERS:
             result = self._prepare_mindformers(*args)
-        elif self.distributed_type == DistributedType.MULTI_NPU_DP:
-            result = self._prepare_data_parallel_native_minspore(*args)
+        elif self.distributed_type == DistributedType.MULTI_NPU:
+            pass  # nothing to prepare for data parallel
         return result

-    def _prepare_data_parallel_native_minspore(self, *args):
-        # initialize data parallel for native mindspore
-        mindspore.set_context(mode=mindspore.GRAPH_MODE)
-        mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True)
-        mindspore.communication.init()
-        mindspore.set_seed(numpy.random.seed())
-
     def _prepare_mindformers(self, *args):
         mindformers_plugin = self.state.mindformers_plugin

mindnlp/accelerate/state.py

Lines changed: 4 additions & 4 deletions
@@ -347,9 +347,9 @@ def _prepare_backend(self):
         if accelerate_distributed_type == DistributedType.MINDFORMERS and is_mindformers_available():
             self.backend = "hccl"
             self.distributed_type = DistributedType.MINDFORMERS
-        elif accelerate_distributed_type == DistributedType.MULTI_NPU_DP:
+        elif accelerate_distributed_type == DistributedType.MULTI_NPU:
             self.backend = "hccl"
-            self.distributed_type = DistributedType.MULTI_NPU_DP
+            self.distributed_type = DistributedType.MULTI_NPU

     @num_processes.setter
     def num_processes(self, value):
@@ -372,8 +372,8 @@ def __init__(self, mindformers_plugin=None, **kwargs):
         PartialState(**kwargs)
         self.__dict__.update(PartialState._shared_state)
         # set distributed_type
-        if accelerate_distributed_type == DistributedType.MULTI_NPU_DP:
-            self.distributed_type = DistributedType.MULTI_NPU_DP
+        if accelerate_distributed_type == DistributedType.MULTI_NPU:
+            self.distributed_type = DistributedType.MULTI_NPU
         elif accelerate_distributed_type == DistributedType.MINDFORMERS:
             self.distributed_type = DistributedType.MINDFORMERS
         self.mindformers_plugin = mindformers_plugin

mindnlp/accelerate/utils/constants.py

Lines changed: 14 additions & 2 deletions
@@ -1,7 +1,18 @@
 """constants"""
 import os
+import mindspore
+import numpy
 from .dataclasses import DistributedType


+def _prepare_data_parallel_native_minspore():
+    # initialize the HCCL data-parallel backend for the data loader and the Trainer API
+    mindspore.set_context(mode=mindspore.GRAPH_MODE)
+    mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True)
+    mindspore.communication.init()
+    random_seed = numpy.random.randint(10000)
+    mindspore.set_seed(random_seed)
+
 def detect_accelerate_distributed_type():
     """
     detect distributed_type
@@ -10,8 +21,9 @@ def detect_accelerate_distributed_type():
     _type_: Based on factors such as the parallel software and hardware environment available on the current system
         and the user-specified parallel scheme, the optimal parallel strategy is decided for each situation.
     """
-    if os.environ.get("MULTI_NPU_DP", None) == "true":
-        return DistributedType.MULTI_NPU_DP
+    if os.environ.get("MULTI_NPU", None) == "true":
+        _prepare_data_parallel_native_minspore()
+        return DistributedType.MULTI_NPU
     if os.environ.get("ACCELERATE_USE_MINDFORMERS", "false") == "true":
         return DistributedType.MINDFORMERS
     else:
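
A hedged usage sketch of the new detection path (module paths inferred from the file locations in this commit; note that with MULTI_NPU=true the helper calls `mindspore.communication.init()`, so this only succeeds inside a worker started by a distributed launcher such as msrun):

import os
from mindnlp.accelerate.utils.constants import detect_accelerate_distributed_type
from mindnlp.accelerate.utils.dataclasses import DistributedType

os.environ["MULTI_NPU"] = "true"  # same flag the launch script exports
# Sets GRAPH_MODE, DATA_PARALLEL auto-parallel context, HCCL init, and a random seed,
# then reports the detected mode.
dist_type = detect_accelerate_distributed_type()
assert dist_type == DistributedType.MULTI_NPU
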

mindnlp/accelerate/utils/dataclasses.py

Lines changed: 2 additions & 2 deletions
@@ -18,10 +18,10 @@ class DistributedType(str, enum.Enum):
     Values:
         - **MINDFORMERS** -- Using mindformers
         - **NO** -- Not a distributed environment, just a single process.
-        - **MULTI_NPU_DP** -- Distributed data parallel on multiple NPUs.
+        - **MULTI_NPU** -- Distributed data parallel on multiple NPUs.
     """

-    MULTI_NPU_DP = "MULTI_NPU_DP"
+    MULTI_NPU = "MULTI_NPU"
     MINDFORMERS = "MINDFORMERS"
     NO = "NO"
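
Because `DistributedType` subclasses both `str` and `enum.Enum`, the renamed member still compares equal to its string value. A small illustration, assuming the module path implied by the file above:

from mindnlp.accelerate.utils.dataclasses import DistributedType

# str-based enum: members compare equal to their string values.
assert DistributedType.MULTI_NPU == "MULTI_NPU"
assert DistributedType.NO == "NO"
print(list(DistributedType))  # three members: MULTI_NPU, MINDFORMERS, NO
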

mindnlp/dataset/load.py

Lines changed: 1 addition & 1 deletion
@@ -335,7 +335,7 @@ def load_dataset(
     column_names = list(raw_ds.features.keys())
     source = TransferDataset(raw_ds, column_names) if isinstance(raw_ds, Dataset) \
         else TransferIterableDataset(raw_ds, column_names)
-    if accelerate_distributed_type == DistributedType.MULTI_NPU_DP:
+    if accelerate_distributed_type == DistributedType.MULTI_NPU:
         ms_ds = GeneratorDataset(source=source,
                                  column_names=column_names,
                                  shuffle=shuffle,
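
From the demo's perspective the call site is unchanged; the same `load_dataset` call simply takes the `MULTI_NPU` branch above once that mode has been detected (a sketch; any per-rank sharding arguments beyond this hunk are not visible in the diff):

from mindnlp.dataset import load_dataset

# Identical to the call in the demo script; under MULTI_NPU the GeneratorDataset
# branch shown above is used, otherwise the previous behavior applies.
imdb_ds = load_dataset('imdb', split=['train', 'test'])
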

mindnlp/engine/trainer/base.py

Lines changed: 26 additions & 3 deletions
@@ -90,7 +90,6 @@
     TrainerControl,
     TrainerState,
 )
-from ..utils import _get_learning_rate


 logger = logging.get_logger(__name__)
@@ -126,7 +125,7 @@ def _is_peft_model(model):
 class Trainer:
     """
     Trainer is a simple but feature-complete training and eval loop for MindSpore, optimized for 🤗 Transformers.
-     """
+    """
     def __init__(
         self,
         model: Union[PreTrainedModel, nn.Module] = None,
@@ -288,6 +287,29 @@ def __init__(
         self._created_lr_scheduler = False
         self.actual_distributed_type = accelerate_distributed_type

+
+    def _get_learning_rate(self):
+        r"""
+        This function retrieves the learning rate used by the optimizer.
+
+        Args:
+            self: An instance of the class containing the optimizer and learning rate scheduler.
+
+        Returns:
+            The learning rate value (float) used by the optimizer.
+
+        Raises:
+            None.
+        """
+        if isinstance(self.lr_scheduler, optim.lr_scheduler.ReduceLROnPlateau):
+            last_lr = self.optimizer.param_groups[0]["lr"]
+        else:
+            last_lr = self.lr_scheduler.get_last_lr()[0]
+        if ops.is_tensor(last_lr):
+            last_lr = last_lr.item()
+        return last_lr
+
+
     def _activate_neftune(self, model):
         r"""
         Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper:
@@ -1136,6 +1158,7 @@ def _inner_training_loop(
                     model.parameters(),
                     args.max_grad_norm,
                 )
+
                 # Optimizer step
                 self.optimizer.step()

@@ -1376,7 +1399,7 @@ def training_step(self, model: nn.Module, inputs: Dict[str, Union[mindspore.Tens
         inputs = self._prepare_inputs(inputs)

         def forward(inputs):
-            if accelerate_distributed_type == DistributedType.MULTI_NPU_DP:
+            if accelerate_distributed_type == DistributedType.MULTI_NPU:
                 from mindspore.communication import get_group_size
                 import mindspore.ops as msops
                 rank_size = get_group_size()
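
Since `_get_learning_rate` now lives on `Trainer` itself rather than being imported from `..utils`, the current learning rate can be read off the trainer instance after training. A hedged sketch continuing from the demo script (the leading underscore marks it as internal API):

# Continuing from the demo: after trainer.train() has run at least one step.
last_lr = trainer._get_learning_rate()  # from the lr_scheduler, or the optimizer for ReduceLROnPlateau
print(f"final learning rate: {last_lr}")
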

mindnlp/engine/utils.py

Lines changed: 0 additions & 22 deletions
@@ -505,28 +505,6 @@ def speed_metrics(split, start_time, num_samples=None, num_steps=None, num_token
         result[f"{split}_tokens_per_second"] = round(tokens_per_second, 3)
     return result

-def _get_learning_rate(self):
-    r"""
-    This function retrieves the learning rate used by the optimizer.
-
-    Args:
-        self: An instance of the class containing the optimizer and learning rate scheduler.
-
-    Returns:
-        The learning rate value (float) used by the optimizer.
-
-    Raises:
-        None.
-    """
-    if isinstance(self.lr_scheduler, optim.lr_scheduler.ReduceLROnPlateau):
-        last_lr = self.optimizer.param_groups[0]["lr"]
-    else:
-        last_lr = self.lr_scheduler.get_last_lr()[0]
-    if ops.is_tensor(last_lr):
-        last_lr = last_lr.item()
-    return last_lr
-
-
 def find_batch_size(tensors):
     """
     Find the first dimension of a tensor in a nested list/tuple/dict of tensors.
