
Commit 67327f3

Merge pull request #3 from Tridu33/openmind
fix: fix wrong call time for gradient AllReduce
2 parents c801e6d + c5ef29d commit 67327f3

File tree: 5 files changed (+39 −45 lines)
Lines changed: 16 additions & 28 deletions

@@ -1,20 +1,28 @@
 #!/usr/bin/env python
 # coding: utf-8
 """
-python bert_imdb_finetune_cpu_mindnlp_trainer.py
+unset MULTI_NPU && python bert_imdb_finetune_cpu_mindnlp_trainer_npus_same.py
 bash bert_imdb_finetune_npu_mindnlp_trainer.sh
 """
 
+import mindspore
+from mindspore.dataset import transforms
+from mindnlp.engine import Trainer
+from mindnlp.dataset import load_dataset
+
+from mindnlp.accelerate.utils.constants import accelerate_distributed_type
+from mindnlp.accelerate.utils.dataclasses import DistributedType
+
 def main():
-    import mindspore
-    from mindspore.dataset import transforms
-    from mindnlp.engine import Trainer
-    from mindnlp.dataset import load_dataset
+    """demo
 
+    Returns:
+        desc: _description_
+    """
     imdb_ds = load_dataset('imdb', split=['train', 'test'])
     imdb_train = imdb_ds['train']
-    imdb_test = imdb_ds['test']
     imdb_train.get_dataset_size()
+
     from mindnlp.transformers import AutoTokenizer
     # tokenizer
     tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
@@ -41,15 +49,10 @@ def tokenize(text):
         dataset = dataset.padded_batch(batch_size, pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                              'token_type_ids': (None, 0),
                                                              'attention_mask': (None, 0)})
-
         return dataset
 
-    # split train dataset into train and valid datasets
-    imdb_train, imdb_val = imdb_train.split([0.7, 0.3])
 
     dataset_train = process_dataset(imdb_train, tokenizer, shuffle=True)
-    dataset_val = process_dataset(imdb_val, tokenizer)
-    dataset_test = process_dataset(imdb_test, tokenizer)
 
     next(dataset_train.create_tuple_iterator())
 
@@ -62,36 +65,21 @@ def tokenize(text):
 
     training_args = TrainingArguments(
         output_dir="bert_imdb_finetune_cpu",
-        evaluation_strategy="epoch",
         save_strategy="epoch",
         logging_strategy="epoch",
-        load_best_model_at_end=True,
         num_train_epochs=2.0,
         learning_rate=2e-5
     )
-    training_args = training_args.set_optimizer(name="adamw", beta1=0.8) # OptimizerNames.SGD
-
-    from mindnlp import evaluate
-    import numpy as np
-    metric = evaluate.load("accuracy")
-    def compute_metrics(eval_pred):
-        logits, labels = eval_pred
-        predictions = np.argmax(logits, axis=-1)
-        return metric.compute(predictions=predictions, references=labels)
-
+    training_args = training_args.set_optimizer(name="adamw", beta1=0.8)  # manually specify the optimizer; OptimizerNames.SGD
+
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=dataset_train,
-        eval_dataset=dataset_val,
-        compute_metrics=compute_metrics
     )
     print("Start training")
     trainer.train()
 
-    print("Start checking the test set")
-    trainer.evaluate(dataset_test)
-
 if __name__ == '__main__':
     main()
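
The new module-level header imports accelerate_distributed_type and DistributedType, but none of the hunks shown here use them directly. As a rough illustration of the switch those imports enable, here is a minimal sketch that assumes accelerate_distributed_type mirrors the MULTI_NPU environment toggle; describe_run_mode is a hypothetical helper, not part of this commit.

# Illustrative sketch only: the imports exist in the commit, the helper does not.
from mindnlp.accelerate.utils.constants import accelerate_distributed_type
from mindnlp.accelerate.utils.dataclasses import DistributedType

def describe_run_mode() -> str:
    """Report whether this process runs single-device or data-parallel on NPUs."""
    if accelerate_distributed_type == DistributedType.MULTI_NPU:
        return "data-parallel across NPUs (launched via msrun with MULTI_NPU set)"
    return "single device (MULTI_NPU unset)"

if __name__ == "__main__":
    print(describe_run_mode())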

examples/parallel/bert_imdb_finetune/bert_imdb_finetune_npu_mindnlp_trainer.sh

Lines changed: 4 additions & 3 deletions

@@ -19,7 +19,8 @@ mkdir bert_imdb_finetune_cpu_mindnlp_trainer_npus_same
 echo "start training"
 
 export MULTI_NPU="true"
-# unset MULTI_NPU
-msrun --worker_num=2 --local_worker_num=2 --master_port=8119 \
+export ASCEND_SLOG_PRINT_TO_STDOUT=1
+
+msrun --worker_num=2 --local_worker_num=2 --master_port=8121 \
     --log_dir=bert_imdb_finetune_cpu_mindnlp_trainer_npus_same --join=True \
-    --cluster_time_out=30 bert_imdb_finetune_cpu_mindnlp_trainer_npus_same.py
+    --cluster_time_out=10 bert_imdb_finetune_cpu_mindnlp_trainer_npus_same.py
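
The launcher now prints Ascend runtime logs to stdout (ASCEND_SLOG_PRINT_TO_STDOUT=1), moves to port 8121, and shortens the cluster timeout. For a quick sanity check inside one of the two workers msrun spawns, here is a minimal sketch, assuming Ascend NPUs and that communication is initialized the same way the repository does it (via mindspore.communication.init):

# Hedged sketch: run inside a worker process launched by msrun with MULTI_NPU set.
import mindspore
from mindspore.communication import init, get_rank, get_group_size

mindspore.set_context(device_target="Ascend")  # assumes Ascend NPUs, as in this script
init()                                         # join the collective communication group
print(f"worker {get_rank()} of {get_group_size()} is ready")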

mindnlp/accelerate/accelerator.py

Lines changed: 2 additions & 3 deletions

@@ -1,10 +1,9 @@
 """accelerate"""
 import os
-import mindspore
-import numpy
-
 from contextlib import contextmanager
 from typing import Optional
+
+import mindspore
 from mindspore import nn
 from mindspore.communication import init

mindnlp/dataset/load.py

Lines changed: 2 additions & 2 deletions

@@ -18,12 +18,12 @@
 """
 import os
 from typing import Union, Optional, Dict, Sequence, Mapping
-from mindspore.dataset import GeneratorDataset
 from datasets import load_dataset as hf_load
 from datasets import Dataset, IterableDataset, Split, Features, \
     DownloadConfig, DownloadMode, VerificationMode, Version
-from mindnlp.configs import DEFAULT_ROOT
+from mindspore.dataset import GeneratorDataset
 from mindspore.communication import get_rank, get_group_size
+from mindnlp.configs import DEFAULT_ROOT
 from ..accelerate import DistributedType
 from ..accelerate.utils import accelerate_distributed_type
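
Beyond grouping the MindSpore and mindnlp imports, get_rank and get_group_size point at the pattern load_dataset relies on under MULTI_NPU: shard the converted dataset so each rank reads a distinct slice. A hedged sketch of that pattern follows; the wrapper name and column handling are illustrative and not the actual body of load.py.

# Illustrative sketch of rank-aware sharding; not the exact code in mindnlp/dataset/load.py.
from mindspore.dataset import GeneratorDataset
from mindspore.communication import get_rank, get_group_size
from mindnlp.accelerate import DistributedType
from mindnlp.accelerate.utils import accelerate_distributed_type

def to_ms_dataset(source, column_names):
    """Wrap a Python iterable as a GeneratorDataset; under MULTI_NPU, shard it so
    each rank sees a distinct, non-overlapping slice of the data."""
    if accelerate_distributed_type == DistributedType.MULTI_NPU:
        return GeneratorDataset(source, column_names=column_names,
                                num_shards=get_group_size(), shard_id=get_rank())
    return GeneratorDataset(source, column_names=column_names)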

mindnlp/engine/trainer/base.py

Lines changed: 15 additions & 9 deletions

@@ -1377,6 +1377,20 @@ def _prepare_inputs(self, inputs: Dict[str, Union[mindspore.Tensor, Any]]) -> Di
 
         return inputs
 
+
+    def update_gradient_by_distributed_type(self, model: nn.Module) -> None:
+        """update gradient by distributed_type"""
+        if accelerate_distributed_type == DistributedType.NO:
+            return
+        if accelerate_distributed_type == DistributedType.MULTI_NPU:
+            from mindspore.communication import get_group_size
+            from mindspore.communication.comm_func import all_reduce
+            rank_size = get_group_size()
+            for parameter in model.parameters():
+                new_grads_mean = all_reduce(parameter.grad) / rank_size
+                parameter.grad = new_grads_mean
+
+
     def training_step(self, model: nn.Module, inputs: Dict[str, Union[mindspore.Tensor, Any]]) -> Tuple[List[mindspore.Tensor], mindspore.Tensor]:
         """
         Perform a training step on a batch of inputs.
@@ -1399,14 +1413,6 @@ def training_step(self, model: nn.Module, inputs: Dict[str, Union[mindspore.Tens
         inputs = self._prepare_inputs(inputs)
 
         def forward(inputs):
-            if accelerate_distributed_type == DistributedType.MULTI_NPU:
-                from mindspore.communication import get_group_size
-                import mindspore.ops as msops
-                rank_size = get_group_size()
-                for parameter in model.parameters():
-                    all_reduce_sum = msops.AllReduce(msops.ReduceOp.SUM)
-                    new_grads_mean = all_reduce_sum(parameter.grad) / rank_size
-                    parameter.grad = new_grads_mean
             return self.compute_loss(model, inputs)
 
         if getattr(self, 'grad_fn', None) is None or self.model_reload:
@@ -1416,7 +1422,7 @@ def forward(inputs):
             self.grad_fn = value_and_grad(forward, weights, attach_grads=True)
 
         loss = self.grad_fn(inputs)
-
+        self.update_gradient_by_distributed_type(model)
         return loss / self.args.gradient_accumulation_steps
 
     def compute_loss(self, model, inputs, return_outputs=False):
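
This is the substance of the fix. Previously the AllReduce sat inside the forward closure, so it executed during the forward pass, before value_and_grad had produced the current step's gradients; that is the "wrong call time" named in the commit title. The new update_gradient_by_distributed_type helper runs right after self.grad_fn(inputs), once parameter.grad holds this step's gradients, and it uses the functional mindspore.communication.comm_func.all_reduce rather than building an ops.AllReduce primitive per parameter. Below is a standalone sketch of the corrected ordering, assuming (as the Trainer does) that value_and_grad(..., attach_grads=True) writes parameter.grad; the helper names are illustrative, not the Trainer API.

# Hedged sketch of the corrected call order; not the Trainer implementation itself.
from mindspore.communication import get_group_size
from mindspore.communication.comm_func import all_reduce

def sync_and_scale_gradients(model):
    """Average each parameter's gradient across data-parallel ranks.
    Call this only after the backward pass has attached gradients."""
    rank_size = get_group_size()
    for parameter in model.parameters():
        parameter.grad = all_reduce(parameter.grad) / rank_size

def train_one_step(model, grad_fn, inputs):
    loss = grad_fn(inputs)           # forward + backward: parameter.grad is populated here
    sync_and_scale_gradients(model)  # correct call time: this step's gradients now exist
    return loss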
