Commit 59fab8b

feat: add _actual_distributed_type constant to decide parallel DistributedType, add data parallel of native mindspore to mindnlp.Trainer.base
1 parent 791e2df commit 59fab8b

File tree: 15 files changed, +110 -106 lines changed


mindnlp/accelerate/__init__.py
Lines changed: 2 additions & 0 deletions

@@ -6,6 +6,8 @@
     # DeepSpeedPlugin,
     # DistributedDataParallelKwargs,
     # FullyShardedDataParallelPlugin,
+    accelerate_distributed_type,
+    DistributedType,
     # GradScalerKwargs,
     # InitProcessGroupKwargs,
     # ProfileKwargs,
mindnlp/accelerate/accelerator.py
Lines changed: 5 additions & 4 deletions

@@ -14,7 +14,8 @@
     is_mindformers_available,
     wait_for_everyone
 )
-from ..utils import _actual_distributed_type, logging, DistributedType
+from .utils import DistributedType,accelerate_distributed_type
+from ..utils import logging
 
 if is_mindformers_available():
     from .utils import (
@@ -45,7 +46,7 @@ def __init__(
         # init mindformers_plugin from env variables
         if mindformers_plugin is None:
             mindformers_plugin = (
-                MindFormersPlugin() if _actual_distributed_type == DistributedType.MINDFORMERS else None
+                MindFormersPlugin() if accelerate_distributed_type == DistributedType.MINDFORMERS else None
             )
         else:
             os.environ["ACCELERATE_USE_MINDFORMERS"] = "true"
@@ -104,10 +105,10 @@ def prepare(self, *args):
         """
         result = []
 
-        # Only support mindsormers and MULTI_NPU_DATA_PARALLEL now
+        # Only support mindsormers and MULTI_NPU_DP now
         if self.distributed_type == DistributedType.MINDFORMERS:
             result = self._prepare_mindformers(*args)
-        elif self.distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
+        elif self.distributed_type == DistributedType.MULTI_NPU_DP:
             result = self._prepare_data_parallel_native_minspore(*args)
         return result
 
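The prepare() dispatch above is driven entirely by the module-level accelerate_distributed_type constant that this commit exports. A minimal illustrative sketch of how that selection looks from user code, using only the names added in this commit (the MULTI_NPU_DP variable must be set before mindnlp.accelerate is first imported):

import os
os.environ["MULTI_NPU_DP"] = "true"   # read once, at import time

from mindnlp.accelerate import accelerate_distributed_type, DistributedType

if accelerate_distributed_type == DistributedType.MULTI_NPU_DP:
    # Accelerator.prepare() routes to _prepare_data_parallel_native_minspore()
    print("native MindSpore data parallel")
elif accelerate_distributed_type == DistributedType.MINDFORMERS:
    # Accelerator.prepare() routes to _prepare_mindformers()
    print("MindFormers parallel")
else:
    print("single process, nothing to prepare")
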
mindnlp/accelerate/state.py
Lines changed: 7 additions & 7 deletions

@@ -13,7 +13,7 @@
 from .utils import (
     is_mindformers_available
 )
-from ..utils import _actual_distributed_type, DistributedType
+from ..accelerate.utils import accelerate_distributed_type, DistributedType
 
 SharedDict = dict
 
@@ -344,12 +344,12 @@ def print(self, *args, **kwargs):
 
     def _prepare_backend(self):
         # now mindformers and mindspore data parallel only
-        if _actual_distributed_type == DistributedType.MINDFORMERS and is_mindformers_available():
+        if accelerate_distributed_type == DistributedType.MINDFORMERS and is_mindformers_available():
             self.backend = "hccl"
             self.distributed_type = DistributedType.MINDFORMERS
-        elif _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
+        elif accelerate_distributed_type == DistributedType.MULTI_NPU_DP:
             self.backend = "hccl"
-            self.distributed_type = DistributedType.MULTI_NPU_DATA_PARALLEL
+            self.distributed_type = DistributedType.MULTI_NPU_DP
 
     @num_processes.setter
     def num_processes(self, value):
@@ -372,9 +372,9 @@ def __init__(self, mindformers_plugin=None, **kwargs):
         PartialState(**kwargs)
         self.__dict__.update(PartialState._shared_state)
         # set distributed_type
-        if _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
-            self.distributed_type = DistributedType.MULTI_NPU_DATA_PARALLEL
-        elif _actual_distributed_type == DistributedType.MINDFORMERS:
+        if accelerate_distributed_type == DistributedType.MULTI_NPU_DP:
+            self.distributed_type = DistributedType.MULTI_NPU_DP
+        elif accelerate_distributed_type == DistributedType.MINDFORMERS:
             self.distributed_type = DistributedType.MINDFORMERS
             self.mindformers_plugin = mindformers_plugin
         else:
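Both branches resolve to the "hccl" backend, but _prepare_backend() only records that choice; the worker processes still have to be launched as a MindSpore distributed job and initialize the communication group themselves. A hedged sketch of the usual per-worker setup for the MULTI_NPU_DP case, using standard MindSpore APIs rather than code from this commit:

from mindspore import context
from mindspore.context import ParallelMode
from mindspore.communication import init, get_rank, get_group_size

# bring up the HCCL collective-communication backend on this NPU worker
init("hccl")

# native data-parallel mode; gradients_mean matches the mean taken after AllReduce
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                  gradients_mean=True)

print(f"worker {get_rank()} of {get_group_size()} is ready")
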
mindnlp/accelerate/utils/__init__.py
Lines changed: 2 additions & 0 deletions

@@ -1,5 +1,7 @@
 """accelerate utils"""
+from .constants import accelerate_distributed_type
 from .dataclasses import (
+    DistributedType,
     MindFormersPlugin
 )
 from .environment import (
mindnlp/accelerate/utils/constants.py
Lines changed: 21 additions & 0 deletions

@@ -0,0 +1,21 @@
+"""constants"""
+import os
+from .dataclasses import DistributedType
+
+def detect_accelerate_distributed_type():
+    """
+    detect distributed_type
+
+    Returns:
+        _type_: According to the factors such as the available parallel software and hardware environment of the current system and the user-specified parallel scheme,
+        the optimal parallel strategy is comprehensively decided in different situations.
+    """
+    if os.environ.get("MULTI_NPU_DP", None) == "true":
+        return DistributedType.MULTI_NPU_DP
+    if os.environ.get("ACCELERATE_USE_MINDFORMERS", "false") == "true":
+        return DistributedType.MINDFORMERS
+    else:
+        return DistributedType.NO
+
+accelerate_distributed_type = detect_accelerate_distributed_type()
+
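Because the MULTI_NPU_DP check runs first, it wins over ACCELERATE_USE_MINDFORMERS when both variables are set, and the result is frozen into accelerate_distributed_type at import time. A small illustrative check of that ordering, assuming only the module above:

import os

# with both flags set, the MULTI_NPU_DP branch is evaluated first and wins
os.environ["MULTI_NPU_DP"] = "true"
os.environ["ACCELERATE_USE_MINDFORMERS"] = "true"

from mindnlp.accelerate.utils import accelerate_distributed_type, DistributedType
from mindnlp.accelerate.utils.constants import detect_accelerate_distributed_type

assert detect_accelerate_distributed_type() == DistributedType.MULTI_NPU_DP
# the module-level constant was fixed when the module was first imported and
# does not change if the environment variables are modified afterwards
print(accelerate_distributed_type)
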
mindnlp/accelerate/utils/dataclasses.py
Lines changed: 15 additions & 0 deletions

@@ -11,6 +11,21 @@
 )
 
 
+class DistributedType(str, enum.Enum):
+    """
+    Represents a type of distributed environment.
+
+    Values:
+        - **MINDFORMERS** -- Using mindformers
+        - **NO** -- Not a distributed environment, just a single process.
+        - **MULTI_NPU_DP** -- Distributed data parallel on multiple NPUs.
+    """
+
+    MULTI_NPU_DP = "MULTI_NPU_DP"
+    MINDFORMERS = "MINDFORMERS"
+    NO = "NO"
+
+
 @dataclass
 class MindFormersPlugin:
     """
mindnlp/dataset/load.py
Lines changed: 7 additions & 6 deletions

@@ -23,8 +23,10 @@
 from datasets import Dataset, IterableDataset, Split, Features, \
     DownloadConfig, DownloadMode, VerificationMode, Version
 from mindnlp.configs import DEFAULT_ROOT
-from ..utils.constants import _actual_distributed_type
-from ..utils.dataclasses import DistributedType
+from mindspore.communication import get_rank, get_group_size
+from ..accelerate import DistributedType
+from ..accelerate.utils import accelerate_distributed_type
+
 
 class TransferIterableDataset():
     """TransferDataset for Huggingface Dataset."""
@@ -333,20 +335,19 @@ def load_dataset(
         column_names = list(raw_ds.features.keys())
         source = TransferDataset(raw_ds, column_names) if isinstance(raw_ds, Dataset) \
             else TransferIterableDataset(raw_ds, column_names)
-        ms_ds = ms_ds
-        if _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
-            from mindspore.communication import get_rank, get_group_size
+        if accelerate_distributed_type == DistributedType.MULTI_NPU_DP:
             ms_ds = GeneratorDataset(source=source,
                                      column_names=column_names,
                                      shuffle=shuffle,
                                      num_parallel_workers=num_proc if num_proc else 1,
                                      num_shards=get_group_size(), shard_id=get_rank())
+            datasets_dict[key] = ms_ds
         else:
             ms_ds = GeneratorDataset(source=source,
                                      column_names=column_names,
                                      shuffle=shuffle,
                                      num_parallel_workers=num_proc if num_proc else 1)
-        datasets_dict[key] = ms_ds
+            datasets_dict[key] = ms_ds
 
     if len(datasets_dict) == 1:
         return datasets_dict.popitem()[1]
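In the MULTI_NPU_DP branch each worker constructs its GeneratorDataset with num_shards and shard_id, so it only ever reads its own 1/N slice of the data. A standalone sketch of the same sharding pattern with plain MindSpore APIs; the source list below is a stand-in for the converted Hugging Face dataset, not code from load.py:

import mindspore.dataset as ds
from mindspore.communication import init, get_rank, get_group_size

init("hccl")  # assumes the script was launched as a MindSpore distributed job

# stand-in source; load_dataset() wraps the Hugging Face dataset the same way
source = [(i, i * i) for i in range(1000)]

sharded = ds.GeneratorDataset(source=source,
                              column_names=["x", "y"],
                              shuffle=True,
                              num_parallel_workers=1,
                              num_shards=get_group_size(),  # number of data-parallel workers
                              shard_id=get_rank())          # this worker's slice

print(sharded.get_dataset_size())  # roughly 1000 / get_group_size()
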
mindnlp/engine/trainer/base.py
Lines changed: 11 additions & 3 deletions

@@ -45,8 +45,8 @@
     WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
 from ...dataset import BaseMapFunction
 from ...utils import logging, find_labels, can_return_loss
-from ...utils.constants import _actual_distributed_type
-from ...utils.dataclasses import DistributedType
+from ...accelerate.utils import DistributedType
+from ...accelerate.utils import accelerate_distributed_type
 from ...utils.import_utils import is_safetensors_available
 from ...transformers.modeling_utils import PreTrainedModel
 from ...transformers.configuration_utils import PretrainedConfig
@@ -286,7 +286,7 @@ def __init__(
         # Internal variables to help with automatic batch size reduction
         self._train_batch_size = args.train_batch_size
         self._created_lr_scheduler = False
-        self.actual_distributed_type = _actual_distributed_type
+        self.actual_distributed_type = accelerate_distributed_type
 
     def _activate_neftune(self, model):
         r"""
@@ -1376,6 +1376,14 @@ def training_step(self, model: nn.Module, inputs: Dict[str, Union[mindspore.Tens
         inputs = self._prepare_inputs(inputs)
 
         def forward(inputs):
+            if accelerate_distributed_type == DistributedType.MULTI_NPU_DP:
+                from mindspore.communication import get_group_size
+                import mindspore.ops as msops
+                rank_size = get_group_size()
+                for parameter in model.parameters():
+                    all_reduce_sum = msops.AllReduce(msops.ReduceOp.SUM)
+                    new_grads_mean = all_reduce_sum(parameter.grad) / rank_size
+                    parameter.grad = new_grads_mean
             return self.compute_loss(model, inputs)
 
         if getattr(self, 'grad_fn', None) is None or self.model_reload:
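The added branch uses msops.AllReduce to sum a tensor over every data-parallel worker and then divides by the worker count to obtain the mean. A self-contained sketch of that same sum-then-average pattern applied to the gradient tuple returned by mindspore.value_and_grad, which is the more common functional form in MindSpore; net, loss_fn and optimizer are placeholders, not objects from this commit:

import mindspore
import mindspore.ops as ops
from mindspore.communication import init, get_group_size

init("hccl")                                   # assumes an NPU/HCCL job
rank_size = get_group_size()
all_reduce_sum = ops.AllReduce(ops.ReduceOp.SUM)

def forward_fn(data, label):
    logits = net(data)                         # placeholder network
    return loss_fn(logits, label)              # placeholder loss

grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters)

def train_step(data, label):
    loss, grads = grad_fn(data, label)
    # sum each gradient across workers, then divide by the worker count
    grads = tuple(all_reduce_sum(g) / rank_size for g in grads)
    optimizer(grads)                           # placeholder mindspore.nn optimizer
    return loss
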
mindnlp/engine/trainer/default_func.py
Lines changed: 2 additions & 7 deletions

@@ -19,8 +19,8 @@
 from mindspore.amp import all_finite
 
 from mindnlp.utils import ModelOutput
-from ...utils.constants import _actual_distributed_type
-from ...utils.dataclasses import DistributedType
+from ...accelerate.utils import DistributedType
+from ...accelerate.utils import accelerate_distributed_type
 
 def get_default_forward_fn_with_loss_fn(network, loss_fn, loss_scaler):
     """get default forward function with loss function"""
@@ -66,9 +66,6 @@ def get_default_train_step_fn(forward_fn, optimizer, loss_scaler, check_gradient
     def default_run_step(labels, *args, **kwargs):
         """Core process of each step, including the forward propagation process and back propagation of data."""
         loss, grads = grad_fn(labels, *args, **kwargs)
-        if _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
-            grads = nn.DistributedGradReducer(optimizer.parameters)
-            grad_reducer = nn.DistributedGradReducer(optimizer.parameters)
         loss = loss_scaler.unscale(loss)
         if check_gradients:
             is_finite = all_finite(grads)
@@ -83,8 +80,6 @@ def default_run_step_for_obj_net(*args, **kwargs):
     def default_run_step_for_obj_net(*args, **kwargs):
         """Core process of each step, including the forward propagation process and back propagation of data."""
         loss, grads = grad_fn(*args, **kwargs)
-        if _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
-            grads = nn.DistributedGradReducer(optimizer.parameters)
        loss = loss_scaler.unscale(loss)
         if check_gradients:
             is_finite = all_finite(grads)
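The removed lines constructed nn.DistributedGradReducer but bound the reducer object itself to grads instead of calling it on the gradient tuple, which is why the data-parallel reduction now happens in the Trainer instead. For comparison, a hedged sketch of the conventional reducer pattern, reusing the optimizer and grad_fn already in scope inside get_default_train_step_fn; this is an illustration, not code from the repository:

from mindspore import nn

# build the reducer once, outside the per-step function
grad_reducer = nn.DistributedGradReducer(optimizer.parameters)

def run_step_with_reducer(labels, *args, **kwargs):
    loss, grads = grad_fn(labels, *args, **kwargs)
    grads = grad_reducer(grads)   # all-reduce the gradient tuple across workers
    optimizer(grads)
    return loss
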
mindnlp/utils/__init__.py
Lines changed: 0 additions & 2 deletions

@@ -21,8 +21,6 @@
 from .download import *
 from .compatibility import *
 from .chat_template_utils import *
-from .dataclasses import DistributedType
-from .constants import _actual_distributed_type
 from .import_utils import requires_backends, is_mindspore_available, OptionalDependencyNotAvailable, is_sentencepiece_available, \
     is_tokenizers_available, direct_transformers_import, is_protobuf_available, is_safetensors_available, \
     is_cython_available, is_pretty_midi_available, is_essentia_available, is_librosa_available, is_scipy_available, is_pyctcdecode_available, \