
Commit 791e2df

feat: add _actual_distributed_type constant to decide parallel DistributedType, add data parallel of native mindspore to mindnlp.Trainer.base

1 parent ef64a3b, commit 791e2df

File tree

12 files changed, +133 -31 lines

mindnlp/accelerate/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
     # DDPCommunicationHookType,
     # DeepSpeedPlugin,
     # DistributedDataParallelKwargs,
-    # DistributedType,
     # FullyShardedDataParallelPlugin,
     # GradScalerKwargs,
     # InitProcessGroupKwargs,

mindnlp/accelerate/accelerator.py

Lines changed: 15 additions & 7 deletions

@@ -1,20 +1,20 @@
 """accelerate"""
 import os
+import mindspore
+import numpy
+
 from contextlib import contextmanager
 from typing import Optional
-
-import mindspore
 from mindspore import nn
 from mindspore.communication import init

 from .state import AcceleratorState
 from .utils import (
-    DistributedType,
     MindFormersPlugin,
     is_mindformers_available,
     wait_for_everyone
 )
-from ..utils import logging
+from ..utils import _actual_distributed_type, logging, DistributedType

 if is_mindformers_available():
     from .utils import (

@@ -45,7 +45,7 @@ def __init__(
         # init mindformers_plugin from env variables
         if mindformers_plugin is None:
             mindformers_plugin = (
-                MindFormersPlugin() if os.environ.get("ACCELERATE_USE_MINDFORMERS", "false") == "true" else None
+                MindFormersPlugin() if _actual_distributed_type == DistributedType.MINDFORMERS else None
             )
         else:
             os.environ["ACCELERATE_USE_MINDFORMERS"] = "true"

@@ -104,12 +104,20 @@ def prepare(self, *args):
         """
         result = []

-        # Only support mindsormers now
+        # Only support mindsormers and MULTI_NPU_DATA_PARALLEL now
         if self.distributed_type == DistributedType.MINDFORMERS:
            result = self._prepare_mindformers(*args)
-
+        elif self.distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
+            result = self._prepare_data_parallel_native_minspore(*args)
        return result

+    def _prepare_data_parallel_native_minspore(self, *args):
+        # initialize data parallel for native mindspore
+        mindspore.set_context(mode=mindspore.GRAPH_MODE)
+        mindspore.set_auto_parallel_context(parallel_mode=mindspore.ParallelMode.DATA_PARALLEL, gradients_mean=True)
+        mindspore.communication.init()
+        mindspore.set_seed(numpy.random.seed())
+
     def _prepare_mindformers(self, *args):
        mindformers_plugin = self.state.mindformers_plugin

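For reference, the MULTI_NPU_DATA_PARALLEL branch of prepare() amounts to the standard native-MindSpore data-parallel setup. Below is a minimal standalone sketch of that setup; it is my own illustration rather than the commit's code, and it assumes the script is started with a distributed launcher (e.g. msrun) so that HCCL communication can be initialized.

# Hedged sketch: what the new MULTI_NPU_DATA_PARALLEL path boils down to when
# run standalone. Assumes a distributed launcher has started the process group.
import mindspore
from mindspore.communication import init, get_rank, get_group_size

mindspore.set_context(mode=mindspore.GRAPH_MODE)
mindspore.set_auto_parallel_context(
    parallel_mode=mindspore.ParallelMode.DATA_PARALLEL,
    gradients_mean=True,   # average gradients across devices
)
init()                     # initialize HCCL communication on Ascend
mindspore.set_seed(42)     # any fixed or per-run seed

print(f"rank {get_rank()} of {get_group_size()} ready for data parallel")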
mindnlp/accelerate/state.py

Lines changed: 15 additions & 6 deletions

@@ -4,14 +4,16 @@
 from contextlib import contextmanager
 from typing import Callable, Any
 from mindspore import communication
+
 try:
     from mindspore.communication.comm_func import barrier
 except:
     barrier = None

 from .utils import (
-    DistributedType, is_mindformers_available
+    is_mindformers_available
 )
+from ..utils import _actual_distributed_type, DistributedType

 SharedDict = dict

@@ -341,11 +343,14 @@ def print(self, *args, **kwargs):
            print(*args, **kwargs)

    def _prepare_backend(self):
-        # now mindformers only
-        if is_mindformers_available():
+        # now mindformers and mindspore data parallel only
+        if _actual_distributed_type == DistributedType.MINDFORMERS and is_mindformers_available():
            self.backend = "hccl"
            self.distributed_type = DistributedType.MINDFORMERS
-
+        elif _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
+            self.backend = "hccl"
+            self.distributed_type = DistributedType.MULTI_NPU_DATA_PARALLEL
+
    @num_processes.setter
    def num_processes(self, value):
        self._num_processes = value

@@ -366,10 +371,14 @@ def __init__(self, mindformers_plugin=None, **kwargs):
        if PartialState._shared_state:
            PartialState(**kwargs)
            self.__dict__.update(PartialState._shared_state)
-
-        if os.environ.get("ACCELERATE_USE_MINDFORMERS", "false") == "true":
+        # set distributed_type
+        if _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
+            self.distributed_type = DistributedType.MULTI_NPU_DATA_PARALLEL
+        elif _actual_distributed_type == DistributedType.MINDFORMERS:
            self.distributed_type = DistributedType.MINDFORMERS
            self.mindformers_plugin = mindformers_plugin
+        else:
+            self.distributed_type = DistributedType.NO

        PartialState._shared_state["distributed_type"] = self.distributed_type

mindnlp/accelerate/utils/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -1,6 +1,5 @@
 """accelerate utils"""
 from .dataclasses import (
-    DistributedType,
     MindFormersPlugin
 )
 from .environment import (

mindnlp/accelerate/utils/dataclasses.py

Lines changed: 0 additions & 12 deletions

@@ -11,18 +11,6 @@
 )


-class DistributedType(str, enum.Enum):
-    """
-    Represents a type of distributed environment.
-
-    Values:
-        - **MINDFORMERS** -- Using mindformers
-    """
-
-    MINDFORMERS = "MINDFORMERS"
-    NO = "NO"
-
-
 @dataclass
 class MindFormersPlugin:
     """

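The relocated DistributedType enum itself (now under mindnlp/utils/dataclasses.py) is not shown in this section of the diff. Judging from the removed definition above and the members referenced elsewhere in the commit, it presumably looks roughly like the following sketch; treat it as a reconstruction, not the committed file.

# Presumed shape of the relocated enum in mindnlp/utils/dataclasses.py,
# reconstructed from the removed definition plus the new member used elsewhere.
import enum

class DistributedType(str, enum.Enum):
    """Represents a type of distributed environment."""

    MINDFORMERS = "MINDFORMERS"
    MULTI_NPU_DATA_PARALLEL = "MULTI_NPU_DATA_PARALLEL"
    NO = "NO"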
mindnlp/dataset/load.py

Lines changed: 12 additions & 2 deletions

@@ -23,6 +23,8 @@
 from datasets import Dataset, IterableDataset, Split, Features, \
     DownloadConfig, DownloadMode, VerificationMode, Version
 from mindnlp.configs import DEFAULT_ROOT
+from ..utils.constants import _actual_distributed_type
+from ..utils.dataclasses import DistributedType

 class TransferIterableDataset():
     """TransferDataset for Huggingface Dataset."""

@@ -331,8 +333,16 @@ def load_dataset(
        column_names = list(raw_ds.features.keys())
        source = TransferDataset(raw_ds, column_names) if isinstance(raw_ds, Dataset) \
            else TransferIterableDataset(raw_ds, column_names)
-        ms_ds = GeneratorDataset(
-            source=source,
+        ms_ds = ms_ds
+        if _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
+            from mindspore.communication import get_rank, get_group_size
+            ms_ds = GeneratorDataset(source=source,
+                                     column_names=column_names,
+                                     shuffle=shuffle,
+                                     num_parallel_workers=num_proc if num_proc else 1,
+                                     num_shards=get_group_size(), shard_id=get_rank())
+        else:
+            ms_ds = GeneratorDataset(source=source,
                                     column_names=column_names,
                                     shuffle=shuffle,
                                     num_parallel_workers=num_proc if num_proc else 1)

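The num_shards=get_group_size() and shard_id=get_rank() arguments are what make the loaded dataset data-parallel: each rank reads only its own 1/world-size slice of the samples. The snippet below is a self-contained illustration of the same GeneratorDataset sharding pattern, with toy data and fixed stand-in values so it runs on a single device; the column name and values are mine, not from the commit.

# Hedged illustration of the num_shards/shard_id pattern used above.
# Outside a real multi-NPU launch, get_group_size()/get_rank() are replaced
# by fixed values so the example stays runnable on one device.
import numpy as np
from mindspore.dataset import GeneratorDataset

def source():
    for i in range(8):
        yield (np.array(i, dtype=np.int32),)

group_size, rank = 2, 0            # stand-ins for get_group_size(), get_rank()
ds = GeneratorDataset(source, column_names=["value"],
                      shuffle=False,
                      num_shards=group_size, shard_id=rank)

print([int(row["value"]) for row in ds.create_dict_iterator(output_numpy=True)])
# rank 0 sees one half of the samples; rank 1 would see the other half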
mindnlp/engine/trainer/base.py

Lines changed: 4 additions & 1 deletion

@@ -45,6 +45,8 @@
     WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
 from ...dataset import BaseMapFunction
 from ...utils import logging, find_labels, can_return_loss
+from ...utils.constants import _actual_distributed_type
+from ...utils.dataclasses import DistributedType
 from ...utils.import_utils import is_safetensors_available
 from ...transformers.modeling_utils import PreTrainedModel
 from ...transformers.configuration_utils import PretrainedConfig

@@ -88,6 +90,7 @@
     TrainerControl,
     TrainerState,
 )
+from ..utils import _get_learning_rate


 logger = logging.get_logger(__name__)

@@ -124,7 +127,6 @@ class Trainer:
    """
    Trainer is a simple but feature-complete training and eval loop for MindSpore, optimized for 🤗 Transformers.
    """
-    from ..utils import _get_learning_rate
    def __init__(
        self,
        model: Union[PreTrainedModel, nn.Module] = None,

@@ -284,6 +286,7 @@ def __init__(
        # Internal variables to help with automatic batch size reduction
        self._train_batch_size = args.train_batch_size
        self._created_lr_scheduler = False
+        self.actual_distributed_type = _actual_distributed_type

    def _activate_neftune(self, model):
        r"""

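With the new attribute, code that already holds a Trainer can branch on the decided parallel mode instead of re-reading environment variables. A small hedged helper as an illustration; the helper name is mine, not part of the commit.

# Sketch (not from the commit): branch on the parallel mode recorded on the
# Trainer. `trainer` is any already-constructed mindnlp Trainer instance.
from mindnlp.utils import DistributedType

def is_native_data_parallel(trainer) -> bool:
    """True when the Trainer was set up for native MindSpore data parallelism."""
    return trainer.actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL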
mindnlp/engine/trainer/default_func.py

Lines changed: 8 additions & 1 deletion

@@ -15,10 +15,12 @@
 """
 utils for trainer.
 """
-from mindspore import ops, value_and_grad
+from mindspore import nn, ops, value_and_grad
 from mindspore.amp import all_finite

 from mindnlp.utils import ModelOutput
+from ...utils.constants import _actual_distributed_type
+from ...utils.dataclasses import DistributedType

 def get_default_forward_fn_with_loss_fn(network, loss_fn, loss_scaler):
     """get default forward function with loss function"""

@@ -64,6 +66,9 @@ def get_default_train_step_fn(forward_fn, optimizer, loss_scaler, check_gradient
    def default_run_step(labels, *args, **kwargs):
        """Core process of each step, including the forward propagation process and back propagation of data."""
        loss, grads = grad_fn(labels, *args, **kwargs)
+        if _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
+            grads = nn.DistributedGradReducer(optimizer.parameters)
+            grad_reducer = nn.DistributedGradReducer(optimizer.parameters)
        loss = loss_scaler.unscale(loss)
        if check_gradients:
            is_finite = all_finite(grads)

@@ -78,6 +83,8 @@
    def default_run_step_for_obj_net(*args, **kwargs):
        """Core process of each step, including the forward propagation process and back propagation of data."""
        loss, grads = grad_fn(*args, **kwargs)
+        if _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL:
+            grads = nn.DistributedGradReducer(optimizer.parameters)
        loss = loss_scaler.unscale(loss)
        if check_gradients:
            is_finite = all_finite(grads)

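For context, the usual MindSpore data-parallel pattern is to construct nn.DistributedGradReducer once from the optimizer's parameters and then apply it to each step's gradients, so they are averaged across devices before the optimizer update. Below is a minimal sketch of that pattern; it is my own illustration (not the commit's code) and assumes the DATA_PARALLEL context and communication have already been initialized.

# Hedged sketch of the standard DistributedGradReducer usage in MindSpore
# data parallelism: build the reducer once, then apply it to each step's grads.
from mindspore import nn, value_and_grad

def make_data_parallel_step(forward_fn, optimizer):
    grad_fn = value_and_grad(forward_fn, None, optimizer.parameters, has_aux=False)
    grad_reducer = nn.DistributedGradReducer(optimizer.parameters)  # built once

    def run_step(*args, **kwargs):
        loss, grads = grad_fn(*args, **kwargs)
        grads = grad_reducer(grads)   # all-reduce / average gradients across devices
        optimizer(grads)              # update with the reduced gradients
        return loss

    return run_step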
mindnlp/utils/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -21,6 +21,8 @@
 from .download import *
 from .compatibility import *
 from .chat_template_utils import *
+from .dataclasses import DistributedType
+from .constants import _actual_distributed_type
 from .import_utils import requires_backends, is_mindspore_available, OptionalDependencyNotAvailable, is_sentencepiece_available, \
     is_tokenizers_available, direct_transformers_import, is_protobuf_available, is_safetensors_available, \
     is_cython_available, is_pretty_midi_available, is_essentia_available, is_librosa_available, is_scipy_available, is_pyctcdecode_available, \

mindnlp/utils/constants.py

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+"""global constants for mindnlp"""
+import os
+import psutil
+
+# from .devices import _is_Ascend_npu_avaliable, _avaliable_Ascend_npus_count  # TODO: if use acl
+from .dataclasses import DistributedType
+
+
+
+def detect_actual_distributed_type():
+    """
+    The actual_distributed_type is not necessarily the distributed_type the user asked for in the startup command, for example:
+    1. NPU is available and 'msrun' is specified ==> NPU
+    2. msrun requests parallel NPU, but no NPU is available ==> CPU execution (reasonable)
+    3. NPU is available, but the user starts with `python x.py` without specifying the device/port information needed to initialize communication, so the actual_distributed_type is CPU
+    etc.
+
+    Returns:
+        _type_: Based on factors such as the parallel software and hardware environment available on the current system
+        and the user-specified parallel scheme, the optimal parallel strategy decided for each situation.
+    """
+    if os.environ.get("MULTI_NPU_DATA_PARALLEL", None) == "true":
+        # TODO: for now the MULTI_NPU_DATA_PARALLEL environment variable is used as the switch; discuss whether it should replace DistributedType.MINDFORMERS as the fallback strategy
+        return DistributedType.MULTI_NPU_DATA_PARALLEL
+    if os.environ.get("ACCELERATE_USE_MINDFORMERS", "false") == "true":
+        # TODO: the original logic defaulted to DistributedType.MINDFORMERS when the environment variable was not set; decide whether this branch should be removed
+        return DistributedType.MINDFORMERS
+    else:
+        return DistributedType.NO
+
+_actual_distributed_type = detect_actual_distributed_type()
+

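Usage note: _actual_distributed_type is computed once at module import time, so the MULTI_NPU_DATA_PARALLEL switch has to be set before mindnlp is imported. A short hedged sketch of the switch in use:

# Hedged usage sketch: select native MindSpore data parallelism via the new
# environment-variable switch. It must be exported before mindnlp is imported,
# because detect_actual_distributed_type() runs once at import time.
import os
os.environ["MULTI_NPU_DATA_PARALLEL"] = "true"

from mindnlp.utils import _actual_distributed_type, DistributedType

assert _actual_distributed_type == DistributedType.MULTI_NPU_DATA_PARALLEL
# Left unset (and with ACCELERATE_USE_MINDFORMERS also unset), the detector
# falls back to DistributedType.NO.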