180 changes: 168 additions & 12 deletions .github/workflows/e2e_test.yaml
@@ -70,38 +70,194 @@ jobs:
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_llama2:
training_8GPU_4DP2TP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_8GPU_4DP2TP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command"

training_8GPU_4DP2TPSP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_8GPU_4DP2TPSP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command"

training_8GPU_4DP2PP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_8GPU_4DP2PP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command"

training_8GPU_4DP2PP_ZB:
runs-on: [t_cluster]
timeout-minutes: 20
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_llama2
- name: training_8GPU_4DP2PP_ZB
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_ZB" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_internlm2:
runs-on: [t_cluster]
training_16GPU_4DP2TP2PP_MTP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_16GPU_4DP2TP2PP_MTP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"

training_16GPU_4DP2TP2PP_MSP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_16GPU_4DP2TP2PP_MSP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"

training_16GPU_4DP2TP2PP_FSP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_16GPU_4DP2TP2PP_FSP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"

training_llama2:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 20
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_llama2_910B
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command"

- name: training_internlm2
training_internlm2:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 20
steps:
- name: mask env
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_internlm2_910B
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command"
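The torchrun commands above select individual parallelism configurations via custom pytest markers (training_8GPU_4DP2TP, training_16GPU_4DP2TP2PP_MTP, and so on) in tests/test_training/test_loss.py. As a rough sketch of that mechanism only: the functions below show how such markers are typically attached; the function names and bodies are hypothetical, and only the marker names and file path come from the workflow.

```python
# Illustrative sketch, not the actual contents of tests/test_training/test_loss.py.
# Each parallelism configuration is a test function tagged with a custom pytest marker,
# so a CI job can run exactly one configuration via `pytest -m "<marker>"`.
import pytest


@pytest.mark.training_8GPU_4DP2TP
def test_loss_8gpu_4dp2tp():  # hypothetical name
    # Launch an 8-GPU run with dp=4, tp=2 and compare the loss curve against a baseline.
    ...


@pytest.mark.training_16GPU_4DP2TP2PP_MTP
def test_loss_16gpu_4dp2tp2pp_mtp():  # hypothetical name
    # 16-GPU (2-node) run with dp=4, tp=2, pp=2 in MTP mode, launched with --nnodes=2 above.
    ...
```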
60 changes: 60 additions & 0 deletions .github/workflows/weekly_test.yaml
@@ -68,6 +68,26 @@ jobs:
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_8GPU_4DP2TPSP_optimizer_v2:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU_4DP2TPSP_optimizer_v2
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP_optimizer_v2" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_8GPU_4DP2PP:
runs-on: [t_cluster]
timeout-minutes: 10
@@ -88,6 +108,26 @@ jobs:
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_8GPU_4DP2PP_optimizer_v2:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU_4DP2PP_optimizer_v2
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_optimizer_v2" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_8GPU_4DP2PP_InterleavedOverlap:
runs-on: [t_cluster]
timeout-minutes: 10
@@ -148,6 +188,26 @@ jobs:
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_16GPU_4DP2TP2PP_MSP_optimizer_v2:
runs-on: [t_cluster]
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_MSP_optimizer_v2
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP_optimizer_v2" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_16GPU_4DP2TP2PP_FSP:
runs-on: [t_cluster]
timeout-minutes: 10
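The weekly jobs reuse the same marker-selection pattern for the new optimizer_v2 variants. Purely as a hypothetical illustration (this fragment is not taken from the repository), custom markers like these are usually registered in a conftest.py or pytest.ini so that `-m` selection stays free of PytestUnknownMarkWarning:

```python
# Hypothetical conftest.py fragment: register the new weekly markers with pytest.
NEW_WEEKLY_MARKERS = [
    "training_8GPU_4DP2TPSP_optimizer_v2: weekly 8-GPU 4DP/2TP-SP loss test with optimizer v2",
    "training_8GPU_4DP2PP_optimizer_v2: weekly 8-GPU 4DP/2PP loss test with optimizer v2",
    "training_16GPU_4DP2TP2PP_MSP_optimizer_v2: weekly 16-GPU 4DP/2TP/2PP MSP loss test with optimizer v2",
]


def pytest_configure(config):
    # addinivalue_line("markers", ...) is the standard pytest hook for declaring markers.
    for marker in NEW_WEEKLY_MARKERS:
        config.addinivalue_line("markers", marker)
```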
34 changes: 32 additions & 2 deletions internlm/accelerator/abstract_accelerator.py
@@ -10,7 +10,8 @@ class AcceleratorType(enum.Enum):
NPU = 2
CPU = 3
DIPU = 4
OTHER = 5
DITORCH = 5
OTHER = 6


internlm_accelerator = None
@@ -81,7 +82,15 @@ def get_accelerator():

accelerator_name = None
# 1. Detect whether there is override of DeepSpeed accelerators from environment variable.
intern_accelerator_LIST = ["cuda", "npu", "dipu"]
# 2. ditorch: a unified accelerator tool for the torch_npu, torch_dipu, and other backends.
# deeplink_ext: implements FlashSelfAttention, FlashCrossAttention, RmsNorm, RotaryEmbedding,
# and other operations on top of the torch_dipu and torch_npu backends, respectively.
# ditorch, together with deeplink_ext, provides unified APIs for internlm training.
# usage:
# for torch_dipu backend: export INTERNLM_ACCELERATOR=ditorch; export DEEPLINK_EXT_PLATFORM_TYPE=torch_dipu
# for torch_npu backend: export INTERNLM_ACCELERATOR=ditorch; export DEEPLINK_EXT_PLATFORM_TYPE=torch_npu

intern_accelerator_LIST = ["cuda", "npu", "dipu", "ditorch"]
if "INTERNLM_ACCELERATOR" in os.environ:
accelerator_name = os.environ["INTERNLM_ACCELERATOR"]
if accelerator_name == "npu":
@@ -99,13 +108,30 @@ def get_accelerator():
"DIPU_Accelerator requires torch_dipu and deeplink_ext, which is not installed on this system."
)
pass
elif accelerator_name == "ditorch":
try:
import deeplink_ext # pylint: disable=unused-import
import ditorch # pylint: disable=unused-import
except (ImportError, ModuleNotFoundError):
raise ValueError(
"DIPU_Accelerator requires ditorch and deeplink_ext, which is not installed on this system."
)
pass
elif accelerator_name != "cuda":
raise ValueError(
f"accelerator_name must be one of {intern_accelerator_LIST}."
+ " Value '{accelerator_name}' is not supported"
)

# 2. If no override, detect which accelerator to use automatically
if accelerator_name is None:
try:
import deeplink_ext # noqa: F401,F811 # type: ignore
import ditorch # noqa: F401,F811 # type: ignore

accelerator_name = "ditorch"
except (ImportError, ModuleNotFoundError):
pass
if accelerator_name is None:
try:
import deeplink_ext # noqa: F401,F811 # type: ignore
@@ -137,5 +163,9 @@ def get_accelerator():
from .dipu_accelerator import DIPU_Accelerator

internlm_accelerator = DIPU_Accelerator()
elif accelerator_name == "ditorch":
from .ditorch_accelerator import DITORCH_Accelerator

internlm_accelerator = DITORCH_Accelerator()

return internlm_accelerator
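A minimal usage sketch for the new backend-selection path, assuming a machine where ditorch and deeplink_ext are installed; the import path internlm.accelerator.abstract_accelerator follows the file path in this diff, and the env-var override, class name, and enum member shown are taken from the code above.

```python
# Minimal sketch: select the new ditorch backend explicitly, as described in the
# comment block added to get_accelerator(). Assumes ditorch and deeplink_ext are installed.
import os

# Explicit override; DEEPLINK_EXT_PLATFORM_TYPE picks the deeplink_ext flavour.
os.environ["INTERNLM_ACCELERATOR"] = "ditorch"
os.environ["DEEPLINK_EXT_PLATFORM_TYPE"] = "torch_npu"  # or "torch_dipu"

# Import deferred so the override is visible when the accelerator is resolved.
from internlm.accelerator.abstract_accelerator import AcceleratorType, get_accelerator

accelerator = get_accelerator()
print(type(accelerator).__name__)   # expected: DITORCH_Accelerator
print(AcceleratorType.DITORCH)      # new enum member added in this change (value 5)

# Without INTERNLM_ACCELERATOR set, the updated auto-detection tries ditorch first
# (import ditorch + deeplink_ext) before the pre-existing dipu check.
```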