180 changes: 168 additions & 12 deletions .github/workflows/e2e_test.yaml
@@ -70,38 +70,194 @@ jobs:
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_llama2:
training_8GPU_4DP2TP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_8GPU_4DP2TP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command"

training_8GPU_4DP2TPSP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_8GPU_4DP2TPSP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command"

training_8GPU_4DP2PP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_8GPU_4DP2PP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command"

training_8GPU_4DP2PP_ZB:
runs-on: [t_cluster]
timeout-minutes: 20
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3

- name: training_llama2
- name: training_8GPU_4DP2PP_ZB
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_ZB" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_internlm2:
runs-on: [t_cluster]
training_16GPU_4DP2TP2PP_MTP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_16GPU_4DP2TP2PP_MTP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"

training_16GPU_4DP2TP2PP_MSP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_16GPU_4DP2TP2PP_MSP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"

training_16GPU_4DP2TP2PP_FSP:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_16GPU_4DP2TP2PP_FSP_910B
if: ${{ matrix.runner == '910B' }}
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"

training_llama2:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 20
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_llama2_910B
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command"

- name: training_internlm2
training_internlm2:
strategy:
matrix:
runner: [910B]
runs-on: ${{ matrix.runner }}
timeout-minutes: 20
steps:
- name: mask env
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
if [[ ${{ matrix.runner }} == 910B ]];then
sudo git clean -ffdx
fi
- uses: actions/checkout@v3
- name: training_internlm2_910B
run: |
jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
bash ../910B_sco.sh $jobname "$start_command"
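The torchrun commands above select individual parallelism configurations via custom pytest markers (training_8GPU_4DP2TP, training_16GPU_4DP2TP2PP_MTP, and so on) in tests/test_training/test_loss.py. As a rough sketch of that mechanism only: the functions below show how such markers are typically attached; the function names and bodies are hypothetical, and only the marker names and file path come from the workflow.

```python
# Illustrative sketch, not the actual contents of tests/test_training/test_loss.py.
# Each parallelism configuration is a test function tagged with a custom pytest marker,
# so a CI job can run exactly one configuration via `pytest -m "<marker>"`.
import pytest


@pytest.mark.training_8GPU_4DP2TP
def test_loss_8gpu_4dp2tp():  # hypothetical name
    # Launch an 8-GPU run with dp=4, tp=2 and compare the loss curve against a baseline.
    ...


@pytest.mark.training_16GPU_4DP2TP2PP_MTP
def test_loss_16gpu_4dp2tp2pp_mtp():  # hypothetical name
    # 16-GPU (2-node) run with dp=4, tp=2, pp=2 in MTP mode, launched with --nnodes=2 above.
    ...
```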
60 changes: 60 additions & 0 deletions .github/workflows/weekly_test.yaml
@@ -68,6 +68,26 @@ jobs:
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_8GPU_4DP2TPSP_optimizer_v2:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU_4DP2TPSP_optimizer_v2
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP_optimizer_v2" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_8GPU_4DP2PP:
runs-on: [t_cluster]
timeout-minutes: 10
@@ -88,6 +108,26 @@ jobs:
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_8GPU_4DP2PP_optimizer_v2:
runs-on: [t_cluster]
timeout-minutes: 10
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_8GPU_4DP2PP_optimizer_v2
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_optimizer_v2" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_8GPU_4DP2PP_InterleavedOverlap:
runs-on: [t_cluster]
timeout-minutes: 10
@@ -148,6 +188,26 @@ jobs:
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_16GPU_4DP2TP2PP_MSP_optimizer_v2:
runs-on: [t_cluster]
timeout-minutes: 15
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
echo "::add-mask::$path_prefix"
- uses: actions/checkout@v3
with:
ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

- name: training_16GPU_4DP2TP2PP_MSP_optimizer_v2
run: |
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP_optimizer_v2" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

training_16GPU_4DP2TP2PP_FSP:
runs-on: [t_cluster]
timeout-minutes: 10
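The weekly jobs reuse the same marker-selection pattern for the new optimizer_v2 variants. Purely as a hypothetical illustration (this fragment is not taken from the repository), custom markers like these are usually registered in a conftest.py or pytest.ini so that `-m` selection stays free of PytestUnknownMarkWarning:

```python
# Hypothetical conftest.py fragment: register the new weekly markers with pytest.
NEW_WEEKLY_MARKERS = [
    "training_8GPU_4DP2TPSP_optimizer_v2: weekly 8-GPU 4DP/2TP-SP loss test with optimizer v2",
    "training_8GPU_4DP2PP_optimizer_v2: weekly 8-GPU 4DP/2PP loss test with optimizer v2",
    "training_16GPU_4DP2TP2PP_MSP_optimizer_v2: weekly 16-GPU 4DP/2TP/2PP MSP loss test with optimizer v2",
]


def pytest_configure(config):
    # addinivalue_line("markers", ...) is the standard pytest hook for declaring markers.
    for marker in NEW_WEEKLY_MARKERS:
        config.addinivalue_line("markers", marker)
```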
34 changes: 32 additions & 2 deletions internlm/accelerator/abstract_accelerator.py
@@ -10,7 +10,8 @@ class AcceleratorType(enum.Enum):
NPU = 2
CPU = 3
DIPU = 4
OTHER = 5
DITORCH = 5
OTHER = 6


internlm_accelerator = None
@@ -81,7 +82,15 @@ def get_accelerator():

accelerator_name = None
# 1. Detect whether there is override of DeepSpeed accelerators from environment variable.
intern_accelerator_LIST = ["cuda", "npu", "dipu"]
# 2. ditorch: a unified accelerator tool for the torch_npu, torch_dipu, and other backends.
# deeplink_ext: implements FlashSelfAttention, FlashCrossAttention, RmsNorm, RotaryEmbedding,
# and other operations on top of the torch_dipu and torch_npu backends, respectively.
# ditorch, together with deeplink_ext, provides unified APIs for internlm training.
# usage:
# for torch_dipu backend: export INTERNLM_ACCELERATOR=ditorch; export DEEPLINK_EXT_PLATFORM_TYPE=torch_dipu
# for torch_npu backend: export INTERNLM_ACCELERATOR=ditorch; export DEEPLINK_EXT_PLATFORM_TYPE=torch_npu

intern_accelerator_LIST = ["cuda", "npu", "dipu", "ditorch"]
if "INTERNLM_ACCELERATOR" in os.environ:
accelerator_name = os.environ["INTERNLM_ACCELERATOR"]
if accelerator_name == "npu":
@@ -99,13 +108,30 @@ def get_accelerator():
"DIPU_Accelerator requires torch_dipu and deeplink_ext, which is not installed on this system."
)
pass
elif accelerator_name == "ditorch":
try:
import deeplink_ext # pylint: disable=unused-import
import ditorch # pylint: disable=unused-import
except (ImportError, ModuleNotFoundError):
raise ValueError(
"DIPU_Accelerator requires ditorch and deeplink_ext, which is not installed on this system."
)
pass
elif accelerator_name != "cuda":
raise ValueError(
f"accelerator_name must be one of {intern_accelerator_LIST}."
+ " Value '{accelerator_name}' is not supported"
)

# 2. If no override, detect which accelerator to use automatically
if accelerator_name is None:
try:
import deeplink_ext # noqa: F401,F811 # type: ignore
import ditorch # noqa: F401,F811 # type: ignore

accelerator_name = "ditorch"
except (ImportError, ModuleNotFoundError):
pass
if accelerator_name is None:
try:
import deeplink_ext # noqa: F401,F811 # type: ignore
@@ -137,5 +163,9 @@ def get_accelerator():
from .dipu_accelerator import DIPU_Accelerator

internlm_accelerator = DIPU_Accelerator()
elif accelerator_name == "ditorch":
from .ditorch_accelerator import DITORCH_Accelerator

internlm_accelerator = DITORCH_Accelerator()

return internlm_accelerator
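A minimal usage sketch for the new backend-selection path, assuming a machine where ditorch and deeplink_ext are installed; the import path internlm.accelerator.abstract_accelerator follows the file path in this diff, and the env-var override, class name, and enum member shown are taken from the code above.

```python
# Minimal sketch: select the new ditorch backend explicitly, as described in the
# comment block added to get_accelerator(). Assumes ditorch and deeplink_ext are installed.
import os

# Explicit override; DEEPLINK_EXT_PLATFORM_TYPE picks the deeplink_ext flavour.
os.environ["INTERNLM_ACCELERATOR"] = "ditorch"
os.environ["DEEPLINK_EXT_PLATFORM_TYPE"] = "torch_npu"  # or "torch_dipu"

# Import deferred so the override is visible when the accelerator is resolved.
from internlm.accelerator.abstract_accelerator import AcceleratorType, get_accelerator

accelerator = get_accelerator()
print(type(accelerator).__name__)   # expected: DITORCH_Accelerator
print(AcceleratorType.DITORCH)      # new enum member added in this change (value 5)

# Without INTERNLM_ACCELERATOR set, the updated auto-detection tries ditorch first
# (import ditorch + deeplink_ext) before the pre-existing dipu check.
```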