Skip to content

Commit b99e3ed

Browse files
committed
Extend NCCL test to p5.48xlarge
Make it possible to run the NCCL test on p5.48xlarge. This case has not yet been added to the integration tests config. Signed-off-by: Luca Carrogu <carrogu@amazon.com>
1 parent 75efd09 commit b99e3ed

File tree

2 files changed

+19
-8
lines changed

2 files changed

+19
-8
lines changed

tests/integration-tests/tests/efa/test_efa.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ def test_efa(
5858

5959
_test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor, partition="efa-enabled")
6060

61-
if instance == "p4d.24xlarge" and os != "centos7":
62-
_test_nccl_benchmarks(remote_command_executor, test_datadir, "openmpi", scheduler_commands)
61+
if instance in ["p4d.24xlarge", "p5.48xlarge"] and os != "centos7":
62+
_test_nccl_benchmarks(remote_command_executor, test_datadir, "openmpi", scheduler_commands, instance)
6363

6464
assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
6565

@@ -102,7 +102,7 @@ def _test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor, p
102102
assert_that(result.stdout).does_not_contain("SHM transfer will be disabled because of ptrace protection")
103103

104104

105-
def _test_nccl_benchmarks(remote_command_executor, test_datadir, mpi_module, scheduler_commands):
105+
def _test_nccl_benchmarks(remote_command_executor, test_datadir, mpi_module, scheduler_commands, instance):
106106
logging.info("Running NCCL benchmarks")
107107
remote_command_executor.run_remote_script(
108108
str(test_datadir / "nccl_benchmarks" / "init_nccl_benchmarks.sh"), args=[mpi_module], hide=True, timeout=600
@@ -139,5 +139,15 @@ def _test_nccl_benchmarks(remote_command_executor, test_datadir, mpi_module, sch
139139
"cat /shared/nccl_tests.out | grep -E '1073741824\\s+268435456' | awk '{print $12}'"
140140
).stdout
141141

142-
# Expected "in-place busbw" bandwidth with 2 nodes, 8 tasks per node is about 27GB/s
143-
assert_that(float(max_bandwidth)).is_greater_than(26.0)
142+
instance_bandwidth_dict = {
143+
# p4d.24xlarge - Expected "in-place busbw" bandwidth with 2 nodes, 8 tasks per node is about 27GB/s
144+
"p4d.24xlarge": 26.0,
145+
# p5.48xlarge - Expected "in-place busbw" bandwidth with 2 nodes, 8 tasks per node is about 250GB/s
146+
"p5.48xlarge": 250.0,
147+
}
148+
149+
expected_bandwidth = instance_bandwidth_dict.get(instance)
150+
if expected_bandwidth is None:
151+
pytest.fail(f"Instance {instance} is not valid for multiple bandwidth tests")
152+
153+
assert_that(float(max_bandwidth)).is_greater_than(expected_bandwidth)

tests/integration-tests/tests/efa/test_efa/test_efa/nccl_benchmarks/init_nccl_benchmarks.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env bash
2-
set -e
2+
exec &>/shared/init_nccl.out
3+
set -xe
34

45
rm -rf /shared/${1}
56

@@ -8,7 +9,7 @@ NCCL_BENCHMARKS_VERSION='2.13.8'
89
NCCL_VERSION='2.19.4-1'
910
OFI_NCCL_VERSION='1.7.4-aws'
1011
MPI_HOME=$(which mpirun | awk -F '/bin' '{print $1}')
11-
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80" # Arch for NVIDIA A100
12+
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90" # Arch for NVIDIA A100 (Ampere, sm_80) and H100 (Hopper, sm_90), ref https://docs.nvidia.com/cuda/hopper-compatibility-guide/index.html
1213

1314
mkdir -p /shared/${1}
1415

@@ -24,7 +25,7 @@ cd /shared/${1}
2425
wget https://github.com/NVIDIA/nccl-tests/archive/v${NCCL_BENCHMARKS_VERSION}.tar.gz
2526
tar zxvf "v${NCCL_BENCHMARKS_VERSION}.tar.gz"
2627
cd "nccl-tests-${NCCL_BENCHMARKS_VERSION}/"
27-
NVCC_GENCODE="${NVCC_GENCODE}" make MPI=1 MPI_HOME=${MPI_HOME} NCCL_HOME=/shared/${1}/nccl-${NCCL_VERSION}/build/
28+
NVCC_GENCODE="${NVCC_GENCODE}" make MPI=1 MPI_HOME=${MPI_HOME} NCCL_HOME=/shared/${1}/nccl-${NCCL_VERSION}/build/ CUDA_HOME=/usr/local/cuda
2829

2930
wget https://github.com/aws/aws-ofi-nccl/archive/v${OFI_NCCL_VERSION}.tar.gz
3031
tar xvfz v${OFI_NCCL_VERSION}.tar.gz

0 commit comments

Comments
 (0)