From 4c6b91db67c14937f5f98db3855bafb2e59f7249 Mon Sep 17 00:00:00 2001
From: Yutong Sun
Date: Wed, 24 Sep 2025 22:06:54 +0000
Subject: [PATCH] fix: use default maxBytes and ncclBuffSize for p4 running all-to-all

Signed-off-by: Yutong Sun
---
 test/cases/nvidia/mpi_test.go | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/test/cases/nvidia/mpi_test.go b/test/cases/nvidia/mpi_test.go
index 42997314f..14b90cf35 100644
--- a/test/cases/nvidia/mpi_test.go
+++ b/test/cases/nvidia/mpi_test.go
@@ -70,8 +70,13 @@ func multiNode(testName string) features.Feature {
 			ncclBuffSize := "4194304"
 			if slices.Contains(instanceSupportsRdmaRead, *nodeType) {
 				t.Log("Instance supports RDMA")
-				maxBytes = "16G"
-				ncclBuffSize = "8388608"
+				// TODO: revisit this with some kind of per-instance optimizer, or maybe use the defaults for all instance types unless specified
+				if testName == "alltoall_perf" && strings.Contains(*nodeType, "p4") {
+					// Keep default values for P4 running all-to-all
+				} else {
+					maxBytes = "16G"
+					ncclBuffSize = "8388608"
+				}
 			}
 			var err error
 			renderedMpiJobNcclTestMultiNodeManifest, err = fwext.RenderManifests(mpiJobNcclTestMultiNodeManifest, ncclTestManifestTplVars{