From ff45ddff1bc5e5a88443f5164d13b86708072214 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich
Date: Tue, 5 Aug 2025 18:33:17 +0100
Subject: [PATCH] fix(runtimes): Set numProcPerNode: 1 in DeepSpeed Runtime

Signed-off-by: Andrey Velichkevich
---
 manifests/base/runtimes/deepspeed_distributed.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/manifests/base/runtimes/deepspeed_distributed.yaml b/manifests/base/runtimes/deepspeed_distributed.yaml
index 869e97dbbb..2ebc62b226 100644
--- a/manifests/base/runtimes/deepspeed_distributed.yaml
+++ b/manifests/base/runtimes/deepspeed_distributed.yaml
@@ -8,9 +8,7 @@ spec:
   mlPolicy:
     numNodes: 1
     mpi:
-      # TODO (andreyvelich): Change num proc to 1 and remove container resources after we
-      # allow to override it via TrainJob APIs.
-      numProcPerNode: 4
+      numProcPerNode: 1
       mpiImplementation: OpenMPI
       sshAuthMountPath: /home/mpiuser/.ssh
       runLauncherAsNode: true