From 10deb2347c3df4d057a75bbf8c13ec22a663b30d Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich
Date: Tue, 19 Aug 2025 00:47:19 +0100
Subject: [PATCH] fix(trainer): Preserve the original runtime command in
 get_runtime_packages() API

Signed-off-by: Andrey Velichkevich
---
 kubeflow/trainer/api/trainer_client.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py
index 9b705d64..2934867c 100644
--- a/kubeflow/trainer/api/trainer_client.py
+++ b/kubeflow/trainer/api/trainer_client.py
@@ -12,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import logging
 import multiprocessing
 import queue
@@ -177,11 +178,14 @@ def get_runtime_packages(self, runtime: types.Runtime):
         if runtime.trainer.trainer_type == types.TrainerType.BUILTIN_TRAINER:
             raise ValueError("Cannot get Runtime packages for BuiltinTrainer")
 
+        # Create a deepcopy of the runtime to avoid modifying the original command.
+        runtime_copy = copy.deepcopy(runtime)
+
         # Run mpirun only within the single process.
-        if runtime.trainer.command[0] == "mpirun":
+        if runtime_copy.trainer.command[0] == "mpirun":
             mpi_command = list(constants.MPI_COMMAND)
             mpi_command[1:3] = ["-np", "1"]
-            runtime.trainer.set_command(tuple(mpi_command))
+            runtime_copy.trainer.set_command(tuple(mpi_command))
 
         def print_packages():
             import subprocess
@@ -211,12 +215,12 @@ def print_packages():
         # Create the TrainJob and wait until it completes.
         # If Runtime trainer has GPU resources use them, otherwise run TrainJob with 1 CPU.
         job_name = self.train(
-            runtime=runtime,
+            runtime=runtime_copy,
             trainer=types.CustomTrainer(
                 func=print_packages,
                 num_nodes=1,
                 resources_per_node=(
-                    {"cpu": 1} if runtime.trainer.device != "gpu" else None
+                    {"cpu": 1} if runtime_copy.trainer.device != "gpu" else None
                 ),
            ),
        )
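
For reviewers, a minimal usage sketch of the behavior this patch preserves. It is not part of the change; `TrainerClient` and `list_runtimes()` are assumed SDK entrypoints (only `get_runtime_packages()` and `train()` appear in the diff above), and it assumes at least one installed runtime whose trainer command starts with `mpirun`.

```python
# Usage sketch only (not part of the patch). TrainerClient and list_runtimes()
# are assumed SDK entrypoints; only get_runtime_packages() and train() are
# visible in the diff above.
from kubeflow.trainer import TrainerClient

client = TrainerClient()

# Pick a runtime whose trainer command starts with "mpirun".
runtime = next(
    r for r in client.list_runtimes() if r.trainer.command[0] == "mpirun"
)
original_command = tuple(runtime.trainer.command)

# Before this fix, the call below rewrote runtime.trainer.command to
# "mpirun -np 1 ...", so a later multi-node TrainJob created from the same
# runtime object would silently run with a single MPI process.
client.get_runtime_packages(runtime)

# With the deepcopy in place, the caller's runtime object is left untouched.
assert tuple(runtime.trainer.command) == original_command
```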