Changes from 10 commits
26 changes: 9 additions & 17 deletions python/kubeflow/trainer/types/types.py
@@ -230,19 +230,16 @@ class Initializer:
model: Optional[HuggingFaceModelInitializer] = None


# The dict where key is the container image and value its representation.
# Each Trainer representation defines trainer parameters (e.g. type, framework, entrypoint).
# TODO (andreyvelich): We should allow user to overrides the default image names.
ALL_TRAINERS: Dict[str, Trainer] = {
# Custom Trainers.
"pytorch/pytorch": Trainer(
# Centralized trainer configurations to eliminate duplication
TRAINER_CONFIGS: Dict[Framework, Trainer] = {
Framework.TORCH: Trainer(
trainer_type=TrainerType.CUSTOM_TRAINER,
framework=Framework.TORCH,
Member:

Do we really need to keep the framework argument, given that the TRAINER_CONFIGS dict already has Framework as its key?

Suggested change
framework=Framework.TORCH,

Author @jskswamy (Jul 4, 2025):

Regarding the framework field in the Trainer class, I'd like to share my thoughts on why this field exists and why it serves a legitimate purpose:

The framework Field Has Critical Importance

After investigating the codebase, I discovered that the Trainer class and framework field were pre-existing before this PR. The field was intentionally designed to serve specific purposes:

Critical Importance for API Design

The framework field is essential for maintaining a clean, self-contained API:

  1. Object Identity: A Trainer object must "know" what framework it represents without external context
  2. API Completeness: When users receive a Trainer object, they can immediately determine its framework without reverse-engineering from other fields
  3. Serialization: The field is crucial for JSON serialization/deserialization of trainer objects
  4. Debugging & Logging: Essential for meaningful error messages and debugging information

Self-Contained Data Structure

The framework field makes Trainer objects self-contained and self-documenting:

# Example: A Trainer object "knows" what framework it represents
trainer = TRAINER_CONFIGS[Framework.DEEPSPEED]

# Self-documenting: The object tells us what it is
print(f"Using {trainer.framework} trainer with {trainer.trainer_type}")
# Output: "Using Framework.DEEPSPEED trainer with TrainerType.CUSTOM_TRAINER"

# Without the field, we'd need external context to know what framework this is
# We'd have to track which dictionary key was used to create this trainer

Breaking Changes Would Be Required

Removing the field would require:

  • Modifying any code that relies on the field for framework identification
  • Potentially breaking API consumers who expect this field
  • Adding complex lookup logic to determine framework from other properties

Architectural Integrity

The field maintains the principle of encapsulation: a Trainer object should contain all information about itself, including what framework it represents.

Why Dictionary Instead of Array?

The choice of using TRAINER_CONFIGS: Dict[Framework, Trainer] instead of an array of trainers was a performance and design optimization:

Performance Benefits

# Current efficient approach with dictionary
trainer = TRAINER_CONFIGS[Framework.DEEPSPEED]  # O(1) lookup
framework = trainer.framework  # Direct access

# Alternative inefficient approach with array
def find_trainer_by_framework(framework):
    for trainer in TRAINER_ARRAY:  # O(n) search
        if trainer.framework == framework:
            return trainer

Design Benefits

  1. Fast Lookup: O(1) constant time access instead of O(n) linear search
  2. Type Safety: Dictionary keys ensure we only access valid frameworks
  3. Explicit Mapping: Clear relationship between framework and trainer configuration
  4. Extensibility: Easy to add new frameworks without changing lookup logic

My Take

The framework field serves critical architectural purposes for API design and object encapsulation. The dictionary structure provides performance benefits, but the field itself is essential for maintaining clean, self-contained objects.

Removing the field would break the original design intent, make the API less clean and efficient, and potentially introduce breaking changes. The field was intentionally designed this way for good reasons, and I believe we should keep it to maintain the integrity of the API design.
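The self-containment argument above suggests an invariant worth enforcing: the dict key and the framework field must never drift apart. A minimal sketch with stand-in Framework and Trainer definitions (reduced from the real ones in types.py; the entrypoints here are placeholder strings):

```python
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List

class Framework(Enum):
    TORCH = "torch"
    DEEPSPEED = "deepspeed"

@dataclass
class Trainer:
    framework: Framework
    entrypoint: List[str]

TRAINER_CONFIGS: Dict[Framework, Trainer] = {
    Framework.TORCH: Trainer(Framework.TORCH, ["torchrun"]),
    Framework.DEEPSPEED: Trainer(Framework.DEEPSPEED, ["mpirun"]),
}

# One loop guards against the dict key and the framework field drifting apart.
for key, trainer in TRAINER_CONFIGS.items():
    assert trainer.framework is key, f"key/field mismatch for {key}"
```

This is essentially the invariant that test_centralized_trainer_configs in types_test.py checks for the real configurations.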

Member:

I agree that we should have a dict to represent all Trainers, where the key is the Framework name and the value is the Trainer object.
The question is whether we should also keep the framework argument in the Trainer object. It is mostly used just to show users what framework this Trainer is using.

I am fine to keep it for now.

WDYT @szaher @astefanutti @Electronic-Waste ?

entrypoint=[constants.TORCH_ENTRYPOINT],
),
"ghcr.io/kubeflow/trainer/mlx-runtime": Trainer(
Framework.DEEPSPEED: Trainer(
trainer_type=TrainerType.CUSTOM_TRAINER,
framework=Framework.MLX,
framework=Framework.DEEPSPEED,
entrypoint=[
constants.MPI_ENTRYPOINT,
"--hostfile",
@@ -251,9 +248,9 @@ class Initializer:
"-c",
],
),
"ghcr.io/kubeflow/trainer/deepspeed-runtime": Trainer(
Framework.MLX: Trainer(
trainer_type=TrainerType.CUSTOM_TRAINER,
framework=Framework.DEEPSPEED,
framework=Framework.MLX,
entrypoint=[
constants.MPI_ENTRYPOINT,
"--hostfile",
@@ -262,20 +259,15 @@ class Initializer:
"-c",
],
),
# Builtin Trainers.
"ghcr.io/kubeflow/trainer/torchtune-trainer": Trainer(
Framework.TORCHTUNE: Trainer(
trainer_type=TrainerType.BUILTIN_TRAINER,
framework=Framework.TORCHTUNE,
entrypoint=constants.DEFAULT_TORCHTUNE_COMMAND,
),
}

# The default trainer configuration when runtime detection fails
DEFAULT_TRAINER = Trainer(
trainer_type=TrainerType.CUSTOM_TRAINER,
framework=Framework.TORCH,
entrypoint=[constants.TORCH_ENTRYPOINT],
)
DEFAULT_TRAINER = TRAINER_CONFIGS[Framework.TORCH]

# The default runtime configuration for the train() API
DEFAULT_RUNTIME = Runtime(
18 changes: 18 additions & 0 deletions python/kubeflow/trainer/types/types_test.py
@@ -0,0 +1,18 @@
from kubeflow.trainer.types import types


class TestTrainerConfigurations:
"""Test cases for trainer configurations and types."""

def test_centralized_trainer_configs(self):
"""Test that centralized trainer configurations are properly defined."""
# Verify all trainer frameworks have configurations
for framework in types.Framework:
assert framework in types.TRAINER_CONFIGS
trainer = types.TRAINER_CONFIGS[framework]
assert trainer.framework == framework

def test_default_trainer_uses_centralized_config(self):
"""Test that DEFAULT_TRAINER uses centralized configuration."""
assert types.DEFAULT_TRAINER == types.TRAINER_CONFIGS[types.Framework.TORCH]
assert types.DEFAULT_TRAINER.framework == types.Framework.TORCH
75 changes: 69 additions & 6 deletions python/kubeflow/trainer/utils/utils.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import inspect
import os
import queue
@@ -107,6 +108,65 @@ def get_runtime_trainer_container(
return None


def detect_trainer_from_image_patterns(image_name: str) -> Optional[types.Trainer]:
"""
Detect trainer type based on image name patterns using regex.

This method uses pattern matching on the image name to determine
the likely trainer type.

Args:
image_name: The container image name

Returns:
Trainer object if detected, None otherwise
"""
# DeepSpeed patterns
if re.search(r"deepspeed", image_name, re.IGNORECASE):
return copy.deepcopy(types.TRAINER_CONFIGS[types.Framework.DEEPSPEED])

# MLX patterns
if re.search(r"mlx", image_name, re.IGNORECASE):
return copy.deepcopy(types.TRAINER_CONFIGS[types.Framework.MLX])

# TorchTune patterns (check before PyTorch to avoid conflicts)
if re.search(r"torchtune", image_name, re.IGNORECASE):
return copy.deepcopy(types.TRAINER_CONFIGS[types.Framework.TORCHTUNE])

# PyTorch patterns - require explicit "pytorch" in image name for clarity
if re.search(r"pytorch", image_name, re.IGNORECASE):
return copy.deepcopy(types.TRAINER_CONFIGS[types.Framework.TORCH])

return None


def detect_trainer(
trainer_container: models.IoK8sApiCoreV1Container,
) -> types.Trainer:
"""
Detect trainer type using pattern matching with fallback.

This method implements the detection logic:
1. Use image pattern matching to detect framework
2. Fall back to DEFAULT_TRAINER if no patterns match

Args:
trainer_container: The trainer container object

Returns:
Trainer object
"""
image_name = trainer_container.image.split(":")[0]

# 1. Use image pattern matching
trainer = detect_trainer_from_image_patterns(image_name)
if trainer:
return trainer

# 2. Fall back to DEFAULT_TRAINER
return copy.deepcopy(types.DEFAULT_TRAINER)
Member:

I think this could be simplified if detect_trainer_from_image_patterns just returned copy.deepcopy(types.DEFAULT_TRAINER) instead of None.

Can you just keep all of the required code to extract trainer in the get_trainer_from_image() function, which accepts image_name as input?

That will make our unit tests easier to maintain.

Author:

Made the necessary changes to simplify the detect_trainer_from_image_patterns function.

Member:

@jskswamy Sorry for the late reply, I meant can you just use this code snippet ?

def get_runtime_trainer(....):
....
image_name = trainer_container.image.split(":")[0]
trainer = get_trainer_from_image(image_name)


def get_trainer_from_image(image_name: str) -> types.Trainer:
    """
    Detect trainer type based on image name patterns using regex.
    This method uses pattern matching on the image name to determine
    the likely trainer type.
    Args:
        image_name: The container image name.
    Returns:
        Trainer: Trainer object if detected, otherwise the DEFAULT_TRAINER is returned.
    """
    # DeepSpeed patterns
    if re.search(r"deepspeed", image_name, re.IGNORECASE):
        return copy.deepcopy(types.TRAINER_CONFIGS[types.Framework.DEEPSPEED])

    # MLX patterns
    if re.search(r"mlx", image_name, re.IGNORECASE):
        return copy.deepcopy(types.TRAINER_CONFIGS[types.Framework.MLX])

    # TorchTune patterns (check before PyTorch to avoid conflicts)
    if re.search(r"torchtune", image_name, re.IGNORECASE):
        return copy.deepcopy(types.TRAINER_CONFIGS[types.Framework.TORCHTUNE])

    # PyTorch patterns - require explicit "pytorch" in image name for clarity
    if re.search(r"pytorch", image_name, re.IGNORECASE):
        return copy.deepcopy(types.TRAINER_CONFIGS[types.Framework.TORCH])

    return copy.deepcopy(types.DEFAULT_TRAINER)

Author:

I've simplified the function as per your suggestion; kindly check.
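The first-match-wins ordering in the suggested get_trainer_from_image can be sketched in isolation; the framework names are reduced to plain strings here, and the pattern table mirrors the checks quoted above:

```python
import re

# First-match-wins table mirroring the suggested get_trainer_from_image;
# the torchtune entry is kept before the pytorch one, matching the PR's ordering.
PATTERNS = [
    (r"deepspeed", "DEEPSPEED"),
    (r"mlx", "MLX"),
    (r"torchtune", "TORCHTUNE"),
    (r"pytorch", "TORCH"),
]

def guess_framework(image_name: str) -> str:
    for pattern, framework in PATTERNS:
        if re.search(pattern, image_name, re.IGNORECASE):
            return framework
    return "TORCH"  # fallback mirrors DEFAULT_TRAINER

print(guess_framework("ghcr.io/kubeflow/trainer/torchtune-trainer"))  # TORCHTUNE
print(guess_framework("pytorch/pytorch"))                             # TORCH
print(guess_framework("unknown/image"))                               # TORCH (fallback)
```

The real function returns deep copies of the Trainer objects rather than strings; the deep copy matters because get_runtime_trainer later mutates trainer.accelerator_count, and copies keep TRAINER_CONFIGS pristine.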



def get_runtime_trainer(
replicated_jobs: List[models.JobsetV1alpha2ReplicatedJob],
ml_policy: models.TrainerV1alpha1MLPolicy,
@@ -121,20 +181,23 @@ def get_runtime_trainer(
if not (trainer_container and trainer_container.image):
raise Exception(f"Runtime doesn't have trainer container {replicated_jobs}")

# Extract image name from the container image to get appropriate Trainer.
image_name = trainer_container.image.split(":")[0]
trainer = types.ALL_TRAINERS.get(image_name, types.DEFAULT_TRAINER)
# Use the new detection logic with fallback
trainer = detect_trainer(trainer_container)

# Get the container devices.
if devices := get_container_devices(trainer_container.resources):
_, trainer.accelerator_count = devices

# Torch and MPI plugins override accelerator count.
if ml_policy.torch and ml_policy.torch.num_proc_per_node:
# NOTE: The 'is not None' checks are essential because:
# 1. For torch: prevents AttributeError when accessing None.actual_instance
# 2. For MPI: prevents setting accelerator_count to None
# 3. Semantically: only override when user explicitly provides num_proc_per_node
if ml_policy.torch and ml_policy.torch.num_proc_per_node is not None:
num_proc = ml_policy.torch.num_proc_per_node.actual_instance
if isinstance(num_proc, int):
trainer.accelerator_count = num_proc
elif ml_policy.mpi and ml_policy.mpi.num_proc_per_node:
elif ml_policy.mpi and ml_policy.mpi.num_proc_per_node is not None:
Member:

Why do we need to add is not None here?

Author:

1. Torch Policy Check (if trainer_container.accelerator_count is not None)

# Essential: Prevents AttributeError when accessing None.actual_instance
# Without this check: None.actual_instance would raise AttributeError
if trainer_container.accelerator_count is not None:
    if hasattr(trainer_container.accelerator_count, 'actual_instance'):
        trainer.accelerator_count = trainer_container.accelerator_count.actual_instance

2. MPI Policy Check (if trainer_container.mpi_policy is not None)

# Essential: Prevents setting accelerator_count to None when user explicitly sets it
# Without this check: trainer.accelerator_count would be overwritten to None
if trainer_container.mpi_policy is not None:
    trainer.accelerator_count = trainer_container.mpi_policy.num_procs

3. Semantic Correctness

These checks ensure that:

  • User-provided values are preserved and not overwritten
  • We don't attempt operations on None objects
  • The logic follows "only apply changes if the field is actually set"

Code Comments Added:

I've added explanatory comments to each check to make their necessity clear for future maintainers.

Member:

@jskswamy I just meant that those two lines are the same in Python, aren't they?

elif ml_policy.mpi and ml_policy.mpi.num_proc_per_node:
elif ml_policy.mpi and ml_policy.mpi.num_proc_per_node is not None:

Author:

I think it's a subtle but important distinction in Python.

The is not None Check is Necessary

The current code is correct because 0 is a valid and meaningful value for num_proc_per_node:

# Current correct implementation
elif ml_policy.mpi and ml_policy.mpi.num_proc_per_node is not None:
    trainer.accelerator_count = ml_policy.mpi.num_proc_per_node

Why Truthiness Checking Would Break CPU-Only Training

If we used truthiness checking instead:

# This would be problematic
elif ml_policy.mpi and ml_policy.mpi.num_proc_per_node:
    trainer.accelerator_count = ml_policy.mpi.num_proc_per_node

Example Scenarios:

Scenario 1: CPU-Only Training (0 accelerators)

ml_policy.mpi.num_proc_per_node = 0  # Explicitly set to CPU-only

# With truthiness check:
if ml_policy.mpi and ml_policy.mpi.num_proc_per_node:  # 0 is falsy!
    trainer.accelerator_count = ml_policy.mpi.num_proc_per_node  # ❌ Never executes

# With is not None check:
if ml_policy.mpi and ml_policy.mpi.num_proc_per_node is not None:  # 0 is not None!
    trainer.accelerator_count = ml_policy.mpi.num_proc_per_node  # ✅ Executes correctly

Scenario 2: GPU Training (4 accelerators)

ml_policy.mpi.num_proc_per_node = 4  # Explicitly set to 4 GPUs

# Both approaches work correctly:
if ml_policy.mpi and ml_policy.mpi.num_proc_per_node:  # 4 is truthy ✅
if ml_policy.mpi and ml_policy.mpi.num_proc_per_node is not None:  # 4 is not None ✅

Scenario 3: Not Set (defaults to UNKNOWN)

ml_policy.mpi.num_proc_per_node = None  # Not explicitly set

# Both approaches work correctly:
if ml_policy.mpi and ml_policy.mpi.num_proc_per_node:  # None is falsy ✅
if ml_policy.mpi and ml_policy.mpi.num_proc_per_node is not None:  # None is None ✅

The Key Distinction

The is not None check properly distinguishes between:

  • "Not set" (None) → don't override accelerator count
  • "Explicitly set to 0" (0) → override with 0 (CPU-only training)
  • "Explicitly set to positive number" → override with that number
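The distinction can be demonstrated with a few lines of plain Python (should_override is a hypothetical helper for illustration, not part of the SDK):

```python
def should_override(num_proc_per_node):
    # Truthiness check: 0 is falsy, so an explicit CPU-only setting is skipped.
    truthy = bool(num_proc_per_node)
    # Identity check: only None (unset) is skipped; an explicit 0 still overrides.
    explicit = num_proc_per_node is not None
    return truthy, explicit

print(should_override(None))  # (False, False) -> neither form overrides
print(should_override(0))     # (False, True)  -> only `is not None` overrides
print(should_override(4))     # (True, True)   -> both forms override
```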

Member @andreyvelich (Jul 7, 2025):

But why is num_proc_per_node=0 a valid value?
We should not allow the user to set such a value, or we should treat it as None.

Member:

@jskswamy Did you get a chance to check this comment?

Author:

Sorry for the late reply! I've addressed this; kindly check the changes now.

trainer.accelerator_count = ml_policy.mpi.num_proc_per_node

# Multiply accelerator_count by the number of nodes.
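The isinstance guard on actual_instance in the torch branch above can be isolated into a sketch; NumProcPerNode here is a hypothetical stand-in for the generated anyOf wrapper, whose actual_instance may hold either an int or a string such as "auto":

```python
class NumProcPerNode:
    """Hypothetical stand-in for the generated anyOf model wrapper."""
    def __init__(self, actual_instance):
        self.actual_instance = actual_instance

def resolve_accelerator_count(current, num_proc):
    # Mirrors the torch branch: only an explicit integer overrides the
    # detected count; a string like "auto" and an unset value (None) leave it alone.
    if num_proc is not None and isinstance(num_proc.actual_instance, int):
        return num_proc.actual_instance
    return current

print(resolve_accelerator_count(2, NumProcPerNode(4)))       # 4
print(resolve_accelerator_count(2, NumProcPerNode("auto")))  # 2
print(resolve_accelerator_count(2, None))                    # 2
```

The `is not None` guard also prevents an AttributeError: without it, an unset policy would lead to `None.actual_instance`.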
@@ -212,7 +275,7 @@ def get_trainjob_node_step(
# TODO (andreyvelich): We should also override the device_count
# based on OMPI_MCA_orte_set_default_slots value. Right now, it is hard to do
# since we inject this env only to the Launcher Pod.
step.name = f"{constants.NODE}-{job_index+1}"
step.name = f"{constants.NODE}-{job_index + 1}"

if container.env:
for env in container.env: