diff --git a/api/openapi-spec/swagger.json b/api/openapi-spec/swagger.json index a84c853173..043cfef8c1 100644 --- a/api/openapi-spec/swagger.json +++ b/api/openapi-spec/swagger.json @@ -14272,7 +14272,7 @@ "type": "string" }, "podSpecOverrides": { - "description": "Custom overrides for the training runtime.", + "description": "Custom overrides for the training runtime. When multiple overrides apply to the same targetJob, later entries in the slice override earlier field values.", "type": "array", "items": { "default": {}, diff --git a/api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_train_job_spec.py b/api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_train_job_spec.py index 24ad3f523d..a78150d7c7 100644 --- a/api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_train_job_spec.py +++ b/api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_train_job_spec.py @@ -34,7 +34,7 @@ class TrainerV1alpha1TrainJobSpec(BaseModel): initializer: Optional[TrainerV1alpha1Initializer] = Field(default=None, description="Configuration of the initializer.") labels: Optional[Dict[str, StrictStr]] = Field(default=None, description="Labels to apply for the derivative JobSet and Jobs. They will be merged with the TrainingRuntime values.") managed_by: Optional[StrictStr] = Field(default=None, description="ManagedBy is used to indicate the controller or entity that manages a TrainJob. The value must be either an empty, `trainer.kubeflow.org/trainjob-controller` or `kueue.x-k8s.io/multikueue`. The built-in TrainJob controller reconciles TrainJob which don't have this field at all or the field value is the reserved string `trainer.kubeflow.org/trainjob-controller`, but delegates reconciling TrainJobs with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. Defaults to `trainer.kubeflow.org/trainjob-controller`", alias="managedBy") - pod_spec_overrides: Optional[List[TrainerV1alpha1PodSpecOverride]] = Field(default=None, description="Custom overrides for the training runtime.", alias="podSpecOverrides") + pod_spec_overrides: Optional[List[TrainerV1alpha1PodSpecOverride]] = Field(default=None, description="Custom overrides for the training runtime. When multiple overrides apply to the same targetJob, later entries in the slice override earlier field values.", alias="podSpecOverrides") runtime_ref: TrainerV1alpha1RuntimeRef = Field(description="Reference to the training runtime. The field is immutable.", alias="runtimeRef") suspend: Optional[StrictBool] = Field(default=None, description="Whether the controller should suspend the running TrainJob. Defaults to false.") trainer: Optional[TrainerV1alpha1Trainer] = Field(default=None, description="Configuration of the trainer.") diff --git a/charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainjobs.yaml b/charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainjobs.yaml index ebd8c5e8c7..c39360cdb9 100644 --- a/charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainjobs.yaml +++ b/charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainjobs.yaml @@ -454,7 +454,9 @@ spec: - message: ManagedBy value is immutable rule: self == oldSelf podSpecOverrides: - description: Custom overrides for the training runtime. + description: |- + Custom overrides for the training runtime. + When multiple overrides apply to the same targetJob, later entries in the slice override earlier field values. items: description: PodSpecOverride represents the custom overrides that will be applied for the TrainJob's resources. diff --git a/manifests/base/crds/trainer.kubeflow.org_trainjobs.yaml b/manifests/base/crds/trainer.kubeflow.org_trainjobs.yaml index ebd8c5e8c7..c39360cdb9 100644 --- a/manifests/base/crds/trainer.kubeflow.org_trainjobs.yaml +++ b/manifests/base/crds/trainer.kubeflow.org_trainjobs.yaml @@ -454,7 +454,9 @@ spec: - message: ManagedBy value is immutable rule: self == oldSelf podSpecOverrides: - description: Custom overrides for the training runtime. + description: |- + Custom overrides for the training runtime. + When multiple overrides apply to the same targetJob, later entries in the slice override earlier field values. items: description: PodSpecOverride represents the custom overrides that will be applied for the TrainJob's resources. diff --git a/pkg/apis/trainer/v1alpha1/trainjob_types.go b/pkg/apis/trainer/v1alpha1/trainjob_types.go index e37bde9468..9ab84c6fd4 100644 --- a/pkg/apis/trainer/v1alpha1/trainjob_types.go +++ b/pkg/apis/trainer/v1alpha1/trainjob_types.go @@ -113,6 +113,7 @@ type TrainJobSpec struct { Annotations map[string]string `json:"annotations,omitempty"` // Custom overrides for the training runtime. + // When multiple overrides apply to the same targetJob, later entries in the slice override earlier field values. // +listType=atomic PodSpecOverrides []PodSpecOverride `json:"podSpecOverrides,omitempty"` diff --git a/pkg/apis/trainer/v1alpha1/zz_generated.openapi.go b/pkg/apis/trainer/v1alpha1/zz_generated.openapi.go index 1d3a7a232c..52fa070773 100644 --- a/pkg/apis/trainer/v1alpha1/zz_generated.openapi.go +++ b/pkg/apis/trainer/v1alpha1/zz_generated.openapi.go @@ -1475,7 +1475,7 @@ func schema_pkg_apis_trainer_v1alpha1_TrainJobSpec(ref common.ReferenceCallback) }, }, SchemaProps: spec.SchemaProps{ - Description: "Custom overrides for the training runtime.", + Description: "Custom overrides for the training runtime. When multiple overrides apply to the same targetJob, later entries in the slice override earlier field values.", Type: []string{"array"}, Items: &spec.SchemaOrArray{ Schema: &spec.Schema{ diff --git a/pkg/runtime/core/trainingruntime_test.go b/pkg/runtime/core/trainingruntime_test.go index 4266341907..9f1e58bde7 100644 --- a/pkg/runtime/core/trainingruntime_test.go +++ b/pkg/runtime/core/trainingruntime_test.go @@ -634,6 +634,81 @@ func TestTrainingRuntimeNewObjects(t *testing.T) { Obj(), }, }, + "succeeded to build JobSet with TrainJob's PodSpecOverrides containing duplicate TargetJobs": { + trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec( + testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec). + Container(constants.Node, constants.Node, "test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests). + Obj(), + ).Obj(), + trainJob: testingutil.MakeTrainJobWrapper(metav1.NamespaceDefault, "test-job"). + UID("uid"). + RuntimeRef(trainer.SchemeGroupVersion.WithKind(trainer.TrainingRuntimeKind), "test-runtime"). + PodSpecOverrides([]trainer.PodSpecOverride{ + { + TargetJobs: []trainer.PodSpecOverrideTargetJob{{Name: constants.Node}}, + NodeSelector: map[string]string{ + "node.kubernetes.io/instance-type": "p5.48xlarge", + }, + }, + { + TargetJobs: []trainer.PodSpecOverrideTargetJob{{Name: constants.Node}}, + ServiceAccountName: ptr.To("test-sa"), + }, + }). + Obj(), + wantObjs: []runtime.Object{ + testingutil.MakeJobSetWrapper(metav1.NamespaceDefault, "test-job"). + ControllerReference(trainer.SchemeGroupVersion.WithKind(trainer.TrainJobKind), "test-job", "uid"). + Replicas(1, constants.DatasetInitializer, constants.ModelInitializer, constants.Node). + Parallelism(1, constants.DatasetInitializer, constants.ModelInitializer, constants.Node). + Completions(1, constants.DatasetInitializer, constants.ModelInitializer, constants.Node). + Container(constants.Node, constants.Node, "test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests). + NodeSelector(constants.Node, + map[string]string{ + "node.kubernetes.io/instance-type": "p5.48xlarge", + }). + ServiceAccountName(constants.Node, "test-sa"). + Obj(), + }, + }, + "succeeded to build JobSet with TrainJob's PodSpecOverrides targeting the same job with different values": { + trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec( + testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec). + Container(constants.Node, constants.Node, "test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests). + Obj(), + ).Obj(), + trainJob: testingutil.MakeTrainJobWrapper(metav1.NamespaceDefault, "test-job"). + UID("uid"). + RuntimeRef(trainer.SchemeGroupVersion.WithKind(trainer.TrainingRuntimeKind), "test-runtime"). + PodSpecOverrides([]trainer.PodSpecOverride{ + { + TargetJobs: []trainer.PodSpecOverrideTargetJob{{Name: constants.Node}}, + NodeSelector: map[string]string{ + "node.kubernetes.io/instance-type": "p5.48xlarge", + }, + }, + { + TargetJobs: []trainer.PodSpecOverrideTargetJob{{Name: constants.Node}}, + NodeSelector: map[string]string{ + "node.kubernetes.io/instance-type": "p5en.48xlarge", + }, + }, + }). + Obj(), + wantObjs: []runtime.Object{ + testingutil.MakeJobSetWrapper(metav1.NamespaceDefault, "test-job"). + ControllerReference(trainer.SchemeGroupVersion.WithKind(trainer.TrainJobKind), "test-job", "uid"). + Replicas(1, constants.DatasetInitializer, constants.ModelInitializer, constants.Node). + Parallelism(1, constants.DatasetInitializer, constants.ModelInitializer, constants.Node). + Completions(1, constants.DatasetInitializer, constants.ModelInitializer, constants.Node). + Container(constants.Node, constants.Node, "test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests). + NodeSelector(constants.Node, + map[string]string{ + "node.kubernetes.io/instance-type": "p5en.48xlarge", + }). + Obj(), + }, + }, "succeeded to build JobSet with dataset and model initializer from the TrainJob.": { trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec( testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec). diff --git a/pkg/runtime/framework/plugins/jobset/jobset.go b/pkg/runtime/framework/plugins/jobset/jobset.go index e829188fd8..05920989d7 100644 --- a/pkg/runtime/framework/plugins/jobset/jobset.go +++ b/pkg/runtime/framework/plugins/jobset/jobset.go @@ -121,16 +121,7 @@ func (j *JobSet) Validate(ctx context.Context, info *runtime.Info, oldObj, newOb allErrs = append(allErrs, j.checkPodSpecOverridesImmutability(ctx, oldObj, newObj)...) // TODO (andreyvelich): Validate Volumes, VolumeMounts, and Tolerations. - targetJobNames := sets.New[string]() for _, podSpecOverride := range newObj.Spec.PodSpecOverrides { - // Validate that there are no duplicate target job names within the same PodSpecOverride - for _, targetJob := range podSpecOverride.TargetJobs { - if targetJobNames.Has(targetJob.Name) { - allErrs = append(allErrs, field.Duplicate(podSpecOverridePath, targetJob.Name)) - } - targetJobNames.Insert(targetJob.Name) - } - for _, targetJob := range podSpecOverride.TargetJobs { containers, ok := rJobContainerNames[targetJob.Name] if !ok { diff --git a/test/integration/webhooks/trainjob_test.go b/test/integration/webhooks/trainjob_test.go index fca92ea99a..c9e2a215b3 100644 --- a/test/integration/webhooks/trainjob_test.go +++ b/test/integration/webhooks/trainjob_test.go @@ -351,9 +351,9 @@ var _ = ginkgo.Describe("TrainJob marker validations and defaulting", ginkgo.Ord Obj() }, testingutil.BeInvalidError()), - ginkgo.Entry("Should fail in creating trainJob with podSpecOverrides have duplicated targetJob", + ginkgo.Entry("Should succeed to create trainJob with podSpecOverrides containing duplicate targetJob", func() *trainer.TrainJob { - return testingutil.MakeTrainJobWrapper(ns.Name, "invalid-pod-spec-overrides"). + return testingutil.MakeTrainJobWrapper(ns.Name, "duplicated-podspecoverrides-target-jobs"). RuntimeRef(trainer.GroupVersion.WithKind(trainer.TrainingRuntimeKind), "testing"). PodSpecOverrides([]trainer.PodSpecOverride{ { @@ -367,7 +367,7 @@ var _ = ginkgo.Describe("TrainJob marker validations and defaulting", ginkgo.Ord }). Obj() }, - testingutil.BeForbiddenError()), + gomega.Succeed()), ) ginkgo.DescribeTable("Defaulting TrainJob on creation", func(trainJob func() *trainer.TrainJob, wantTrainJob func() *trainer.TrainJob) { created := trainJob()