diff --git a/charts/kubeflow-trainer/templates/manager/_helpers.tpl b/charts/kubeflow-trainer/templates/manager/_helpers.tpl
index 97b93a87ae..474c9bc9ed 100644
--- a/charts/kubeflow-trainer/templates/manager/_helpers.tpl
+++ b/charts/kubeflow-trainer/templates/manager/_helpers.tpl
@@ -55,3 +55,10 @@ Create the name of the manager deployment.
 {{- define "trainer.manager.service.name" -}}
 {{ include "trainer.manager.name" . }}
 {{- end -}}
+
+{{/*
+Create the name of the manager configmap.
+*/}}
+{{- define "trainer.manager.configmap.name" -}}
+{{ include "trainer.fullname" . }}-config
+{{- end -}}
diff --git a/charts/kubeflow-trainer/templates/manager/configmap.yaml b/charts/kubeflow-trainer/templates/manager/configmap.yaml
new file mode 100644
index 0000000000..6d3f016225
--- /dev/null
+++ b/charts/kubeflow-trainer/templates/manager/configmap.yaml
@@ -0,0 +1,69 @@
+{{- /*
+Copyright 2025 The Kubeflow authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ -}}
+{{- /*
+Boolean options below are rendered with `ne (toString ...) "false"` instead of
+`default true`: `default` treats false as empty, so an explicitly configured
+false would otherwise be rendered as true.
+*/ -}}
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "trainer.manager.configmap.name" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "trainer.manager.labels" . | nindent 4 }}
+data:
+  controller_manager_config.yaml: |
+    apiVersion: config.trainer.kubeflow.org/v1alpha1
+    kind: Configuration
+    # Health configuration
+    health:
+      healthProbeBindAddress: {{ .Values.manager.config.health.healthProbeBindAddress | default ":8081" }}
+      readinessEndpointName: {{ .Values.manager.config.health.readinessEndpointName | default "readyz" }}
+      livenessEndpointName: {{ .Values.manager.config.health.livenessEndpointName | default "healthz" }}
+
+    # Metrics configuration
+    metrics:
+      bindAddress: {{ .Values.manager.config.metrics.bindAddress | default ":8443" }}
+      secureServing: {{ ne (toString .Values.manager.config.metrics.secureServing) "false" }}
+
+    # Webhook configuration
+    webhook:
+      port: {{ .Values.manager.config.webhook.port | default 9443 }}
+      host: {{ .Values.manager.config.webhook.host | default "\"\"" }}
+
+    # Leader election configuration
+    leaderElection:
+      leaderElect: {{ ne (toString .Values.manager.config.leaderElection.leaderElect) "false" }}
+      resourceName: {{ .Values.manager.config.leaderElection.resourceName | default "trainer.kubeflow.org" }}
+      resourceNamespace: {{ .Values.manager.config.leaderElection.resourceNamespace | default "\"\"" }}
+      leaseDuration: {{ .Values.manager.config.leaderElection.leaseDuration | default "15s" }}
+      renewDeadline: {{ .Values.manager.config.leaderElection.renewDeadline | default "10s" }}
+      retryPeriod: {{ .Values.manager.config.leaderElection.retryPeriod | default "2s" }}
+
+    # Controller configuration
+    controller:
+      groupKindConcurrency:
+        TrainJob.trainer.kubeflow.org: {{ .Values.manager.config.controller.groupKindConcurrency.trainJob | default 5 }}
+        TrainingRuntime.trainer.kubeflow.org: {{ .Values.manager.config.controller.groupKindConcurrency.trainingRuntime | default 1 }}
+        ClusterTrainingRuntime.trainer.kubeflow.org: {{ .Values.manager.config.controller.groupKindConcurrency.clusterTrainingRuntime | default 1 }}
+
+    # Certificate management configuration
+    certManagement:
+      enable: {{ ne (toString .Values.manager.config.certManagement.enable) "false" }}
+      webhookServiceName: {{ if .Values.manager.config.certManagement.webhookServiceName }}{{ .Values.manager.config.certManagement.webhookServiceName }}{{ else }}{{ include "trainer.webhook.service.name" . }}{{ end }}
+      webhookSecretName: {{ if .Values.manager.config.certManagement.webhookSecretName }}{{ .Values.manager.config.certManagement.webhookSecretName }}{{ else }}{{ include "trainer.webhook.secret.name" . }}{{ end }}
diff --git a/charts/kubeflow-trainer/templates/manager/deployment.yaml b/charts/kubeflow-trainer/templates/manager/deployment.yaml
index 131177c73a..e486e3ba0a 100644
--- a/charts/kubeflow-trainer/templates/manager/deployment.yaml
+++ b/charts/kubeflow-trainer/templates/manager/deployment.yaml
@@ -40,11 +40,7 @@ spec:
         command:
         - /manager
         args:
-        {{- if gt (.Values.manager.replicas | int) 1 }}
-        - --leader-elect=true
-        {{- end }}
-        - --webhook-service-name={{ include "trainer.webhook.service.name" . }}
-        - --webhook-secret-name={{ include "trainer.webhook.secret.name" . }}
+        - --config=/controller_manager_config.yaml
         {{- with .Values.manager.env }}
         env:
         {{- toYaml . | nindent 8 }}
@@ -57,6 +53,10 @@ spec:
         - name: webhook-cert
           mountPath: /tmp/k8s-webhook-server/serving-certs
           readOnly: true
+        - name: manager-config
+          mountPath: /controller_manager_config.yaml
+          subPath: controller_manager_config.yaml
+          readOnly: true
         {{- with .Values.manager.volumeMounts }}
         {{- toYaml . | nindent 8 }}
         {{- end }}
@@ -91,6 +91,9 @@ spec:
         secret:
           secretName: {{ include "trainer.webhook.secret.name" . }}
           defaultMode: 420
+      - name: manager-config
+        configMap:
+          name: {{ include "trainer.manager.configmap.name" . }}
       {{- with .Values.manager.volumes }}
       {{- toYaml . | nindent 6 }}
       {{- end }}
diff --git a/charts/kubeflow-trainer/tests/manager/configmap_test.yaml b/charts/kubeflow-trainer/tests/manager/configmap_test.yaml
new file mode 100644
index 0000000000..ceac2c49f0
--- /dev/null
+++ b/charts/kubeflow-trainer/tests/manager/configmap_test.yaml
@@ -0,0 +1,95 @@
+suite: test manager configmap
+templates:
+  - manager/configmap.yaml
+release:
+  name: kubeflow-trainer
+  namespace: kubeflow-system
+tests:
+  - it: should create configmap with default values
+    asserts:
+      - isKind:
+          of: ConfigMap
+      - equal:
+          path: metadata.name
+          value: kubeflow-trainer-config
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "apiVersion: config.trainer.kubeflow.org/v1alpha1"
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "kind: Configuration"
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "healthProbeBindAddress: :8081"
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "bindAddress: :8443"
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "port: 9443"
+
+  - it: should use custom webhook service and secret names from config
+    set:
+      manager:
+        config:
+          certManagement:
+            webhookServiceName: "custom-webhook-service"
+            webhookSecretName: "custom-webhook-secret"
+    asserts:
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "webhookServiceName: custom-webhook-service"
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "webhookSecretName: custom-webhook-secret"
+
+  - it: should auto-generate webhook service and secret names when not provided
+    asserts:
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "webhookServiceName: kubeflow-trainer-controller-manager"
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "webhookSecretName: kubeflow-trainer-webhook-cert"
+
+  - it: should enable leader election by default
+    asserts:
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "leaderElect: true"
+
+  - it: should allow disabling leader election
+    set:
+      manager.config.leaderElection.leaderElect: false
+    asserts:
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "leaderElect: false"
+
+  - it: should use custom controller concurrency values
+    set:
+      manager.config.controller.groupKindConcurrency.trainJob: 10
+      manager.config.controller.groupKindConcurrency.trainingRuntime: 5
+      manager.config.controller.groupKindConcurrency.clusterTrainingRuntime: 3
+    asserts:
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "TrainJob.trainer.kubeflow.org: 10"
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "TrainingRuntime.trainer.kubeflow.org: 5"
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "ClusterTrainingRuntime.trainer.kubeflow.org: 3"
+
+  - it: should allow disabling secure metrics serving and cert management
+    set:
+      manager.config.metrics.secureServing: false
+      manager.config.certManagement.enable: false
+    asserts:
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "secureServing: false"
+      - matchRegex:
+          path: data["controller_manager_config.yaml"]
+          pattern: "enable: false"
diff --git a/charts/kubeflow-trainer/values.yaml b/charts/kubeflow-trainer/values.yaml
index 2ca07805ca..d68cdbaa21 100644
--- a/charts/kubeflow-trainer/values.yaml
+++ b/charts/kubeflow-trainer/values.yaml
@@ -101,6 +101,37 @@ manager:
     seccompProfile:
       type: RuntimeDefault
 
+  # -- Controller manager configuration.
+  # This configuration is used to generate the ConfigMap for the controller manager.
+  config:
+    health:
+      healthProbeBindAddress: ":8081"
+      readinessEndpointName: "readyz"
+      livenessEndpointName: "healthz"
+    metrics:
+      bindAddress: ":8443"
+      secureServing: true
+    webhook:
+      port: 9443
+      host: ""
+    leaderElection:
+      leaderElect: true
+      resourceName: "trainer.kubeflow.org"
+      resourceNamespace: ""
+      leaseDuration: "15s"
+      renewDeadline: "10s"
+      retryPeriod: "2s"
+    controller:
+      groupKindConcurrency:
+        trainJob: 5
+        trainingRuntime: 1
+        clusterTrainingRuntime: 1
+    certManagement:
+      enable: true
+      # webhookServiceName and webhookSecretName are auto-generated if not specified
+      webhookServiceName: ""
+      webhookSecretName: ""
+
 webhook:
   # -- Specifies how unrecognized errors are handled.
   # Available options are `Ignore` or `Fail`.
diff --git a/manifests/base/manager/kustomization.yaml b/manifests/base/manager/kustomization.yaml
index d85c43a0cb..9ddc848c0d 100644
--- a/manifests/base/manager/kustomization.yaml
+++ b/manifests/base/manager/kustomization.yaml
@@ -1,6 +1,10 @@
 resources:
 - manager.yaml
 
+# Disable hash suffix for predictable ConfigMap names
+generatorOptions:
+  disableNameSuffixHash: true
+
 # ConfigMap generator for controller manager configuration
 configMapGenerator:
 - name: kubeflow-trainer-config