Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions charts/kubeflow-trainer/templates/manager/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,10 @@ Create the name of the manager deployment.
{{- define "trainer.manager.service.name" -}}
{{ include "trainer.manager.name" . }}
{{- end -}}

{{/*
Name of the ConfigMap that carries the controller manager configuration:
the chart fullname with a "-config" suffix.
*/}}
{{- define "trainer.manager.configmap.name" -}}
{{- printf "%s-config" (include "trainer.fullname" .) -}}
{{- end -}}
64 changes: 64 additions & 0 deletions charts/kubeflow-trainer/templates/manager/configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{{- /*
Copyright 2025 The Kubeflow authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/ -}}

{{- /*
ConfigMap holding the controller manager configuration file
(controller_manager_config.yaml), rendered from .Values.manager.config.

Boolean options are defaulted with `kindIs "invalid"` (i.e. only when the value
is absent) instead of `default true`: `default` treats an explicit `false` as
empty and would silently turn it back into `true`, making the option impossible
to disable.
*/}}
{{- $cfg := .Values.manager.config -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "trainer.manager.configmap.name" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "trainer.manager.labels" . | nindent 4 }}
data:
  controller_manager_config.yaml: |
    apiVersion: config.trainer.kubeflow.org/v1alpha1
    kind: Configuration
    # Health configuration
    health:
      healthProbeBindAddress: {{ $cfg.health.healthProbeBindAddress | default ":8081" }}
      readinessEndpointName: {{ $cfg.health.readinessEndpointName | default "readyz" }}
      livenessEndpointName: {{ $cfg.health.livenessEndpointName | default "healthz" }}

    # Metrics configuration
    metrics:
      bindAddress: {{ $cfg.metrics.bindAddress | default ":8443" }}
      secureServing: {{ ternary true $cfg.metrics.secureServing (kindIs "invalid" $cfg.metrics.secureServing) }}

    # Webhook configuration
    webhook:
      port: {{ $cfg.webhook.port | default 9443 }}
      # Always quoted so an empty or special-character host stays a string.
      host: {{ $cfg.webhook.host | default "" | quote }}

    # Leader election configuration
    leaderElection:
      leaderElect: {{ ternary true $cfg.leaderElection.leaderElect (kindIs "invalid" $cfg.leaderElection.leaderElect) }}
      resourceName: {{ $cfg.leaderElection.resourceName | default "trainer.kubeflow.org" }}
      resourceNamespace: {{ $cfg.leaderElection.resourceNamespace | default "" | quote }}
      leaseDuration: {{ $cfg.leaderElection.leaseDuration | default "15s" }}
      renewDeadline: {{ $cfg.leaderElection.renewDeadline | default "10s" }}
      retryPeriod: {{ $cfg.leaderElection.retryPeriod | default "2s" }}

    # Controller configuration
    controller:
      groupKindConcurrency:
        TrainJob.trainer.kubeflow.org: {{ $cfg.controller.groupKindConcurrency.trainJob | default 5 }}
        TrainingRuntime.trainer.kubeflow.org: {{ $cfg.controller.groupKindConcurrency.trainingRuntime | default 1 }}
        ClusterTrainingRuntime.trainer.kubeflow.org: {{ $cfg.controller.groupKindConcurrency.clusterTrainingRuntime | default 1 }}

    # Certificate management configuration
    certManagement:
      enable: {{ ternary true $cfg.certManagement.enable (kindIs "invalid" $cfg.certManagement.enable) }}
      # Fall back to the chart-generated names when no override is given.
      webhookServiceName: {{ $cfg.certManagement.webhookServiceName | default (include "trainer.webhook.service.name" .) }}
      webhookSecretName: {{ $cfg.certManagement.webhookSecretName | default (include "trainer.webhook.secret.name" .) }}
13 changes: 8 additions & 5 deletions charts/kubeflow-trainer/templates/manager/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,7 @@ spec:
command:
- /manager
args:
{{- if gt (.Values.manager.replicas | int) 1 }}
- --leader-elect=true
{{- end }}
- --webhook-service-name={{ include "trainer.webhook.service.name" . }}
- --webhook-secret-name={{ include "trainer.webhook.secret.name" . }}
- --config=/controller_manager_config.yaml
{{- with .Values.manager.env }}
env:
{{- toYaml . | nindent 8 }}
Expand All @@ -57,6 +53,10 @@ spec:
- name: webhook-cert
mountPath: /tmp/k8s-webhook-server/serving-certs
readOnly: true
# Mount only the controller_manager_config.yaml key of the manager-config
# volume as a single file, at the path passed to the manager via --config.
# NOTE(review): Kubernetes documents that subPath mounts do not receive
# ConfigMap updates, so config changes presumably require a pod restart — confirm.
- name: manager-config
mountPath: /controller_manager_config.yaml
subPath: controller_manager_config.yaml
readOnly: true
{{- with .Values.manager.volumeMounts }}
{{- toYaml . | nindent 8 }}
{{- end }}
Expand Down Expand Up @@ -91,6 +91,9 @@ spec:
secret:
secretName: {{ include "trainer.webhook.secret.name" . }}
defaultMode: 420
# Volume backed by the manager ConfigMap; the ConfigMap name is resolved by
# the "trainer.manager.configmap.name" helper.
- name: manager-config
configMap:
name: {{ include "trainer.manager.configmap.name" . }}
{{- with .Values.manager.volumes }}
{{- toYaml . | nindent 6 }}
{{- end }}
Expand Down
83 changes: 83 additions & 0 deletions charts/kubeflow-trainer/tests/manager/configmap_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Unit tests (helm-unittest) for the controller manager ConfigMap template.
#
# Regex patterns are anchored to a whole line with (?m)^...$ and use escaped
# dots, so that e.g. the TrainingRuntime pattern cannot be satisfied by the
# ClusterTrainingRuntime line and ": 10" cannot be satisfied by ": 100".
# Patterns are single-quoted so backslashes reach the regex engine verbatim.
suite: test manager configmap
templates:
  - manager/configmap.yaml
release:
  name: kubeflow-trainer
  namespace: kubeflow-system
tests:
  - it: should create configmap with default values
    asserts:
      - isKind:
          of: ConfigMap
      - equal:
          path: metadata.name
          value: kubeflow-trainer-config
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*apiVersion: config\.trainer\.kubeflow\.org/v1alpha1\s*$'
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*kind: Configuration\s*$'
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*healthProbeBindAddress: :8081\s*$'
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*bindAddress: :8443\s*$'
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*port: 9443\s*$'

  - it: should use custom webhook service and secret names from config
    set:
      manager:
        config:
          certManagement:
            webhookServiceName: "custom-webhook-service"
            webhookSecretName: "custom-webhook-secret"
    asserts:
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*webhookServiceName: custom-webhook-service\s*$'
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*webhookSecretName: custom-webhook-secret\s*$'

  # The generated names come from chart helpers, so only the expected prefix
  # is pinned here (no end-of-line anchor).
  - it: should auto-generate webhook service and secret names when not provided
    asserts:
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*webhookServiceName: kubeflow-trainer-controller-manager'
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*webhookSecretName: kubeflow-trainer-webhook-cert'

  - it: should enable leader election by default
    asserts:
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*leaderElect: true\s*$'

  # Guards against the template's defaulting swallowing an explicit false.
  - it: should allow disabling leader election
    set:
      manager.config.leaderElection.leaderElect: false
    asserts:
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*leaderElect: false\s*$'

  - it: should use custom controller concurrency values
    set:
      manager.config.controller.groupKindConcurrency.trainJob: 10
      manager.config.controller.groupKindConcurrency.trainingRuntime: 5
      manager.config.controller.groupKindConcurrency.clusterTrainingRuntime: 3
    asserts:
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*TrainJob\.trainer\.kubeflow\.org: 10\s*$'
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*TrainingRuntime\.trainer\.kubeflow\.org: 5\s*$'
      - matchRegex:
          path: data["controller_manager_config.yaml"]
          pattern: '(?m)^\s*ClusterTrainingRuntime\.trainer\.kubeflow\.org: 3\s*$'
30 changes: 30 additions & 0 deletions charts/kubeflow-trainer/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,36 @@ manager:
seccompProfile:
type: RuntimeDefault

# -- Controller manager configuration.
# This configuration is used to generate the ConfigMap for the controller manager
# (rendered as controller_manager_config.yaml).
config:
  # Health probe settings.
  health:
    healthProbeBindAddress: ":8081"
    readinessEndpointName: "readyz"
    livenessEndpointName: "healthz"
  # Metrics endpoint settings.
  metrics:
    bindAddress: ":8443"
    secureServing: true
  # Webhook server settings.
  webhook:
    port: 9443
    host: ""
  # Leader election settings.
  leaderElection:
    # Rendered as leaderElection.leaderElect; the template falls back to true
    # when this key is unset.
    leaderElect: true
    resourceName: "trainer.kubeflow.org"
    resourceNamespace: ""
    leaseDuration: "15s"
    renewDeadline: "10s"
    retryPeriod: "2s"
  # Per-kind concurrency, rendered under controller.groupKindConcurrency as
  # <Kind>.trainer.kubeflow.org entries.
  controller:
    groupKindConcurrency:
      trainJob: 5
      trainingRuntime: 1
      clusterTrainingRuntime: 1
  # Certificate management settings.
  certManagement:
    enable: true
    # webhookServiceName and webhookSecretName are auto-generated if not specified
    webhookServiceName: ""
    webhookSecretName: ""

webhook:
# -- Specifies how unrecognized errors are handled.
# Available options are `Ignore` or `Fail`.
Expand Down
4 changes: 4 additions & 0 deletions manifests/base/manager/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
resources:
- manager.yaml

# Disable the content-hash name suffix so generated ConfigMaps keep their
# declared names (e.g. kubeflow-trainer-config below) and stay predictable.
generatorOptions:
disableNameSuffixHash: true

# ConfigMap generator for controller manager configuration
configMapGenerator:
- name: kubeflow-trainer-config
Expand Down
Loading