Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions examples/v1beta1/kubeflow-trainer/trainjob-pytorch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
---
# Katib Experiment that searches over `lr` and `momentum` with the random
# algorithm; every trial is launched as a TrainJob (trainer.kubeflow.org/v1alpha1).
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
namespace: kubeflow
name: torch-distributed-example
spec:
# Trial budget: up to 3 trials in parallel, 12 trials in total, 3 failed trials allowed.
parallelTrialCount: 3
maxTrialCount: 12
maxFailedTrialCount: 3
# Objective: minimize the metric named `loss`, with a goal value of 0.001.
objective:
type: minimize
goal: 0.001
objectiveMetricName: loss
algorithm:
algorithmName: random
# Search space. Bounds are written as quoted strings.
parameters:
- name: lr
parameterType: double
feasibleSpace:
min: "0.01"
max: "0.05"
- name: momentum
parameterType: double
feasibleSpace:
min: "0.5"
max: "0.9"
trialTemplate:
primaryContainerName: node
# Each trialParameter maps a search-space parameter (`reference`) to a
# placeholder name that trialSpec uses as ${trialParameters.<name>}.
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- name: momentum
description: Momentum for the training model
reference: momentum
# Template of the TrainJob created for each trial.
trialSpec:
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
spec:
runtimeRef:
name: torch-distributed
trainer:
numNodes: 2
image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
- "--epochs=1"
# The two flags below receive the values sampled for `lr` and `momentum`.
- "--lr=${trialParameters.learningRate}"
- "--momentum=${trialParameters.momentum}"
16 changes: 16 additions & 0 deletions manifests/v1beta1/components/controller/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,22 @@ rules:
- "watch"
- "create"
- "delete"
- apiGroups:
- jobset.x-k8s.io
resources:
- jobsets
verbs:
- "get"
- "list"
- "watch"
- apiGroups:
- trainer.kubeflow.org
resources:
- trainjobs
verbs:
- "get"
- "list"
- "watch"
- apiGroups:
- kubeflow.org
resources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ init:
controller:
webhookPort: 8443
trialResources:
- TrainJob.v1alpha1.trainer.kubeflow.org
- Job.v1.batch
- TFJob.v1.kubeflow.org
- PyTorchJob.v1.kubeflow.org
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ init:
controller:
webhookPort: 8443
trialResources:
- TrainJob.v1alpha1.trainer.kubeflow.org
- Job.v1.batch
- TFJob.v1.kubeflow.org
- PyTorchJob.v1.kubeflow.org
Expand Down
10 changes: 10 additions & 0 deletions pkg/apis/controller/experiments/v1beta1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,27 @@ const (

// DefaultKubeflowJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Training Job.
DefaultKubeflowJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"

// DefaultTrainJobSuccessCondition is the default value of spec.trialTemplate.successCondition for Kubeflow Trainer TrainJob.
DefaultTrainJobSuccessCondition = "status.conditions.#(type==\"Complete\")#|#(status==\"True\")#"

// DefaultTrainJobFailureCondition is the default value of spec.trialTemplate.failureCondition for Kubeflow Trainer TrainJob.
DefaultTrainJobFailureCondition = "status.conditions.#(type==\"Failed\")#|#(status==\"True\")#"
)

var (
// DefaultKubeflowJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Training Job.
DefaultKubeflowJobPrimaryPodLabels = map[string]string{"training.kubeflow.org/job-role": "master"}

// DefaultTrainJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Trainer TrainJob.
// It matches pods labeled with replicated job name "node" and job completion index "0".
DefaultTrainJobPrimaryPodLabels = map[string]string{"jobset.sigs.k8s.io/replicatedjob-name": "node", "batch.kubernetes.io/job-completion-index": "0"}

// KubeflowJobKinds is the set of job kinds that receive Kubeflow-specific trial template defaults.
// setDefaultTrialTemplate tests membership by key presence only. "TrainJob" is a member, but it
// gets the DefaultTrainJob* defaults instead of the DefaultKubeflowJob* ones.
KubeflowJobKinds = map[string]bool{
"TFJob": true,
"PyTorchJob": true,
"XGBoostJob": true,
"MPIJob": true,
"TrainJob": true,
}
)
19 changes: 16 additions & 3 deletions pkg/apis/controller/experiments/v1beta1/experiment_defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,27 @@ func (e *Experiment) setDefaultTrialTemplate() {
}
} else if _, ok := KubeflowJobKinds[jobKind]; ok {
if t.SuccessCondition == "" {
t.SuccessCondition = DefaultKubeflowJobSuccessCondition
if jobKind == "TrainJob" {
t.SuccessCondition = DefaultTrainJobSuccessCondition
} else {
t.SuccessCondition = DefaultKubeflowJobSuccessCondition
}
}
if t.FailureCondition == "" {
t.FailureCondition = DefaultKubeflowJobFailureCondition
if jobKind == "TrainJob" {
t.FailureCondition = DefaultTrainJobFailureCondition
} else {
t.FailureCondition = DefaultKubeflowJobFailureCondition
}
}
// For Kubeflow Job also set default PrimaryPodLabels
if len(t.PrimaryPodLabels) == 0 {
t.PrimaryPodLabels = DefaultKubeflowJobPrimaryPodLabels
if jobKind == "TrainJob" {
t.PrimaryPodLabels = DefaultTrainJobPrimaryPodLabels
} else {
t.PrimaryPodLabels = DefaultKubeflowJobPrimaryPodLabels
}

}
}
}
Expand Down
Loading