From bf59ca38142a2bc7ea383886d2c5d87ebf70fcd8 Mon Sep 17 00:00:00 2001 From: Joao Cardoso Date: Mon, 28 Oct 2024 14:16:14 +0100 Subject: [PATCH 1/4] fix(manifests): Updating manifests since now storage class is defined at the experiment level, instead of during cluster import --- manifest.yaml | 6 ++++++ manifests/manifest.jax.sample.yaml | 9 +++++++-- manifests/manifest.kuberay.sample.yaml | 7 ++++++- manifests/manifest.pytorch.sample.yaml | 6 ++++++ manifests/manifest.tf.sample.yaml | 6 ++++++ manifests/manifest.xgboost.sample.yaml | 6 ++++++ 6 files changed, 37 insertions(+), 3 deletions(-) diff --git a/manifest.yaml b/manifest.yaml index db50f41..66dca9f 100644 --- a/manifest.yaml +++ b/manifest.yaml @@ -11,6 +11,12 @@ spec: image: image command: "python3 -u main.py --operator=tf --sleep=300 --tb-write=True" + storage: + sharedVolume: + mountPoint: "/mnt/shared" + sizeGB: 16 + storageClass: longhorn-xfs + tensorboard: enabled: true diff --git a/manifests/manifest.jax.sample.yaml b/manifests/manifest.jax.sample.yaml index ca10b32..e9a05de 100644 --- a/manifests/manifest.jax.sample.yaml +++ b/manifests/manifest.jax.sample.yaml @@ -11,6 +11,12 @@ spec: image: image command: "python3 -u main.py --operator=jax --sleep=300 --tb-write=True" + storage: + sharedVolume: + mountPoint: "/mnt/shared" + sizeGB: 16 + storageClass: longhorn-xfs + tensorboard: enabled: true @@ -25,5 +31,4 @@ spec: gpu: count: 0 type: gpu - product: nvidia-tesla-t4 - + product: nvidia-tesla-t4 \ No newline at end of file diff --git a/manifests/manifest.kuberay.sample.yaml b/manifests/manifest.kuberay.sample.yaml index cd694ad..2ca11fe 100644 --- a/manifests/manifest.kuberay.sample.yaml +++ b/manifests/manifest.kuberay.sample.yaml @@ -11,6 +11,12 @@ spec: image: image command: "python3 -u main.py --operator=kuberay --sleep=300 --tb-write=True" + storage: + sharedVolume: + mountPoint: "/mnt/shared" + sizeGB: 16 + storageClass: longhorn-xfs + debug: jupyter: false @@ -30,4 +36,3 @@ spec: cpus: 1 
ramRatio: 2 shmSizeGB: 0 - diff --git a/manifests/manifest.pytorch.sample.yaml b/manifests/manifest.pytorch.sample.yaml index 8ef17cb..f50f9a7 100644 --- a/manifests/manifest.pytorch.sample.yaml +++ b/manifests/manifest.pytorch.sample.yaml @@ -11,6 +11,12 @@ spec: image: image command: "python3 -u main.py --operator=pytorch --sleep=300 --tb-write=True" + storage: + sharedVolume: + mountPoint: "/mnt/shared" + sizeGB: 16 + storageClass: longhorn-xfs + tensorboard: enabled: true diff --git a/manifests/manifest.tf.sample.yaml b/manifests/manifest.tf.sample.yaml index db50f41..66dca9f 100644 --- a/manifests/manifest.tf.sample.yaml +++ b/manifests/manifest.tf.sample.yaml @@ -11,6 +11,12 @@ spec: image: image command: "python3 -u main.py --operator=tf --sleep=300 --tb-write=True" + storage: + sharedVolume: + mountPoint: "/mnt/shared" + sizeGB: 16 + storageClass: longhorn-xfs + tensorboard: enabled: true diff --git a/manifests/manifest.xgboost.sample.yaml b/manifests/manifest.xgboost.sample.yaml index 748372c..0c2d75e 100644 --- a/manifests/manifest.xgboost.sample.yaml +++ b/manifests/manifest.xgboost.sample.yaml @@ -11,6 +11,12 @@ spec: image: image command: "python3 -u main.py --operator=xgboost --sleep=300 --tb-write=True" + storage: + sharedVolume: + mountPoint: "/mnt/shared" + sizeGB: 16 + storageClass: longhorn-xfs + tensorboard: enabled: true From 48c0e67c013d8b03a8e2c30d572401c0e618f6a4 Mon Sep 17 00:00:00 2001 From: Joao Cardoso Date: Mon, 28 Oct 2024 14:20:16 +0100 Subject: [PATCH 2/4] fix(manifests): Updating manifests since now storage class is defined at the experiment level, instead of during cluster import --- manifest.yaml | 1 + manifests/manifest.jax.sample.yaml | 3 ++- manifests/manifest.kuberay.sample.yaml | 1 + manifests/manifest.pytorch.sample.yaml | 1 + manifests/manifest.tf.sample.yaml | 1 + manifests/manifest.xgboost.sample.yaml | 1 + 6 files changed, 7 insertions(+), 1 deletion(-) diff --git a/manifest.yaml b/manifest.yaml index 66dca9f..21ebc87 
100644 --- a/manifest.yaml +++ b/manifest.yaml @@ -11,6 +11,7 @@ spec: image: image command: "python3 -u main.py --operator=tf --sleep=300 --tb-write=True" + #Optional storage: sharedVolume: mountPoint: "/mnt/shared" diff --git a/manifests/manifest.jax.sample.yaml b/manifests/manifest.jax.sample.yaml index e9a05de..051aa65 100644 --- a/manifests/manifest.jax.sample.yaml +++ b/manifests/manifest.jax.sample.yaml @@ -11,7 +11,8 @@ spec: image: image command: "python3 -u main.py --operator=jax --sleep=300 --tb-write=True" - storage: + #Optional + storage: sharedVolume: mountPoint: "/mnt/shared" sizeGB: 16 diff --git a/manifests/manifest.kuberay.sample.yaml b/manifests/manifest.kuberay.sample.yaml index 2ca11fe..630d91f 100644 --- a/manifests/manifest.kuberay.sample.yaml +++ b/manifests/manifest.kuberay.sample.yaml @@ -11,6 +11,7 @@ spec: image: image command: "python3 -u main.py --operator=kuberay --sleep=300 --tb-write=True" + #Optional storage: sharedVolume: mountPoint: "/mnt/shared" diff --git a/manifests/manifest.pytorch.sample.yaml b/manifests/manifest.pytorch.sample.yaml index f50f9a7..5ee316e 100644 --- a/manifests/manifest.pytorch.sample.yaml +++ b/manifests/manifest.pytorch.sample.yaml @@ -11,6 +11,7 @@ spec: image: image command: "python3 -u main.py --operator=pytorch --sleep=300 --tb-write=True" + #Optional storage: sharedVolume: mountPoint: "/mnt/shared" diff --git a/manifests/manifest.tf.sample.yaml b/manifests/manifest.tf.sample.yaml index 66dca9f..21ebc87 100644 --- a/manifests/manifest.tf.sample.yaml +++ b/manifests/manifest.tf.sample.yaml @@ -11,6 +11,7 @@ spec: image: image command: "python3 -u main.py --operator=tf --sleep=300 --tb-write=True" + #Optional storage: sharedVolume: mountPoint: "/mnt/shared" diff --git a/manifests/manifest.xgboost.sample.yaml b/manifests/manifest.xgboost.sample.yaml index 0c2d75e..6167dc5 100644 --- a/manifests/manifest.xgboost.sample.yaml +++ b/manifests/manifest.xgboost.sample.yaml @@ -11,6 +11,7 @@ spec: image: image 
command: "python3 -u main.py --operator=xgboost --sleep=300 --tb-write=True" + #Optional storage: sharedVolume: mountPoint: "/mnt/shared" From d1a9c38a81eb3e9e521aee2c0742eb899d57649c Mon Sep 17 00:00:00 2001 From: Joao Cardoso Date: Tue, 29 Oct 2024 10:28:14 +0100 Subject: [PATCH 3/4] chore(manifests): Commenting out storage section --- manifest.yaml | 10 +++++----- manifests/manifest.jax.sample.yaml | 10 +++++----- manifests/manifest.kuberay.sample.yaml | 10 +++++----- manifests/manifest.pytorch.sample.yaml | 10 +++++----- manifests/manifest.tf.sample.yaml | 10 +++++----- manifests/manifest.xgboost.sample.yaml | 10 +++++----- 6 files changed, 30 insertions(+), 30 deletions(-) diff --git a/manifest.yaml b/manifest.yaml index 21ebc87..3152b40 100644 --- a/manifest.yaml +++ b/manifest.yaml @@ -12,11 +12,11 @@ spec: command: "python3 -u main.py --operator=tf --sleep=300 --tb-write=True" #Optional - storage: - sharedVolume: - mountPoint: "/mnt/shared" - sizeGB: 16 - storageClass: longhorn-xfs + # storage: + # sharedVolume: + # mountPoint: "/mnt/shared" + # sizeGB: 16 + # storageClass: longhorn-xfs tensorboard: enabled: true diff --git a/manifests/manifest.jax.sample.yaml b/manifests/manifest.jax.sample.yaml index 051aa65..0888f77 100644 --- a/manifests/manifest.jax.sample.yaml +++ b/manifests/manifest.jax.sample.yaml @@ -12,11 +12,11 @@ spec: command: "python3 -u main.py --operator=jax --sleep=300 --tb-write=True" #Optional - storage: - sharedVolume: - mountPoint: "/mnt/shared" - sizeGB: 16 - storageClass: longhorn-xfs + # storage: + # sharedVolume: + # mountPoint: "/mnt/shared" + # sizeGB: 16 + # storageClass: longhorn-xfs tensorboard: enabled: true diff --git a/manifests/manifest.kuberay.sample.yaml b/manifests/manifest.kuberay.sample.yaml index 630d91f..b2e6b86 100644 --- a/manifests/manifest.kuberay.sample.yaml +++ b/manifests/manifest.kuberay.sample.yaml @@ -12,11 +12,11 @@ spec: command: "python3 -u main.py --operator=kuberay --sleep=300 --tb-write=True" #Optional -
storage: - sharedVolume: - mountPoint: "/mnt/shared" - sizeGB: 16 - storageClass: longhorn-xfs + # storage: + # sharedVolume: + # mountPoint: "/mnt/shared" + # sizeGB: 16 + # storageClass: longhorn-xfs debug: jupyter: false diff --git a/manifests/manifest.pytorch.sample.yaml b/manifests/manifest.pytorch.sample.yaml index 5ee316e..688bbe6 100644 --- a/manifests/manifest.pytorch.sample.yaml +++ b/manifests/manifest.pytorch.sample.yaml @@ -12,11 +12,11 @@ spec: command: "python3 -u main.py --operator=pytorch --sleep=300 --tb-write=True" #Optional - storage: - sharedVolume: - mountPoint: "/mnt/shared" - sizeGB: 16 - storageClass: longhorn-xfs + # storage: + # sharedVolume: + # mountPoint: "/mnt/shared" + # sizeGB: 16 + # storageClass: longhorn-xfs tensorboard: enabled: true diff --git a/manifests/manifest.tf.sample.yaml b/manifests/manifest.tf.sample.yaml index 21ebc87..3152b40 100644 --- a/manifests/manifest.tf.sample.yaml +++ b/manifests/manifest.tf.sample.yaml @@ -12,11 +12,11 @@ spec: command: "python3 -u main.py --operator=tf --sleep=300 --tb-write=True" #Optional - storage: - sharedVolume: - mountPoint: "/mnt/shared" - sizeGB: 16 - storageClass: longhorn-xfs + # storage: + # sharedVolume: + # mountPoint: "/mnt/shared" + # sizeGB: 16 + # storageClass: longhorn-xfs tensorboard: enabled: true diff --git a/manifests/manifest.xgboost.sample.yaml b/manifests/manifest.xgboost.sample.yaml index 6167dc5..941c304 100644 --- a/manifests/manifest.xgboost.sample.yaml +++ b/manifests/manifest.xgboost.sample.yaml @@ -12,11 +12,11 @@ spec: command: "python3 -u main.py --operator=xgboost --sleep=300 --tb-write=True" #Optional - storage: - sharedVolume: - mountPoint: "/mnt/shared" - sizeGB: 16 - storageClass: longhorn-xfs + # storage: + # sharedVolume: + # mountPoint: "/mnt/shared" + # sizeGB: 16 + # storageClass: longhorn-xfs tensorboard: enabled: true From 82169fa8ba4a86433309fd7a40a6d9c6811d61c2 Mon Sep 17 00:00:00 2001 From: Joao Cardoso Date: Wed, 30 Oct 2024 15:27:23 +0100 
Subject: [PATCH 4/4] chore(manifests): Adding accessMode field to the optional storage section in manifests --- manifest.yaml | 1 + manifests/manifest.jax.sample.yaml | 1 + manifests/manifest.kuberay.sample.yaml | 1 + manifests/manifest.pytorch.sample.yaml | 3 ++- manifests/manifest.tf.sample.yaml | 1 + manifests/manifest.xgboost.sample.yaml | 1 + 6 files changed, 7 insertions(+), 1 deletion(-) diff --git a/manifest.yaml b/manifest.yaml index 3152b40..be0169a 100644 --- a/manifest.yaml +++ b/manifest.yaml @@ -17,6 +17,7 @@ spec: # mountPoint: "/mnt/shared" # sizeGB: 16 # storageClass: longhorn-xfs + # accessMode: string # ReadWriteOnce / ReadWriteOncePod / ReadOnlyMany / ReadWriteMany tensorboard: enabled: true diff --git a/manifests/manifest.jax.sample.yaml b/manifests/manifest.jax.sample.yaml index 0888f77..e4e0ba1 100644 --- a/manifests/manifest.jax.sample.yaml +++ b/manifests/manifest.jax.sample.yaml @@ -17,6 +17,7 @@ spec: # mountPoint: "/mnt/shared" # sizeGB: 16 # storageClass: longhorn-xfs + # accessMode: string # ReadWriteOnce / ReadWriteOncePod / ReadOnlyMany / ReadWriteMany tensorboard: enabled: true diff --git a/manifests/manifest.kuberay.sample.yaml b/manifests/manifest.kuberay.sample.yaml index b2e6b86..bce212c 100644 --- a/manifests/manifest.kuberay.sample.yaml +++ b/manifests/manifest.kuberay.sample.yaml @@ -17,6 +17,7 @@ spec: # mountPoint: "/mnt/shared" # sizeGB: 16 # storageClass: longhorn-xfs + # accessMode: string # ReadWriteOnce / ReadWriteOncePod / ReadOnlyMany / ReadWriteMany debug: jupyter: false diff --git a/manifests/manifest.pytorch.sample.yaml b/manifests/manifest.pytorch.sample.yaml index 688bbe6..8c712c7 100644 --- a/manifests/manifest.pytorch.sample.yaml +++ b/manifests/manifest.pytorch.sample.yaml @@ -17,7 +17,8 @@ spec: # mountPoint: "/mnt/shared" # sizeGB: 16 # storageClass: longhorn-xfs - + # accessMode: string # ReadWriteOnce / ReadWriteOncePod / ReadOnlyMany / ReadWriteMany + tensorboard: enabled: true diff --git
a/manifests/manifest.tf.sample.yaml b/manifests/manifest.tf.sample.yaml index 3152b40..be0169a 100644 --- a/manifests/manifest.tf.sample.yaml +++ b/manifests/manifest.tf.sample.yaml @@ -17,6 +17,7 @@ spec: # mountPoint: "/mnt/shared" # sizeGB: 16 # storageClass: longhorn-xfs + # accessMode: string # ReadWriteOnce / ReadWriteOncePod / ReadOnlyMany / ReadWriteMany tensorboard: enabled: true diff --git a/manifests/manifest.xgboost.sample.yaml b/manifests/manifest.xgboost.sample.yaml index 941c304..70e56a3 100644 --- a/manifests/manifest.xgboost.sample.yaml +++ b/manifests/manifest.xgboost.sample.yaml @@ -17,6 +17,7 @@ spec: # mountPoint: "/mnt/shared" # sizeGB: 16 # storageClass: longhorn-xfs + # accessMode: string # ReadWriteOnce / ReadWriteOncePod / ReadOnlyMany / ReadWriteMany tensorboard: enabled: true