From dbfee412723ca76a4e8ab3c274a5d8a09b78e33c Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Tue, 29 Nov 2022 19:44:15 +0000 Subject: [PATCH 1/3] home: bitbucket GPU demo --- .../pages/Home/UseCasesSection/index.tsx | 86 ++++++++++--------- 1 file changed, 46 insertions(+), 40 deletions(-) diff --git a/src/components/pages/Home/UseCasesSection/index.tsx b/src/components/pages/Home/UseCasesSection/index.tsx index 76a369f4..9f96448f 100644 --- a/src/components/pages/Home/UseCasesSection/index.tsx +++ b/src/components/pages/Home/UseCasesSection/index.tsx @@ -672,54 +672,60 @@ const UseCasesSection: React.ForwardRefRenderFunction = () => ( )} bitbucket={( - - -
# GPU support coming soon, see https://github.com/iterative/cml/issues/1015
-
+ +
image: iterativeai/cml:0-dvc2-base1
pipelines:
-
default:
-
- step:
-
name: deploy-runner
-
image: iterativeai/cml:0-dvc2-base1
-
script:
-
- |
- -
cml runner \
-
--cloud=aws \
-
--cloud-region=us-west \
-
--cloud-type=m5.2xlarge \
-
--cloud-spot \
-
--labels=cml.runner
-
-
- step:
-
name: run
- -
runs-on: [self.hosted, cml.runner]
+
default:
+
- step:
+
name: Launch Runner and Train
+
script:
+
- |
+
cat <<EOF > leo-script.sh
+
#!/bin/bash
+
apt-get update -q && apt-get install -yq python3.9
+ +
dvc pull data
-
image: iterativeai/cml:0-dvc2-base1
-
script:
-
- apt-get update -y
-
- apt install imagemagick -y
-
- pip install -r requirements.txt
+
pip3 install -r requirements.txt
+
dvc repro
+
+
EOF
+ +
- |
+
LEO_OPTIONS="--cloud=aws --region=us-west"
+
leo_id=$(leo create $LEO_OPTIONS \
+
--image="nvidia" \
+
--machine="p2.xlarge" \
+
--disk-size=64 \
+
--workdir="." \
+
--output="." \
+
--environment AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \
+
--environment AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \
+
--script="$(cat ./leo-script.sh)"
+
)
+
leo read $LEO_OPTIONS --follow "$leo_id"
+
sleep 45 # TODO: replace this hack with a proper wait loop
+
leo delete $LEO_OPTIONS --workdir="." --output="." \
+
"$leo_id"
-
- git fetch --prune
-
- dvc repro
-
- echo "# Style transfer" >> report.md
-
- git show origin/master:final_owl.png > master_owl.png
-
- convert +append final_owl.png master_owl.png out.png
-
- convert out.png -resize 75% out_shrink.png
-
- echo "### Workspace vs. Main" >> report.md
-
- cml publish out_shrink.png --md --title 'compare' >> report.md
-
- echo "## Training metrics" >> report.md
-
- dvc params diff master --show-md >> report.md
-
- echo >> report.md
-
- cml send-comment report.md
+
- git show origin/main:image.png > image-main.png
+
- |
+
cat <<EOF > report.md
+
# Style transfer
+
## Workspace vs. Main
+
![](./image.png "Workspace") ![](./image-main.png "Main")
+
## Training metrics
+
$(dvc params diff main --show-md)
+
## GPU info
+
$(cat gpu_info.txt)
+
EOF
+
- cml comment create report.md
- + Bitbucket Cloud report example From 620eceb9ad0f1d55fd7cc25f550576c28bab0255 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Tue, 29 Nov 2022 20:27:06 +0000 Subject: [PATCH 2/3] bitbucket: add GPU examples - fixes https://github.com/iterative/cml/issues/1015 - fixes https://github.com/iterative/cml.dev/issues/349 --- content/docs/ref/runner.md | 9 ++++ content/docs/self-hosted-runners.md | 53 ++++++++++++++++++- .../pages/Home/UseCasesSection/index.tsx | 4 +- 3 files changed, 63 insertions(+), 3 deletions(-) diff --git a/content/docs/ref/runner.md b/content/docs/ref/runner.md index 7cbe17e2..6f099536 100644 --- a/content/docs/ref/runner.md +++ b/content/docs/ref/runner.md @@ -78,6 +78,15 @@ Any [generic option](/doc/ref) in addition to: need to write your code to save intermediate results to take advantage of this). +### Bitbucket + +- **GPU support**. + + See + [the guide on self-hosted Bitbucket runners](/doc/self-hosted-runners?tab=Bitbucket) + to work around + [Bitbucket's lack of native GPU support](https://jira.atlassian.com/browse/BCLOUD-21459). + ## Examples ### Using `--cloud-permission-set` diff --git a/content/docs/self-hosted-runners.md b/content/docs/self-hosted-runners.md index 3a6b5cfb..9fa5b876 100644 --- a/content/docs/self-hosted-runners.md +++ b/content/docs/self-hosted-runners.md @@ -116,7 +116,7 @@ train-and-report: ``` - + ```yaml pipelines: @@ -134,7 +134,6 @@ pipelines: - step: runs-on: [self.hosted, cml.runner] image: iterativeai/cml:0-dvc2-base1 - # GPU not yet supported, see https://github.com/iterative/cml/issues/1015 script: - pip install -r requirements.txt - python train.py # generate plot.png @@ -144,6 +143,56 @@ pipelines: - cml comment create report.md ``` + + + +Bitbucket does not support GPUs natively +([cml#1015](https://github.com/iterative/cml/issues/1015), +[BCLOUD-21459](https://jira.atlassian.com/browse/BCLOUD-21459)). 
A work-around +is to directly use +[TPI](https://github.com/iterative/terraform-provider-iterative) (the library +which CML `runner` uses internally). TPI includes a CLI-friendly helper called +LEO (launch, execute, orchestrate), used below: + +```yaml +image: iterativeai/cml:0-dvc2-base1 +pipelines: + default: + - step: + name: Launch Runner and Train + script: + # Create training script + - | + cat <<EOF > leo-script.sh + #!/bin/bash + apt-get update -q && apt-get install -yq python3.9 + pip3 install -r requirements.txt + python train.py # generate plot.png + EOF + # Launch runner + - | + LEO_OPTIONS="--cloud=aws --region=us-west" + leo_id=$(leo create $LEO_OPTIONS \ + --image=nvidia \ + --machine=p2.xlarge \ + --disk-size=64 \ + --workdir=. \ + --output=. \ + --environment AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \ + --environment AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \ + --script="$(cat ./leo-script.sh)" + ) + # Wait for cloud training to finish + leo read $LEO_OPTIONS --follow "$leo_id" + sleep 45 # TODO: explain + # Download cloud training results & clean up cloud resources + leo delete $LEO_OPTIONS --workdir=. --output=. "$leo_id" + # Create CML report + - cat metrics.txt >> report.md + - echo '![](./plot.png "Confusion Matrix")' >> report.md + - cml comment create report.md +``` + diff --git a/src/components/pages/Home/UseCasesSection/index.tsx b/src/components/pages/Home/UseCasesSection/index.tsx index 9f96448f..999a900b 100644 --- a/src/components/pages/Home/UseCasesSection/index.tsx +++ b/src/components/pages/Home/UseCasesSection/index.tsx @@ -673,6 +673,8 @@ const UseCasesSection: React.ForwardRefRenderFunction = () => ( bitbucket={( +
# Use LEO instead of CML to force GPU support on Bitbucket
+
image: iterativeai/cml:0-dvc2-base1
pipelines:
default:
@@ -705,7 +707,7 @@ const UseCasesSection: React.ForwardRefRenderFunction = () => (
--script="$(cat ./leo-script.sh)"
)
leo read $LEO_OPTIONS --follow "$leo_id"
-
sleep 45 # TODO: replace this hack with a proper wait loop
+
sleep 45 # TODO: explain
leo delete $LEO_OPTIONS --workdir="." --output="." \
"$leo_id"
From f4b28cfbcc0e81401bbdb30cff2933b2056457bf Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Tue, 29 Nov 2022 20:35:02 +0000 Subject: [PATCH 3/3] self-hosted-runners: better bitbucket GPU tabbing --- content/docs/ref/runner.md | 2 +- content/docs/self-hosted-runners.md | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/content/docs/ref/runner.md b/content/docs/ref/runner.md index 6f099536..8b441259 100644 --- a/content/docs/ref/runner.md +++ b/content/docs/ref/runner.md @@ -83,7 +83,7 @@ Any [generic option](/doc/ref) in addition to: - **GPU support**. See - [the guide on self-hosted Bitbucket runners](/doc/self-hosted-runners?tab=Bitbucket) + [the guide on self-hosted Bitbucket runners](/doc/self-hosted-runners?tab=Bitbucket-GPU) to work around [Bitbucket's lack of native GPU support](https://jira.atlassian.com/browse/BCLOUD-21459). diff --git a/content/docs/self-hosted-runners.md b/content/docs/self-hosted-runners.md index 9fa5b876..d8e6b966 100644 --- a/content/docs/self-hosted-runners.md +++ b/content/docs/self-hosted-runners.md @@ -116,7 +116,9 @@ train-and-report: ```
- + + + ```yaml pipelines: @@ -144,7 +146,7 @@ pipelines: ``` - + Bitbucket does not support GPUs natively ([cml#1015](https://github.com/iterative/cml/issues/1015), @@ -193,6 +195,8 @@ pipelines: - cml comment create report.md ``` + +