diff --git a/content/docs/ref/runner.md b/content/docs/ref/runner.md index 7cbe17e2..8b441259 100644 --- a/content/docs/ref/runner.md +++ b/content/docs/ref/runner.md @@ -78,6 +78,15 @@ Any [generic option](/doc/ref) in addition to: need to write your code to save intermediate results to take advantage of this). +### Bitbucket + +- **GPU support**. + + See + [the guide on self-hosted Bitbucket runners](/doc/self-hosted-runners?tab=Bitbucket-GPU) + to work around + [Bitbucket's lack of native GPU support](https://jira.atlassian.com/browse/BCLOUD-21459). + ## Examples ### Using `--cloud-permission-set` diff --git a/content/docs/self-hosted-runners.md b/content/docs/self-hosted-runners.md index 3a6b5cfb..d8e6b966 100644 --- a/content/docs/self-hosted-runners.md +++ b/content/docs/self-hosted-runners.md @@ -117,6 +117,8 @@ train-and-report: + + ```yaml pipelines: @@ -134,7 +136,6 @@ pipelines: - step: runs-on: [self.hosted, cml.runner] image: iterativeai/cml:0-dvc2-base1 - # GPU not yet supported, see https://github.com/iterative/cml/issues/1015 script: - pip install -r requirements.txt - python train.py # generate plot.png @@ -144,6 +145,58 @@ pipelines: - cml comment create report.md ``` + + + +Bitbucket does not support GPUs natively +([cml#1015](https://github.com/iterative/cml/issues/1015), +[BCLOUD-21459](https://jira.atlassian.com/browse/BCLOUD-21459)). A work-around +is to directly use +[TPI](https://github.com/iterative/terraform-provider-iterative) (the library +which CML `runner` uses internally). 
TPI includes a CLI-friendly helper called +LEO (launch, execute, orchestrate), used below: + +```yaml +image: iterativeai/cml:0-dvc2-base1 +pipelines: + default: + - step: + name: Launch Runner and Train + script: + # Create training script + - | + cat <<EOF > leo-script.sh + #!/bin/bash + apt-get update -q && apt-get install -yq python3.9 + pip3 install -r requirements.txt + python train.py # generate plot.png + EOF + # Launch runner + - | + LEO_OPTIONS="--cloud=aws --region=us-west" + leo_id=$(leo create $LEO_OPTIONS \ + --image=nvidia \ + --machine=p2.xlarge \ + --disk-size=64 \ + --workdir=. \ + --output=. \ + --environment AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \ + --environment AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \ + --script="$(cat ./leo-script.sh)" + ) + # Wait for cloud training to finish + leo read $LEO_OPTIONS --follow "$leo_id" + sleep 45 # TODO: explain + # Download cloud training results & clean up cloud resources + leo delete $LEO_OPTIONS --workdir=. --output=. "$leo_id" + # Create CML report + - cat metrics.txt >> report.md + - echo '![](./plot.png "Confusion Matrix")' >> report.md + - cml comment create report.md +``` + + + diff --git a/src/components/pages/Home/UseCasesSection/index.tsx b/src/components/pages/Home/UseCasesSection/index.tsx index 76a369f4..999a900b 100644 --- a/src/components/pages/Home/UseCasesSection/index.tsx +++ b/src/components/pages/Home/UseCasesSection/index.tsx @@ -672,54 +672,62 @@ const UseCasesSection: React.ForwardRefRenderFunction = () => ( )} bitbucket={( - - -
# GPU support coming soon, see https://github.com/iterative/cml/issues/1015
-
+ +
# Use LEO instead of CML to force GPU support on Bitbucket
+
# (https://cml.dev/doc/ref/runner#bitbucket)
+
image: iterativeai/cml:0-dvc2-base1
pipelines:
-
default:
-
- step:
-
name: deploy-runner
-
image: iterativeai/cml:0-dvc2-base1
-
script:
-
- |
- -
cml runner \
-
--cloud=aws \
-
--cloud-region=us-west \
-
--cloud-type=m5.2xlarge \
-
--cloud-spot \
-
--labels=cml.runner
-
-
- step:
-
name: run
- -
runs-on: [self.hosted, cml.runner]
+
default:
+
- step:
+
name: Launch Runner and Train
+
script:
+
- |
+
cat <<EOF > leo-script.sh
+
#!/bin/bash
+
apt-get update -q && apt-get install -yq python3.9
+ +
dvc pull data
-
image: iterativeai/cml:0-dvc2-base1
-
script:
-
- apt-get update -y
-
- apt install imagemagick -y
-
- pip install -r requirements.txt
+
pip3 install -r requirements.txt
+
dvc repro
+
+
EOF
+ +
- |
+
LEO_OPTIONS="--cloud=aws --region=us-west"
+
leo_id=$(leo create $LEO_OPTIONS \
+
--image="nvidia" \
+
--machine="p2.xlarge" \
+
--disk-size=64 \
+
--workdir="." \
+
--output="." \
+
--environment AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \
+
--environment AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \
+
--script="$(cat ./leo-script.sh)"
+
)
+
leo read $LEO_OPTIONS --follow "$leo_id"
+
sleep 45 # TODO: explain
+
leo delete $LEO_OPTIONS --workdir="." --output="." \
+
"$leo_id"
-
- git fetch --prune
-
- dvc repro
-
- echo "# Style transfer" >> report.md
-
- git show origin/master:final_owl.png > master_owl.png
-
- convert +append final_owl.png master_owl.png out.png
-
- convert out.png -resize 75% out_shrink.png
-
- echo "### Workspace vs. Main" >> report.md
-
- cml publish out_shrink.png --md --title 'compare' >> report.md
-
- echo "## Training metrics" >> report.md
-
- dvc params diff master --show-md >> report.md
-
- echo >> report.md
-
- cml send-comment report.md
+
- git show origin/main:image.png > image-main.png
+
- |
+
cat <<EOF > report.md
+
# Style transfer
+
## Workspace vs. Main
+
![](./image.png "Workspace") ![](./image-main.png "Main")
+
## Training metrics
+
$(dvc params diff main --show-md)
+
## GPU info
+
$(cat gpu_info.txt)
+
EOF
+
- cml comment create report.md
- + Bitbucket Cloud report example