Skip to content

Commit f252af3

Browse files
committed
GPU support without restart
1 parent e14146d commit f252af3

File tree

3 files changed

+29
-17
lines changed

3 files changed

+29
-17
lines changed

cml/ami.json

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -33,12 +33,6 @@
3333
"type": "shell",
3434
"environment_vars": ["DEBIAN_FRONTEND=noninteractive"],
3535
"script": "./setup.sh"
36-
},
37-
{
38-
"type": "shell",
39-
"inline": ["sudo shutdown -r now", "sleep 60"],
40-
"start_retry_timeout": "10m",
41-
"expect_disconnect": true
4236
}
4337
]
4438
}

cml/setup.sh

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ echo "APT::Get::Assume-Yes \"true\";" | sudo tee -a /etc/apt/apt.conf.d/90assume
66
sudo apt remove unattended-upgrades
77
systemctl disable apt-daily-upgrade.service
88

9+
sudo add-apt-repository universe -y
910
sudo add-apt-repository ppa:git-core/ppa -y
1011
sudo apt update && sudo apt-get install -y git
1112
sudo curl -fsSL https://get.docker.com -o get-docker.sh && sudo sh get-docker.sh && \
@@ -21,6 +22,12 @@ sudo apt update && sudo apt-get install -y nodejs
2122

2223
sudo apt install -y ubuntu-drivers-common git
2324
sudo ubuntu-drivers autoinstall
24-
curl -s -L https://nvidia.GitHub.io/nvidia-docker/gpgkey | sudo apt-key add - && \
25-
curl -s -L https://nvidia.GitHub.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
26-
sudo apt update && sudo apt install -y nvidia-container-toolkit
25+
26+
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
27+
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
28+
sudo apt update && sudo apt install -y nvidia-docker2
29+
30+
sudo systemctl restart docker
31+
32+
sudo nvidia-smi
33+
sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi

iterative/resource_runner.go

Lines changed: 19 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -228,39 +228,50 @@ func provisionerCode(d *schema.ResourceData) (string, error) {
228228
data["idle_timeout"] = strconv.Itoa(d.Get("idle_timeout").(int))
229229
data["name"] = d.Get("name").(string)
230230
data["tf_resource"] = base64.StdEncoding.EncodeToString(jsonResource)
231+
data["instance_gpu"] = d.Get("instance_gpu").(string)
231232
data["AWS_SECRET_ACCESS_KEY"] = os.Getenv("AWS_SECRET_ACCESS_KEY")
232233
data["AWS_ACCESS_KEY_ID"] = os.Getenv("AWS_ACCESS_KEY_ID")
233234
data["AZURE_CLIENT_ID"] = os.Getenv("AZURE_CLIENT_ID")
234235
data["AZURE_CLIENT_SECRET"] = os.Getenv("AZURE_CLIENT_SECRET")
235236
data["AZURE_SUBSCRIPTION_ID"] = os.Getenv("AZURE_SUBSCRIPTION_ID")
236237
data["AZURE_TENANT_ID"] = os.Getenv("AZURE_TENANT_ID")
237238

238-
tmpl, err := template.New("deploy").Parse(`#!/bin/bash
239+
tmpl, err := template.New("deploy").Parse(`#!/bin/sh
239240
export DEBIAN_FRONTEND=noninteractive
240241
241242
{{if eq .cloud "azure"}}
242243
echo "APT::Get::Assume-Yes \"true\";" | sudo tee -a /etc/apt/apt.conf.d/90assumeyes
243244
244245
sudo apt remove unattended-upgrades
245-
systemctl disable apt-daily-upgrade.service
246+
systemctl disable apt-daily-upgrade.service
246247
248+
sudo add-apt-repository universe -y
247249
sudo add-apt-repository ppa:git-core/ppa -y
248250
sudo apt update && sudo apt-get install -y git
249251
sudo curl -fsSL https://get.docker.com -o get-docker.sh && sudo sh get-docker.sh
250252
sudo usermod -aG docker ubuntu
251253
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock
254+
252255
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
253256
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
254257
sudo apt update && sudo apt-get install -y terraform
258+
255259
curl -sL https://deb.nodesource.com/setup_12.x | sudo bash
256260
sudo apt update && sudo apt-get install -y nodejs
261+
257262
sudo apt install -y ubuntu-drivers-common git
258-
sudo ubuntu-drivers autoinstall
259-
sudo rmmod nvidia && sudo nvidia-smi
260-
curl -s -L https://nvidia.GitHub.io/nvidia-docker/gpgkey | sudo apt-key add -
261-
curl -s -L https://nvidia.GitHub.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
262-
sudo apt update && sudo apt install -y nvidia-container-toolkit
263+
sudo ubuntu-drivers autoinstall
264+
265+
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
266+
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
267+
sudo apt update && sudo apt install -y nvidia-docker2
268+
269+
sudo systemctl restart docker
270+
271+
sudo nvidia-smi
272+
sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi
263273
{{end}}
274+
264275
sudo npm install -g git+https://github.com/iterative/cml.git#cml-runner
265276
export HOME=/root
266277
export AWS_SECRET_ACCESS_KEY={{.AWS_SECRET_ACCESS_KEY}}
@@ -269,7 +280,7 @@ export AZURE_CLIENT_ID={{.AZURE_CLIENT_ID}}
269280
export AZURE_CLIENT_SECRET={{.AZURE_CLIENT_SECRET}}
270281
export AZURE_SUBSCRIPTION_ID={{.AZURE_SUBSCRIPTION_ID}}
271282
export AZURE_TENANT_ID={{.AZURE_TENANT_ID}}
272-
nohup cml-runner{{if .name}} --name {{.name}}{{end}}{{if .labels}} --labels {{.labels}}{{end}}{{if .idle_timeout}} --idle-timeout {{.idle_timeout}}{{end}}{{if .driver}} --driver {{.driver}}{{end}}{{if .repo}} --repo {{.repo}}{{end}}{{if .token}} --token {{.token}}{{end}}{{if .tf_resource}} --tf_resource={{.tf_resource}}{{end}} < /dev/null > std.out 2> std.err &
283+
nohup cml-runner{{if .name}} --name {{.name}}{{end}}{{if .labels}} --labels {{.labels}}{{end}}{{if .idle_timeout}} --idle-timeout {{.idle_timeout}}{{end}}{{if .driver}} --driver {{.driver}}{{end}}{{if .repo}} --repo {{.repo}}{{end}}{{if .token}} --token {{.token}}{{end}}{{if .tf_resource}} --tf_resource={{.tf_resource}}{{end}} {{if .instance_gpu}} --cloud-gpu {{.instance_gpu}}{{end}} < /dev/null > std.out 2> std.err &
273284
sleep 10
274285
`)
275286
var customDataBuffer bytes.Buffer

0 commit comments

Comments
 (0)