Skip to content

Commit 5e4b037

Browse files
authored
Reboot instance if gpu is not accessible (#383)
* Reboot instance if gpu is not accessible * tests
1 parent 9e05809 commit 5e4b037

7 files changed

+26
-7
lines changed

cml/setup.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ if [ ! -f "$FILE" ]; then
1111
sudo add-apt-repository ppa:git-core/ppa -y
1212
sudo apt update && sudo apt-get install -y software-properties-common build-essential git
1313

14-
sudo curl -fsSL https://get.docker.com -o get-docker.sh && sudo sh get-docker.sh &&
15-
sudo usermod -aG docker ubuntu
14+
sudo curl -fsSL https://get.docker.com -o get-docker.sh && sudo sh get-docker.sh
15+
sudo usermod -aG docker ubuntu
1616
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock
1717

1818
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -

iterative/resource_runner.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,8 @@ func renderScript(data map[string]interface{}) (string, error) {
299299

300300
tmpl, err := template.New("deploy").Funcs(template.FuncMap{"escape": shellescape.Quote}).Parse(
301301
`#!/bin/sh
302+
sudo systemctl is-enabled cml.service && return 0
303+
302304
{{- if not .container}}
303305
{{- if .setup}}{{.setup}}{{- end}}
304306
sudo npm config set user 0 && sudo npm install --global @dvcorg/cml
@@ -362,7 +364,11 @@ EOF'
362364
363365
{{- if .cloud}}
364366
sudo systemctl daemon-reload
365-
sudo systemctl enable cml.service --now
367+
sudo systemctl enable cml.service
368+
{{- if .instance_gpu}}
369+
nvidia-smi &>/dev/null || reboot
370+
{{- end}}
371+
sudo systemctl start cml.service
366372
{{- end}}
367373
368374
{{- end}}

iterative/testdata/script_template_cloud_aws.golden

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/bin/sh
2+
sudo systemctl is-enabled cml.service && return 0
23
FILE=/var/log/cml_stack.log
34
if [ ! -f "$FILE" ]; then
45
DEBIAN_FRONTEND=noninteractive
@@ -65,4 +66,6 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
6566
WantedBy=default.target
6667
EOF'
6768
sudo systemctl daemon-reload
68-
sudo systemctl enable cml.service --now
69+
sudo systemctl enable cml.service
70+
nvidia-smi &>/dev/null || reboot
71+
sudo systemctl start cml.service

iterative/testdata/script_template_cloud_azure.golden

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/bin/sh
2+
sudo systemctl is-enabled cml.service && return 0
23
FILE=/var/log/cml_stack.log
34
if [ ! -f "$FILE" ]; then
45
DEBIAN_FRONTEND=noninteractive
@@ -66,4 +67,6 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
6667
WantedBy=default.target
6768
EOF'
6869
sudo systemctl daemon-reload
69-
sudo systemctl enable cml.service --now
70+
sudo systemctl enable cml.service
71+
nvidia-smi &>/dev/null || reboot
72+
sudo systemctl start cml.service

iterative/testdata/script_template_cloud_gcp.golden

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/bin/sh
2+
sudo systemctl is-enabled cml.service && return 0
23
FILE=/var/log/cml_stack.log
34
if [ ! -f "$FILE" ]; then
45
DEBIAN_FRONTEND=noninteractive
@@ -63,4 +64,6 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
6364
WantedBy=default.target
6465
EOF'
6566
sudo systemctl daemon-reload
66-
sudo systemctl enable cml.service --now
67+
sudo systemctl enable cml.service
68+
nvidia-smi &>/dev/null || reboot
69+
sudo systemctl start cml.service

iterative/testdata/script_template_cloud_invalid.golden

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/bin/sh
2+
sudo systemctl is-enabled cml.service && return 0
23
FILE=/var/log/cml_stack.log
34
if [ ! -f "$FILE" ]; then
45
DEBIAN_FRONTEND=noninteractive
@@ -62,4 +63,6 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
6263
WantedBy=default.target
6364
EOF'
6465
sudo systemctl daemon-reload
65-
sudo systemctl enable cml.service --now
66+
sudo systemctl enable cml.service
67+
nvidia-smi &>/dev/null || reboot
68+
sudo systemctl start cml.service

iterative/testdata/script_template_cloud_kubernetes.golden

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/bin/sh
2+
sudo systemctl is-enabled cml.service && return 0
23
export KUBERNETES_CONFIGURATION='8 value with "quotes" and spaces'
34

45
HOME="$(mktemp -d)" exec cml-runner \

0 commit comments

Comments
 (0)