Skip to content

Commit 244fea2

Browse files
tasdomas0x2b3bfa0
andauthored
Slight refactoring of machine script generation (#642)
* Slight refactoring of machine script generation. Use embedded template instead of format string. Add test. Remove use of map pointer - golang maps are already pointers to a maps structure and can be nil. * Remove misleading comment. * Use goldie for storing expected script output. * Fix missing error check. * Remove residual escaped percent signs Co-authored-by: Helio Machado <0x2b3bfa0+git@googlemail.com>
1 parent 952286d commit 244fea2

14 files changed

+490
-164
lines changed

task/aws/resources/data_source_credentials.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ type Credentials struct {
2222
Dependencies struct {
2323
*Bucket
2424
}
25-
Resource *map[string]string
25+
Resource map[string]string
2626
}
2727

2828
func (c *Credentials) Read(ctx context.Context) error {
@@ -40,7 +40,7 @@ func (c *Credentials) Read(ctx context.Context) error {
4040
c.Dependencies.Bucket.Identifier,
4141
)
4242

43-
c.Resource = &map[string]string{
43+
c.Resource = map[string]string{
4444
"AWS_ACCESS_KEY_ID": credentials.AccessKeyID,
4545
"AWS_SECRET_ACCESS_KEY": credentials.SecretAccessKey,
4646
"AWS_SESSION_TOKEN": credentials.SessionToken,

task/aws/resources/resource_launch_template.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import (
44
"context"
55
"encoding/base64"
66
"errors"
7+
"fmt"
8+
"time"
79

810
"github.com/aws/smithy-go"
911

@@ -48,7 +50,11 @@ func (l *LaunchTemplate) Create(ctx context.Context) error {
4850
l.Attributes.Environment.Variables = make(map[string]*string)
4951
}
5052

51-
script := machine.Script(l.Attributes.Environment.Script, l.Dependencies.Credentials.Resource, l.Attributes.Environment.Variables, l.Attributes.Environment.Timeout)
53+
timeout := time.Now().Add(l.Attributes.Environment.Timeout)
54+
script, err := machine.Script(l.Attributes.Environment.Script, l.Dependencies.Credentials.Resource, l.Attributes.Environment.Variables, &timeout)
55+
if err != nil {
56+
return fmt.Errorf("failed to render machine script: %w", err)
57+
}
5258
userData := base64.StdEncoding.EncodeToString([]byte(script))
5359

5460
size := l.Attributes.Size.Machine
@@ -112,7 +118,7 @@ func (l *LaunchTemplate) Create(ctx context.Context) error {
112118
input.LaunchTemplateData.BlockDeviceMappings[0].Ebs.VolumeSize = aws.Int32(int32(size))
113119
}
114120

115-
if _, err := l.Client.Services.EC2.CreateLaunchTemplate(ctx, &input); err != nil {
121+
if _, err = l.Client.Services.EC2.CreateLaunchTemplate(ctx, &input); err != nil {
116122
var e smithy.APIError
117123
if errors.As(err, &e) && e.ErrorCode() == "InvalidLaunchTemplateName.AlreadyExistsException" {
118124
return l.Read(ctx)

task/aws/task.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,8 @@ func (t *Task) Delete(ctx context.Context) error {
221221
}
222222
}
223223
logrus.Info("[2/8] Emptying Bucket...")
224-
if err := machine.Delete(ctx, (*t.DataSources.Credentials.Resource)["RCLONE_REMOTE"]); err != nil && err != common.NotFoundError {
224+
225+
if err := machine.Delete(ctx, t.DataSources.Credentials.Resource["RCLONE_REMOTE"]); err != nil && err != common.NotFoundError {
225226
return err
226227
}
227228
}
@@ -258,23 +259,23 @@ func (t *Task) Logs(ctx context.Context) ([]string, error) {
258259
return nil, err
259260
}
260261

261-
return machine.Logs(ctx, (*t.DataSources.Credentials.Resource)["RCLONE_REMOTE"])
262+
return machine.Logs(ctx, t.DataSources.Credentials.Resource["RCLONE_REMOTE"])
262263
}
263264

264265
func (t *Task) Pull(ctx context.Context, destination, include string) error {
265266
if err := t.Read(ctx); err != nil {
266267
return err
267268
}
268269

269-
return machine.Transfer(ctx, (*t.DataSources.Credentials.Resource)["RCLONE_REMOTE"]+"/data", destination, include)
270+
return machine.Transfer(ctx, t.DataSources.Credentials.Resource["RCLONE_REMOTE"]+"/data", destination, include)
270271
}
271272

272273
func (t *Task) Push(ctx context.Context, source string) error {
273274
if err := t.Read(ctx); err != nil {
274275
return err
275276
}
276277

277-
return machine.Transfer(ctx, source, (*t.DataSources.Credentials.Resource)["RCLONE_REMOTE"]+"/data", "**")
278+
return machine.Transfer(ctx, source, t.DataSources.Credentials.Resource["RCLONE_REMOTE"]+"/data", "**")
278279
}
279280

280281
func (t *Task) Start(ctx context.Context) error {
@@ -302,7 +303,7 @@ func (t *Task) Status(ctx context.Context) (common.Status, error) {
302303
return nil, err
303304
}
304305

305-
return machine.Status(ctx, (*t.DataSources.Credentials.Resource)["RCLONE_REMOTE"], t.Attributes.Status)
306+
return machine.Status(ctx, t.DataSources.Credentials.Resource["RCLONE_REMOTE"], t.Attributes.Status)
306307
}
307308

308309
func (t *Task) GetKeyPair(ctx context.Context) (*ssh.DeterministicSSHKeyPair, error) {

task/az/resources/data_source_credentials.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ type Credentials struct {
2929
*StorageAccount
3030
*BlobContainer
3131
}
32-
Resource *map[string]string
32+
Resource map[string]string
3333
}
3434

3535
func (c *Credentials) Read(ctx context.Context) error {
@@ -51,7 +51,7 @@ func (c *Credentials) Read(ctx context.Context) error {
5151

5252
subscriptionID := c.Client.Settings.GetSubscriptionID()
5353

54-
c.Resource = &map[string]string{
54+
c.Resource = map[string]string{
5555
"AZURE_CLIENT_ID": credentials.ClientID,
5656
"AZURE_CLIENT_SECRET": credentials.ClientSecret,
5757
"AZURE_SUBSCRIPTION_ID": subscriptionID,

task/az/resources/resource_virtual_machine_scale_set.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,11 @@ func (v *VirtualMachineScaleSet) Create(ctx context.Context) error {
7878
v.Attributes.Environment.Variables = make(map[string]*string)
7979
}
8080

81-
script := machine.Script(v.Attributes.Environment.Script, v.Dependencies.Credentials.Resource, v.Attributes.Environment.Variables, v.Attributes.Environment.Timeout)
81+
timeout := time.Now().Add(v.Attributes.Environment.Timeout)
82+
script, err := machine.Script(v.Attributes.Environment.Script, v.Dependencies.Credentials.Resource, v.Attributes.Environment.Variables, &timeout)
83+
if err != nil {
84+
return fmt.Errorf("failed to render machine script: %w", err)
85+
}
8286

8387
image := v.Attributes.Environment.Image
8488
images := map[string]string{

task/az/task.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,8 @@ func (t *Task) Delete(ctx context.Context) error {
210210
}
211211
}
212212
logrus.Info("[2/9] Emptying Bucket...")
213-
if err := machine.Delete(ctx, (*t.DataSources.Credentials.Resource)["RCLONE_REMOTE"]); err != nil && err != common.NotFoundError {
213+
214+
if err := machine.Delete(ctx, t.DataSources.Credentials.Resource["RCLONE_REMOTE"]); err != nil && err != common.NotFoundError {
214215
return err
215216
}
216217
}
@@ -251,23 +252,23 @@ func (t *Task) Logs(ctx context.Context) ([]string, error) {
251252
return nil, err
252253
}
253254

254-
return machine.Logs(ctx, (*t.DataSources.Credentials.Resource)["RCLONE_REMOTE"])
255+
return machine.Logs(ctx, t.DataSources.Credentials.Resource["RCLONE_REMOTE"])
255256
}
256257

257258
func (t *Task) Pull(ctx context.Context, destination, include string) error {
258259
if err := t.Read(ctx); err != nil {
259260
return err
260261
}
261262

262-
return machine.Transfer(ctx, (*t.DataSources.Credentials.Resource)["RCLONE_REMOTE"]+"/data", destination, include)
263+
return machine.Transfer(ctx, t.DataSources.Credentials.Resource["RCLONE_REMOTE"]+"/data", destination, include)
263264
}
264265

265266
func (t *Task) Push(ctx context.Context, source string) error {
266267
if err := t.Read(ctx); err != nil {
267268
return err
268269
}
269270

270-
return machine.Transfer(ctx, source, (*t.DataSources.Credentials.Resource)["RCLONE_REMOTE"]+"/data", "**")
271+
return machine.Transfer(ctx, source, t.DataSources.Credentials.Resource["RCLONE_REMOTE"]+"/data", "**")
271272
}
272273

273274
func (t *Task) Start(ctx context.Context) error {
@@ -294,8 +295,7 @@ func (t *Task) Status(ctx context.Context) (common.Status, error) {
294295
if err := t.Read(ctx); err != nil {
295296
return nil, err
296297
}
297-
298-
return machine.Status(ctx, (*t.DataSources.Credentials.Resource)["RCLONE_REMOTE"], t.Attributes.Status)
298+
return machine.Status(ctx, t.DataSources.Credentials.Resource["RCLONE_REMOTE"], t.Attributes.Status)
299299
}
300300

301301
func (t *Task) GetKeyPair(ctx context.Context) (*ssh.DeterministicSSHKeyPair, error) {
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
#!/bin/bash
2+
sudo mkdir --parents /opt/task/directory
3+
chmod u=rwx,g=rwx,o=rwx /opt/task/directory
4+
5+
base64 --decode << END | sudo tee /usr/bin/tpi-task > /dev/null
6+
{{.TaskScript}}
7+
END
8+
chmod u=rwx,g=rx,a=rx /usr/bin/tpi-task
9+
10+
sudo tee /usr/bin/tpi-task-shutdown << 'END'
11+
#!/bin/bash
12+
sleep 20; while pgrep rclone > /dev/null; do sleep 1; done
13+
source /opt/task/credentials
14+
if ! test -z "$CI"; then
15+
cml rerun-workflow
16+
fi
17+
(systemctl is-system-running | grep stopping) || tpi stop --cloud="$TPI_TASK_CLOUD_PROVIDER" --region="$TPI_TASK_CLOUD_REGION" "$TPI_TASK_IDENTIFIER";
18+
END
19+
20+
chmod u=rwx,g=rx,o=rx /usr/bin/tpi-task-shutdown
21+
22+
base64 --decode << END | sudo tee /opt/task/variables > /dev/null
23+
{{.Environment}}
24+
END
25+
base64 --decode << END | sudo tee /opt/task/credentials > /dev/null
26+
{{.Credentials}}
27+
END
28+
chmod u=rw,g=,o= /opt/task/variables
29+
chmod u=rw,g=,o= /opt/task/credentials
30+
31+
while IFS= read -rd $'\0' variable; do
32+
export "$(perl -0777p -e 's/\\"/"/g;' -e 's/(.+?)="(.+)"/$1=$2/sg' <<< "$variable")"
33+
done < <(perl -0777pe 's/\n*(.+?=".*?((?<!\\)"|\\\\"))\n*/$1\x00/sg' /opt/task/variables)
34+
35+
TPI_MACHINE_IDENTITY="$(uuidgen)"
36+
TPI_LOG_DIRECTORY="$(mktemp --directory)"
37+
TPI_DATA_DIRECTORY="/opt/task/directory"
38+
39+
TPI_START_COMMAND="/bin/bash -lc 'exec /usr/bin/tpi-task'"
40+
TPI_REMAINING_RUN_TIME=$(({{.Timeout}}-$(date +%s)))
41+
if (( TPI_REMAINING_RUN_TIME < 1 )); then
42+
TPI_START_COMMAND="/bin/bash -c 'sleep infinity'"
43+
TPI_REMAINING_RUN_TIME=1
44+
fi
45+
46+
source /opt/task/credentials
47+
48+
sudo tee /etc/systemd/system/tpi-task.service > /dev/null <<END
49+
[Unit]
50+
After=default.target
51+
[Service]
52+
Type=simple
53+
ExecStart=-$TPI_START_COMMAND
54+
ExecStop=/bin/bash -c 'source /opt/task/credentials; systemctl is-system-running | grep stopping || echo "{\\\\"result\\\\": \\\\"\$SERVICE_RESULT\\\\", \\\\"code\\\\": \\\\"\$EXIT_STATUS\\\\", \\\\"status\\\\": \\\\"\$EXIT_CODE\\\\"}" > "$TPI_LOG_DIRECTORY/status-$TPI_MACHINE_IDENTITY" && RCLONE_CONFIG= rclone copy "$TPI_LOG_DIRECTORY" "\$RCLONE_REMOTE/reports"'
55+
ExecStopPost=/usr/bin/tpi-task-shutdown
56+
Environment=HOME=/root
57+
EnvironmentFile=/opt/task/variables
58+
WorkingDirectory=/opt/task/directory
59+
RuntimeMaxSec=$TPI_REMAINING_RUN_TIME
60+
[Install]
61+
WantedBy=default.target
62+
END
63+
64+
curl --location --remote-name https://github.com/iterative/terraform-provider-iterative/releases/latest/download/terraform-provider-iterative_linux_amd64
65+
sudo mv terraform-provider-iterative* /usr/bin/tpi
66+
sudo chmod u=rwx,g=rx,o=rx /usr/bin/tpi
67+
sudo chown root:root /usr/bin/tpi
68+
69+
curl --location --remote-name https://github.com/iterative/cml/releases/latest/download/cml-linux
70+
chmod u=rwx,g=rx,o=rx cml-linux
71+
sudo mv cml-linux /usr/bin/cml
72+
73+
extract_here(){
74+
if command -v unzip 2>&1 > /dev/null; then
75+
unzip "$1"
76+
elif command -v python3 2>&1 > /dev/null; then
77+
python3 -m zipfile -e "$1" .
78+
else
79+
python -m zipfile -e "$1" .
80+
fi
81+
}
82+
83+
if ! command -v rclone 2>&1 > /dev/null; then
84+
curl --remote-name https://downloads.rclone.org/rclone-current-linux-amd64.zip
85+
extract_here rclone-current-linux-amd64.zip
86+
sudo cp rclone-*-linux-amd64/rclone /usr/bin
87+
sudo chmod u=rwx,g=rx,o=rx /usr/bin/rclone
88+
sudo chown root:root /usr/bin/rclone
89+
rm --recursive rclone-*-linux-amd64*
90+
fi
91+
92+
rclone copy "$RCLONE_REMOTE/data" /opt/task/directory
93+
94+
yes | /etc/profile.d/install-driver-prompt.sh # for GCP GPU machines
95+
96+
# FIX NVIDIA APT GPG KEYS (https://github.com/NVIDIA/cuda-repo-management/issues/1#issuecomment-1111490201) 🤬
97+
if test -f /etc/apt/sources.list.d/cuda.list; then
98+
for list in cuda nvidia-ml; do mv /etc/apt/sources.list.d/$list.list{,.backup}; done
99+
apt-get update
100+
apt-get install --yes gpg
101+
apt-key del 7fa2af80
102+
apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub
103+
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/7fa2af80.pub
104+
for list in cuda nvidia-ml; do mv /etc/apt/sources.list.d/$list.list{.backup,}; done
105+
fi
106+
107+
sudo systemctl daemon-reload
108+
sudo systemctl enable tpi-task.service --now
109+
sudo systemctl disable --now apt-daily.timer
110+
111+
while sleep 5; do
112+
test -n "$TPI_MACHINE_LOGS" && journalctl > "$TPI_LOG_DIRECTORY/machine-$TPI_MACHINE_IDENTITY"
113+
journalctl --all --no-hostname --output=short-iso --quiet --unit=tpi-task --utc | sed 's/^\([0-9-]*\)T\([0-9:]*\)+0000 \S*: \(.*\)/\1 \2 \3/g' > "$TPI_LOG_DIRECTORY/task-$TPI_MACHINE_IDENTITY"
114+
NEW_TPI_LOG_DIRECTORY_HASH="$(md5sum "$TPI_LOG_DIRECTORY"/*)"
115+
if test "$NEW_TPI_LOG_DIRECTORY_HASH" != "$TPI_LOG_DIRECTORY_HASH"; then
116+
TPI_LOG_DIRECTORY_HASH="$NEW_TPI_LOG_DIRECTORY_HASH"
117+
rclone sync "$TPI_LOG_DIRECTORY" "$RCLONE_REMOTE/reports"
118+
fi
119+
done &
120+
121+
while sleep 10; do
122+
NEW_TPI_DATA_DIRECTORY_EPOCH="$(find "$TPI_DATA_DIRECTORY" -printf "%T@\n" | sort | tail -1)"
123+
if test "$NEW_TPI_DATA_DIRECTORY_EPOCH" != "$TPI_DATA_DIRECTORY_EPOCH"; then
124+
TPI_DATA_DIRECTORY_EPOCH="$NEW_TPI_DATA_DIRECTORY_EPOCH"
125+
rclone sync "$TPI_DATA_DIRECTORY" "$RCLONE_REMOTE/data"
126+
fi
127+
done &

0 commit comments

Comments
 (0)