Skip to content

Commit 1b37830

Browse files
DavidGOrtega, 0x2b3bfa0, dacbd
authored
Studio bash logger (#735)
* Studio bash logger
* statuses
* golden
* tpi features as env vars

---------

Co-authored-by: Helio Machado <0x2b3bfa0+git@googlemail.com>
Co-authored-by: Daniel Barnes <dabarnes2b@gmail.com>
1 parent ce4f3be commit 1b37830

File tree

4 files changed

+125
-9
lines changed

4 files changed

+125
-9
lines changed

iterative/resource_task.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,11 @@ func resourceTaskBuild(ctx context.Context, d *schema.ResourceData, m interface{
348348
v["CML_*"] = nil
349349
v["REPO_TOKEN"] = nil
350350

351+
region := d.Get("region").(string)
352+
machine := d.Get("machine").(string)
353+
v["TPI_REGION"] = &region
354+
v["TPI_MACHINE"] = &machine
355+
351356
c := common.Cloud{
352357
Provider: common.Provider(d.Get("cloud").(string)),
353358
Region: common.Region(d.Get("region").(string)),

task/common/machine/machine-script.sh.tpl

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,51 @@ sleep 20; while pgrep rclone > /dev/null; do sleep 1; done
1313
source /opt/task/credentials
1414
(systemctl is-system-running | grep stopping) || leo stop --cloud="$TPI_TASK_CLOUD_PROVIDER" --region="$TPI_TASK_CLOUD_REGION" "$TPI_TASK_IDENTIFIER";
1515
END
16-
1716
chmod u=rwx,g=rx,o=rx /usr/bin/tpi-task-shutdown
1817

18+
# Install the Studio status reporter. The quoted 'END' delimiter keeps the
# script below literal, so none of its variables are expanded at install time.
sudo tee /usr/bin/tpi-task-studio-log > /dev/null << 'END'
#!/bin/bash
# Post the task status to the Studio live endpoint.
#
# Usage: tpi-task-studio-log [STATUS]
#   STATUS  status to report. When omitted it is derived at run time:
#           "queued" if systemctl reports the system as stopping, otherwise
#           "timeout", "succeeded" or "failed" from SERVICE_RESULT and
#           EXIT_STATUS.
#
# Environment read: STUDIO_TOKEN (nothing is sent when it is empty),
# STUDIO_URL, STUDIO_STEP, STUDIO_REPO_URL, STUDIO_BASELINE_SHA,
# TPI_TASK_IDENTIFIER, TPI_TASK_CLOUD_PROVIDER, TPI_MACHINE, TPI_REGION,
# TPI_DISK_SIZE, TPI_TASK_DATE_START, SERVICE_RESULT, EXIT_STATUS.
URL="${STUDIO_URL:-https://studio.iterative.ai/api/live}"
STEP="${STUDIO_STEP:-$(date +%s)}"
STATUS=$1
DATE_START="${TPI_TASK_DATE_START:-0}"
DATE_END=0

if [ -n "$STUDIO_TOKEN" ]; then
  if [ -z "$STATUS" ]; then
    # -q: only the exit status matters; do not echo the match into the logs.
    if systemctl is-system-running | grep -q stopping; then
      STATUS=queued
    # Quoted comparisons stay well-formed when the variables are unset.
    elif [[ "$SERVICE_RESULT" == timeout ]]; then
      STATUS=timeout
    elif [[ "$EXIT_STATUS" == 0 ]]; then
      STATUS=succeeded
    else
      STATUS=failed
    fi
  fi

  # Only the terminal statuses carry an end timestamp; otherwise it stays 0.
  if [[ "$STATUS" =~ ^(timeout|succeeded|failed)$ ]]; then
    DATE_END=$(date +%s)
  fi

  # NOTE: values are interpolated into the JSON verbatim (no escaping), so
  # they must not contain double quotes or backslashes; STEP, DATE_START and
  # DATE_END are emitted unquoted and must be numeric.
  STUDIO_PARAMS="{\"task\": {\"id\": \"${TPI_TASK_IDENTIFIER}\", \"status\": \"${STATUS}\", \"cloud\": \"${TPI_TASK_CLOUD_PROVIDER}\", \"machine\": \"${TPI_MACHINE}\", \"region\": \"${TPI_REGION}\", \"diskSize\": \"${TPI_DISK_SIZE}\", \"dateStart\": ${DATE_START}, \"dateEnd\": ${DATE_END}}}"
  STUDIO_PAYLOAD="{\"type\": \"data\", \"client\": \"dvclive\", \"repo_url\": \"${STUDIO_REPO_URL}\", \"baseline_sha\": \"${STUDIO_BASELINE_SHA}\", \"name\": \"TPI_TASK:${TPI_TASK_IDENTIFIER}\", \"step\":${STEP}, \"params\": ${STUDIO_PARAMS}}"
  curl -X POST "$URL" \
    -H "Content-Type: application/json" \
    -H "Authorization: token ${STUDIO_TOKEN}" \
    -d "${STUDIO_PAYLOAD}"
fi
END
chmod u=rwx,g=rx,o=rx /usr/bin/tpi-task-studio-log
52+
1953
base64 --decode << END | sudo tee /opt/task/variables > /dev/null
2054
{{.Environment}}
2155
END
56+
chmod u=rw,g=,o= /opt/task/variables
57+
2258
base64 --decode << END | sudo tee /opt/task/credentials > /dev/null
2359
{{.Credentials}}
2460
END
25-
chmod u=rw,g=,o= /opt/task/variables
2661
chmod u=rw,g=,o= /opt/task/credentials
2762

2863
while IFS= read -rd $'\0' variable; do
@@ -48,7 +83,7 @@ sudo tee /etc/systemd/system/tpi-task.service > /dev/null <<END
4883
[Service]
4984
Type=simple
5085
ExecStart=-$TPI_START_COMMAND
51-
ExecStop=/bin/bash -c 'source /opt/task/credentials; systemctl is-system-running | grep stopping || echo "{\\\\"result\\\\": \\\\"\$SERVICE_RESULT\\\\", \\\\"code\\\\": \\\\"\$EXIT_STATUS\\\\", \\\\"status\\\\": \\\\"\$EXIT_CODE\\\\"}" > "$TPI_LOG_DIRECTORY/status-$TPI_MACHINE_IDENTITY" && RCLONE_CONFIG= rclone copy "$TPI_LOG_DIRECTORY" "\$RCLONE_REMOTE/reports"'
86+
# Studio logging is best-effort: it is run with ';' so that its exit status
# cannot bypass the "stopping" check that guards the status report below.
ExecStop=/bin/bash -c 'source /opt/task/credentials; /usr/bin/tpi-task-studio-log; systemctl is-system-running | grep stopping || echo "{\\\\"result\\\\": \\\\"\$SERVICE_RESULT\\\\", \\\\"code\\\\": \\\\"\$EXIT_STATUS\\\\", \\\\"status\\\\": \\\\"\$EXIT_CODE\\\\"}" > "$TPI_LOG_DIRECTORY/status-$TPI_MACHINE_IDENTITY" && RCLONE_CONFIG= rclone copy "$TPI_LOG_DIRECTORY" "\$RCLONE_REMOTE/reports"'
5287
ExecStopPost=/usr/bin/tpi-task-shutdown
5388
Environment=HOME=/root
5489
EnvironmentFile=/opt/task/variables
@@ -101,6 +136,8 @@ if test -f /etc/apt/sources.list.d/cuda.list; then
101136
for list in cuda nvidia-ml; do mv /etc/apt/sources.list.d/$list.list{.backup,}; done
102137
fi
103138

139+
/usr/bin/tpi-task-studio-log running
140+
104141
sudo systemctl daemon-reload
105142
sudo systemctl enable tpi-task.service --now
106143
sudo systemctl disable --now apt-daily.timer

task/common/machine/testdata/machine_script_full.golden

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,51 @@ sleep 20; while pgrep rclone > /dev/null; do sleep 1; done
1313
source /opt/task/credentials
1414
(systemctl is-system-running | grep stopping) || leo stop --cloud="$TPI_TASK_CLOUD_PROVIDER" --region="$TPI_TASK_CLOUD_REGION" "$TPI_TASK_IDENTIFIER";
1515
END
16-
1716
chmod u=rwx,g=rx,o=rx /usr/bin/tpi-task-shutdown
1817

18+
# Install the Studio status reporter. The quoted 'END' delimiter keeps the
# script below literal, so none of its variables are expanded at install time.
sudo tee /usr/bin/tpi-task-studio-log > /dev/null << 'END'
#!/bin/bash
# Post the task status to the Studio live endpoint.
#
# Usage: tpi-task-studio-log [STATUS]
#   STATUS  status to report. When omitted it is derived at run time:
#           "queued" if systemctl reports the system as stopping, otherwise
#           "timeout", "succeeded" or "failed" from SERVICE_RESULT and
#           EXIT_STATUS.
#
# Environment read: STUDIO_TOKEN (nothing is sent when it is empty),
# STUDIO_URL, STUDIO_STEP, STUDIO_REPO_URL, STUDIO_BASELINE_SHA,
# TPI_TASK_IDENTIFIER, TPI_TASK_CLOUD_PROVIDER, TPI_MACHINE, TPI_REGION,
# TPI_DISK_SIZE, TPI_TASK_DATE_START, SERVICE_RESULT, EXIT_STATUS.
URL="${STUDIO_URL:-https://studio.iterative.ai/api/live}"
STEP="${STUDIO_STEP:-$(date +%s)}"
STATUS=$1
DATE_START="${TPI_TASK_DATE_START:-0}"
DATE_END=0

if [ -n "$STUDIO_TOKEN" ]; then
  if [ -z "$STATUS" ]; then
    # -q: only the exit status matters; do not echo the match into the logs.
    if systemctl is-system-running | grep -q stopping; then
      STATUS=queued
    # Quoted comparisons stay well-formed when the variables are unset.
    elif [[ "$SERVICE_RESULT" == timeout ]]; then
      STATUS=timeout
    elif [[ "$EXIT_STATUS" == 0 ]]; then
      STATUS=succeeded
    else
      STATUS=failed
    fi
  fi

  # Only the terminal statuses carry an end timestamp; otherwise it stays 0.
  if [[ "$STATUS" =~ ^(timeout|succeeded|failed)$ ]]; then
    DATE_END=$(date +%s)
  fi

  # NOTE: values are interpolated into the JSON verbatim (no escaping), so
  # they must not contain double quotes or backslashes; STEP, DATE_START and
  # DATE_END are emitted unquoted and must be numeric.
  STUDIO_PARAMS="{\"task\": {\"id\": \"${TPI_TASK_IDENTIFIER}\", \"status\": \"${STATUS}\", \"cloud\": \"${TPI_TASK_CLOUD_PROVIDER}\", \"machine\": \"${TPI_MACHINE}\", \"region\": \"${TPI_REGION}\", \"diskSize\": \"${TPI_DISK_SIZE}\", \"dateStart\": ${DATE_START}, \"dateEnd\": ${DATE_END}}}"
  STUDIO_PAYLOAD="{\"type\": \"data\", \"client\": \"dvclive\", \"repo_url\": \"${STUDIO_REPO_URL}\", \"baseline_sha\": \"${STUDIO_BASELINE_SHA}\", \"name\": \"TPI_TASK:${TPI_TASK_IDENTIFIER}\", \"step\":${STEP}, \"params\": ${STUDIO_PARAMS}}"
  curl -X POST "$URL" \
    -H "Content-Type: application/json" \
    -H "Authorization: token ${STUDIO_TOKEN}" \
    -d "${STUDIO_PAYLOAD}"
fi
END
chmod u=rwx,g=rx,o=rx /usr/bin/tpi-task-studio-log
52+
1953
base64 --decode << END | sudo tee /opt/task/variables > /dev/null
2054
S0VZPSJWQUxVRSIK
2155
END
56+
chmod u=rw,g=,o= /opt/task/variables
57+
2258
base64 --decode << END | sudo tee /opt/task/credentials > /dev/null
2359
ZXhwb3J0IFNFQ1JFVD1WQUxVRQo=
2460
END
25-
chmod u=rw,g=,o= /opt/task/variables
2661
chmod u=rw,g=,o= /opt/task/credentials
2762

2863
while IFS= read -rd $'\0' variable; do
@@ -48,7 +83,7 @@ sudo tee /etc/systemd/system/tpi-task.service > /dev/null <<END
4883
[Service]
4984
Type=simple
5085
ExecStart=-$TPI_START_COMMAND
51-
ExecStop=/bin/bash -c 'source /opt/task/credentials; systemctl is-system-running | grep stopping || echo "{\\\\"result\\\\": \\\\"\$SERVICE_RESULT\\\\", \\\\"code\\\\": \\\\"\$EXIT_STATUS\\\\", \\\\"status\\\\": \\\\"\$EXIT_CODE\\\\"}" > "$TPI_LOG_DIRECTORY/status-$TPI_MACHINE_IDENTITY" && RCLONE_CONFIG= rclone copy "$TPI_LOG_DIRECTORY" "\$RCLONE_REMOTE/reports"'
86+
# Studio logging is best-effort: it is run with ';' so that its exit status
# cannot bypass the "stopping" check that guards the status report below.
ExecStop=/bin/bash -c 'source /opt/task/credentials; /usr/bin/tpi-task-studio-log; systemctl is-system-running | grep stopping || echo "{\\\\"result\\\\": \\\\"\$SERVICE_RESULT\\\\", \\\\"code\\\\": \\\\"\$EXIT_STATUS\\\\", \\\\"status\\\\": \\\\"\$EXIT_CODE\\\\"}" > "$TPI_LOG_DIRECTORY/status-$TPI_MACHINE_IDENTITY" && RCLONE_CONFIG= rclone copy "$TPI_LOG_DIRECTORY" "\$RCLONE_REMOTE/reports"'
5287
ExecStopPost=/usr/bin/tpi-task-shutdown
5388
Environment=HOME=/root
5489
EnvironmentFile=/opt/task/variables
@@ -101,6 +136,8 @@ if test -f /etc/apt/sources.list.d/cuda.list; then
101136
for list in cuda nvidia-ml; do mv /etc/apt/sources.list.d/$list.list{.backup,}; done
102137
fi
103138

139+
/usr/bin/tpi-task-studio-log running
140+
104141
sudo systemctl daemon-reload
105142
sudo systemctl enable tpi-task.service --now
106143
sudo systemctl disable --now apt-daily.timer

task/common/machine/testdata/machine_script_minimal.golden

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,51 @@ sleep 20; while pgrep rclone > /dev/null; do sleep 1; done
1313
source /opt/task/credentials
1414
(systemctl is-system-running | grep stopping) || leo stop --cloud="$TPI_TASK_CLOUD_PROVIDER" --region="$TPI_TASK_CLOUD_REGION" "$TPI_TASK_IDENTIFIER";
1515
END
16-
1716
chmod u=rwx,g=rx,o=rx /usr/bin/tpi-task-shutdown
1817

18+
# Install the Studio status reporter. The quoted 'END' delimiter keeps the
# script below literal, so none of its variables are expanded at install time.
sudo tee /usr/bin/tpi-task-studio-log > /dev/null << 'END'
#!/bin/bash
# Post the task status to the Studio live endpoint.
#
# Usage: tpi-task-studio-log [STATUS]
#   STATUS  status to report. When omitted it is derived at run time:
#           "queued" if systemctl reports the system as stopping, otherwise
#           "timeout", "succeeded" or "failed" from SERVICE_RESULT and
#           EXIT_STATUS.
#
# Environment read: STUDIO_TOKEN (nothing is sent when it is empty),
# STUDIO_URL, STUDIO_STEP, STUDIO_REPO_URL, STUDIO_BASELINE_SHA,
# TPI_TASK_IDENTIFIER, TPI_TASK_CLOUD_PROVIDER, TPI_MACHINE, TPI_REGION,
# TPI_DISK_SIZE, TPI_TASK_DATE_START, SERVICE_RESULT, EXIT_STATUS.
URL="${STUDIO_URL:-https://studio.iterative.ai/api/live}"
STEP="${STUDIO_STEP:-$(date +%s)}"
STATUS=$1
DATE_START="${TPI_TASK_DATE_START:-0}"
DATE_END=0

if [ -n "$STUDIO_TOKEN" ]; then
  if [ -z "$STATUS" ]; then
    # -q: only the exit status matters; do not echo the match into the logs.
    if systemctl is-system-running | grep -q stopping; then
      STATUS=queued
    # Quoted comparisons stay well-formed when the variables are unset.
    elif [[ "$SERVICE_RESULT" == timeout ]]; then
      STATUS=timeout
    elif [[ "$EXIT_STATUS" == 0 ]]; then
      STATUS=succeeded
    else
      STATUS=failed
    fi
  fi

  # Only the terminal statuses carry an end timestamp; otherwise it stays 0.
  if [[ "$STATUS" =~ ^(timeout|succeeded|failed)$ ]]; then
    DATE_END=$(date +%s)
  fi

  # NOTE: values are interpolated into the JSON verbatim (no escaping), so
  # they must not contain double quotes or backslashes; STEP, DATE_START and
  # DATE_END are emitted unquoted and must be numeric.
  STUDIO_PARAMS="{\"task\": {\"id\": \"${TPI_TASK_IDENTIFIER}\", \"status\": \"${STATUS}\", \"cloud\": \"${TPI_TASK_CLOUD_PROVIDER}\", \"machine\": \"${TPI_MACHINE}\", \"region\": \"${TPI_REGION}\", \"diskSize\": \"${TPI_DISK_SIZE}\", \"dateStart\": ${DATE_START}, \"dateEnd\": ${DATE_END}}}"
  STUDIO_PAYLOAD="{\"type\": \"data\", \"client\": \"dvclive\", \"repo_url\": \"${STUDIO_REPO_URL}\", \"baseline_sha\": \"${STUDIO_BASELINE_SHA}\", \"name\": \"TPI_TASK:${TPI_TASK_IDENTIFIER}\", \"step\":${STEP}, \"params\": ${STUDIO_PARAMS}}"
  curl -X POST "$URL" \
    -H "Content-Type: application/json" \
    -H "Authorization: token ${STUDIO_TOKEN}" \
    -d "${STUDIO_PAYLOAD}"
fi
END
chmod u=rwx,g=rx,o=rx /usr/bin/tpi-task-studio-log
52+
1953
base64 --decode << END | sudo tee /opt/task/variables > /dev/null
2054
2155
END
56+
chmod u=rw,g=,o= /opt/task/variables
57+
2258
base64 --decode << END | sudo tee /opt/task/credentials > /dev/null
2359
2460
END
25-
chmod u=rw,g=,o= /opt/task/variables
2661
chmod u=rw,g=,o= /opt/task/credentials
2762

2863
while IFS= read -rd $'\0' variable; do
@@ -48,7 +83,7 @@ sudo tee /etc/systemd/system/tpi-task.service > /dev/null <<END
4883
[Service]
4984
Type=simple
5085
ExecStart=-$TPI_START_COMMAND
51-
ExecStop=/bin/bash -c 'source /opt/task/credentials; systemctl is-system-running | grep stopping || echo "{\\\\"result\\\\": \\\\"\$SERVICE_RESULT\\\\", \\\\"code\\\\": \\\\"\$EXIT_STATUS\\\\", \\\\"status\\\\": \\\\"\$EXIT_CODE\\\\"}" > "$TPI_LOG_DIRECTORY/status-$TPI_MACHINE_IDENTITY" && RCLONE_CONFIG= rclone copy "$TPI_LOG_DIRECTORY" "\$RCLONE_REMOTE/reports"'
86+
# Studio logging is best-effort: it is run with ';' so that its exit status
# cannot bypass the "stopping" check that guards the status report below.
ExecStop=/bin/bash -c 'source /opt/task/credentials; /usr/bin/tpi-task-studio-log; systemctl is-system-running | grep stopping || echo "{\\\\"result\\\\": \\\\"\$SERVICE_RESULT\\\\", \\\\"code\\\\": \\\\"\$EXIT_STATUS\\\\", \\\\"status\\\\": \\\\"\$EXIT_CODE\\\\"}" > "$TPI_LOG_DIRECTORY/status-$TPI_MACHINE_IDENTITY" && RCLONE_CONFIG= rclone copy "$TPI_LOG_DIRECTORY" "\$RCLONE_REMOTE/reports"'
5287
ExecStopPost=/usr/bin/tpi-task-shutdown
5388
Environment=HOME=/root
5489
EnvironmentFile=/opt/task/variables
@@ -101,6 +136,8 @@ if test -f /etc/apt/sources.list.d/cuda.list; then
101136
for list in cuda nvidia-ml; do mv /etc/apt/sources.list.d/$list.list{.backup,}; done
102137
fi
103138

139+
/usr/bin/tpi-task-studio-log running
140+
104141
sudo systemctl daemon-reload
105142
sudo systemctl enable tpi-task.service --now
106143
sudo systemctl disable --now apt-daily.timer

0 commit comments

Comments
 (0)