|
#!/bin/bash
# Machine bootstrap script (rendered from a template; {{.Name}} placeholders
# are substituted before execution).
#
# This section creates the task working directory and installs the task
# script as /usr/bin/tpi-task.

# Working directory of the task; deliberately writable by everyone.
sudo mkdir --parents /opt/task/directory
sudo chmod u=rwx,g=rwx,o=rwx /opt/task/directory

# The placeholder is piped through base64 --decode, so the rendered value
# must be base64 text; the decoded result becomes the task executable.
base64 --decode << END | sudo tee /usr/bin/tpi-task > /dev/null
{{.TaskScript}}
END
# o=rx rather than a=rx: "a=" applies to user, group and other alike, which
# would reset the owner bits to r-x and silently drop u+w.
sudo chmod u=rwx,g=rx,o=rx /usr/bin/tpi-task
| 9 | + |
# Install /usr/bin/tpi-task-shutdown, the hook the tpi-task unit runs as
# ExecStopPost. The quoted 'END' delimiter keeps every $VARIABLE literal, so
# they are resolved when the hook runs, not while this script is executing.
# The hook waits for running rclone processes to finish, re-runs the CML
# workflow when $CI is set, and calls "tpi stop" unless the system is already
# shutting down.
# stdout of tee is discarded so the hook body is not echoed into the output
# of this script.
sudo tee /usr/bin/tpi-task-shutdown > /dev/null << 'END'
#!/bin/bash
sleep 20; while pgrep rclone > /dev/null; do sleep 1; done
source /opt/task/credentials
if ! test -z "$CI"; then
  cml rerun-workflow
fi
(systemctl is-system-running | grep stopping) || tpi stop --cloud="$TPI_TASK_CLOUD_PROVIDER" --region="$TPI_TASK_CLOUD_REGION" "$TPI_TASK_IDENTIFIER";
END

sudo chmod u=rwx,g=rx,o=rx /usr/bin/tpi-task-shutdown
| 21 | + |
# Write the task environment and the credentials to /opt/task.
#
# Both files are created empty with owner-only permissions *before* any
# content is written: tee truncates an existing file without changing its
# mode, so the decoded secrets are never readable by other users, not even
# for the instant between the write and the chmod.
sudo install --mode=600 /dev/null /opt/task/variables
sudo install --mode=600 /dev/null /opt/task/credentials

# Each placeholder is piped through base64 --decode, so the rendered values
# must be base64 text.
base64 --decode << END | sudo tee /opt/task/variables > /dev/null
{{.Environment}}
END
base64 --decode << END | sudo tee /opt/task/credentials > /dev/null
{{.Credentials}}
END

# Re-assert the final permissions explicitly.
sudo chmod u=rw,g=,o= /opt/task/variables
sudo chmod u=rw,g=,o= /opt/task/credentials
| 30 | + |
# Export every NAME="value" entry of /opt/task/variables into this shell.
#
# The perl program feeding the loop rewrites the file so that each entry
# (up to its closing quote) is terminated by a NUL byte, which lets values
# span several lines. For each entry, the inner perl program turns \" into "
# and removes the quotes surrounding the value before it is exported.
while IFS= read -r -d '' entry; do
  assignment="$(perl -0777p -e 's/\\"/"/g;' -e 's/(.+?)="(.+)"/$1=$2/sg' <<< "$entry")"
  export "$assignment"
done < <(perl -0777pe 's/\n*(.+?=".*?((?<!\\)"|\\\\"))\n*/$1\x00/sg' /opt/task/variables)
| 34 | + |
# Paths and identifiers used by the unit file and the sync loops further down.
TPI_DATA_DIRECTORY="/opt/task/directory"
TPI_LOG_DIRECTORY="$(mktemp --directory)"
TPI_MACHINE_IDENTITY="$(uuidgen)"

# Seconds remaining until the rendered timeout value, which is compared
# against the current epoch time.
TPI_REMAINING_RUN_TIME=$(({{.Timeout}}-$(date +%s)))
if [[ "$TPI_REMAINING_RUN_TIME" -ge 1 ]]; then
  # Time left: run the task script through a login shell.
  TPI_START_COMMAND="/bin/bash -lc 'exec /usr/bin/tpi-task'"
else
  # Already past the timeout: start a placeholder process instead of the
  # task and give the unit a runtime limit of one second.
  TPI_START_COMMAND="/bin/bash -c 'sleep infinity'"
  TPI_REMAINING_RUN_TIME=1
fi

# Load the credentials into this shell.
# NOTE(review): presumably this is what defines RCLONE_REMOTE - confirm.
source /opt/task/credentials
| 47 | + |
# Write the systemd unit that runs the task.
#
# The heredoc delimiter is unquoted on purpose: $TPI_START_COMMAND,
# $TPI_LOG_DIRECTORY, $TPI_MACHINE_IDENTITY and $TPI_REMAINING_RUN_TIME are
# expanded right now and baked into the unit file, whereas the \$-escaped
# names (SERVICE_RESULT, EXIT_STATUS, EXIT_CODE, RCLONE_REMOTE) are written
# as literal $NAME and only resolved when ExecStop runs.
# ExecStop writes a small JSON status file into the log directory (unless the
# system is shutting down) and then copies that directory to
# "$RCLONE_REMOTE/reports".
sudo tee /etc/systemd/system/tpi-task.service << END > /dev/null
[Unit]
 After=default.target
[Service]
 Type=simple
 ExecStart=-$TPI_START_COMMAND
 ExecStop=/bin/bash -c 'source /opt/task/credentials; systemctl is-system-running | grep stopping || echo "{\\\\"result\\\\": \\\\"\$SERVICE_RESULT\\\\", \\\\"code\\\\": \\\\"\$EXIT_STATUS\\\\", \\\\"status\\\\": \\\\"\$EXIT_CODE\\\\"}" > "$TPI_LOG_DIRECTORY/status-$TPI_MACHINE_IDENTITY" && RCLONE_CONFIG= rclone copy "$TPI_LOG_DIRECTORY" "\$RCLONE_REMOTE/reports"'
 ExecStopPost=/usr/bin/tpi-task-shutdown
 Environment=HOME=/root
 EnvironmentFile=/opt/task/variables
 WorkingDirectory=/opt/task/directory
 RuntimeMaxSec=$TPI_REMAINING_RUN_TIME
[Install]
 WantedBy=default.target
END
| 63 | + |
#######################################
# Download a single-file executable and install it, owned by root and
# executable by everyone.
# Arguments: $1 - URL to download
#            $2 - destination path of the installed executable
# Returns:   0 on success; the failing command's status otherwise. Nothing
#            is installed when the download fails.
#######################################
tpi_install_release_binary() {
  local url=$1
  local destination=$2
  local download
  local status=0

  download="$(mktemp)" || return

  # --fail makes curl exit non-zero on HTTP errors instead of saving the
  # error page, so a broken download is never installed as an executable.
  curl --fail --location --output "$download" "$url" \
    && sudo install --mode=u=rwx,g=rx,o=rx --owner=root --group=root \
      "$download" "$destination" \
    || status=$?

  rm --force -- "$download"
  return "$status"
}

# Latest released builds of the tpi and cml command line tools.
tpi_install_release_binary \
  https://github.com/iterative/terraform-provider-iterative/releases/latest/download/terraform-provider-iterative_linux_amd64 \
  /usr/bin/tpi
tpi_install_release_binary \
  https://github.com/iterative/cml/releases/latest/download/cml-linux \
  /usr/bin/cml
| 72 | + |
#######################################
# Extract a zip archive into the current working directory. Uses unzip when
# it is available, otherwise python3's zipfile module, otherwise python's.
# Arguments: $1 - path to the zip archive
# Outputs:   whatever the selected extractor prints
# Returns:   the extractor's exit status; 2 when no archive path is given
#######################################
extract_here() {
  if [[ -z "${1:-}" ]]; then
    printf 'extract_here: missing archive path\n' >&2
    return 2
  fi

  # Redirection order matters: stdout must be pointed at /dev/null first so
  # that "2>&1" sends stderr there as well.
  if command -v unzip > /dev/null 2>&1; then
    unzip "$1"
  elif command -v python3 > /dev/null 2>&1; then
    python3 -m zipfile -e "$1" .
  else
    python -m zipfile -e "$1" .
  fi
}
| 82 | + |
# Install rclone from the official zip archive when it is not already on PATH.
if ! command -v rclone > /dev/null 2>&1; then
  # --fail prevents an HTTP error page from being saved as the archive and
  # --location follows redirects; extraction and installation only happen
  # when the download actually succeeded.
  if curl --fail --location --remote-name https://downloads.rclone.org/rclone-current-linux-amd64.zip; then
    extract_here rclone-current-linux-amd64.zip
    sudo cp rclone-*-linux-amd64/rclone /usr/bin
    sudo chmod u=rwx,g=rx,o=rx /usr/bin/rclone
    sudo chown root:root /usr/bin/rclone
  fi
  # Remove the archive and the extracted directory; --force keeps this quiet
  # when the download failed and there is nothing to remove.
  rm --recursive --force -- rclone-*-linux-amd64*
fi

# Pull the contents of "$RCLONE_REMOTE/data" into the task directory before
# the task starts.
rclone copy "$RCLONE_REMOTE/data" /opt/task/directory
| 93 | + |
# For GCP GPU machines: answer "yes" to every question of the driver
# installation prompt. Skipped when the prompt script is not present.
if test -x /etc/profile.d/install-driver-prompt.sh; then
  yes | /etc/profile.d/install-driver-prompt.sh
fi

# Fix the NVIDIA apt GPG keys
# (https://github.com/NVIDIA/cuda-repo-management/issues/1#issuecomment-1111490201).
# The NVIDIA source lists are moved aside so that "apt-get update" and the
# gpg installation work, the old key is removed, the new keys are fetched,
# and the lists are put back.
if test -f /etc/apt/sources.list.d/cuda.list; then
  for list in cuda nvidia-ml; do
    # Only cuda.list is known to exist here; skip lists that are absent.
    if test -f "/etc/apt/sources.list.d/$list.list"; then
      mv "/etc/apt/sources.list.d/$list.list"{,.backup}
    fi
  done
  apt-get update
  apt-get install --yes gpg
  apt-key del 7fa2af80
  # Keys are fetched over HTTPS only: a signing key retrieved over plain HTTP
  # could be replaced in transit.
  apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub
  apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/7fa2af80.pub
  for list in cuda nvidia-ml; do
    if test -f "/etc/apt/sources.list.d/$list.list.backup"; then
      mv "/etc/apt/sources.list.d/$list.list"{.backup,}
    fi
  done
fi
| 106 | + |
# Make systemd re-read its unit files, then enable and immediately start the
# task service.
sudo systemctl daemon-reload
sudo systemctl enable --now tpi-task.service
# Stop and disable the apt-daily timer.
# NOTE(review): presumably to keep background apt runs from interfering with
# the task - confirm.
sudo systemctl disable --now apt-daily.timer
| 110 | + |
# Background loop: every 5 seconds, dump the journal into the log directory
# and upload the directory whenever the checksums of its files have changed.
while sleep 5; do
  # The full machine journal is only captured when TPI_MACHINE_LOGS is set.
  if [[ -n "$TPI_MACHINE_LOGS" ]]; then
    journalctl > "$TPI_LOG_DIRECTORY/machine-$TPI_MACHINE_IDENTITY"
  fi

  # Journal of the task unit; sed rewrites the ISO timestamp prefix into
  # "date time message".
  journalctl --all --no-hostname --output=short-iso --quiet --unit=tpi-task --utc \
    | sed 's/^\([0-9-]*\)T\([0-9:]*\)+0000 \S*: \(.*\)/\1 \2 \3/g' \
    > "$TPI_LOG_DIRECTORY/task-$TPI_MACHINE_IDENTITY"

  # Nothing to upload while the checksums match the previous iteration.
  current_log_hash="$(md5sum "$TPI_LOG_DIRECTORY"/*)"
  [[ "$current_log_hash" == "$TPI_LOG_DIRECTORY_HASH" ]] && continue

  TPI_LOG_DIRECTORY_HASH="$current_log_hash"
  rclone sync "$TPI_LOG_DIRECTORY" "$RCLONE_REMOTE/reports"
done &
| 120 | + |
# Background loop: every 10 seconds, upload the data directory whenever the
# newest modification time found inside it differs from the last one seen.
while sleep 10; do
  latest_mtime="$(
    find "$TPI_DATA_DIRECTORY" -printf "%T@\n" \
      | sort \
      | tail -n 1
  )"

  # Unchanged since the previous iteration: skip the upload.
  [[ "$latest_mtime" == "$TPI_DATA_DIRECTORY_EPOCH" ]] && continue

  TPI_DATA_DIRECTORY_EPOCH="$latest_mtime"
  rclone sync "$TPI_DATA_DIRECTORY" "$RCLONE_REMOTE/data"
done &
0 commit comments