Commit adcc0dd

Update actions to minimize node use. (#933)

1 parent 8026a1c commit adcc0dd
7 files changed: +292 −91 lines changed

.github/workflows/bench.yml

Lines changed: 4 additions & 3 deletions
@@ -97,10 +97,11 @@ jobs:
         run: |
           cat pr/bench-${{ matrix.device }}.* 2>/dev/null || true
           cat master/bench-${{ matrix.device }}.* 2>/dev/null || true
-
-      - name: Archive Logs
+
+      # All other runners (non-Phoenix) just run without special env
+      - name: Archive Logs (Frontier)
+        if: always() && matrix.cluster != 'phoenix'
         uses: actions/upload-artifact@v4
-        if: always()
         with:
           name: ${{ matrix.cluster }}-${{ matrix.device }}
           path: |

.github/workflows/phoenix/bench.sh

Lines changed: 3 additions & 2 deletions
@@ -2,7 +2,8 @@

 n_ranks=12

-if [ "$job_device" = "gpu" ]; then
+echo "My benchmarking device is:" $device
+if [ "$device" = "gpu" ]; then
     n_ranks=$(nvidia-smi -L | wc -l)        # number of GPUs on node
     gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
     device_opts="--gpu -g $gpu_ids"
@@ -15,7 +16,7 @@ mkdir -p $currentdir

 export TMPDIR=$currentdir

-if [ "$job_device" = "gpu" ]; then
+if [ "$device" = "gpu" ]; then
     ./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
 else
     ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
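
Note: bench.sh now branches on $device rather than $job_device, so it relies on the submitting wrapper exporting device into the job environment (the wrapper diff that follows does export device="${device}"). A minimal sketch of that contract, for illustration only; in the actual job the script body is inlined into the sbatch heredoc rather than invoked as a separate file:

export device="gpu"                        # provided by the submit wrapper's heredoc
echo "My benchmarking device is:" $device  # bench.sh then selects GPU ranks and flags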
Lines changed: 82 additions & 39 deletions
@@ -1,64 +1,107 @@
-#!/bin/bash
-
-set -e
+#!/usr/bin/env bash
+set -euo pipefail

 usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu]"
+    exit 1
 }

-if [ ! -z "$1" ]; then
-    sbatch_script_contents=`cat $1`
-else
-    usage
-    exit 1
-fi
+[[ $# -eq 2 ]] || usage

-sbatch_cpu_opts="\
+sbatch_script="$1"
+
+device="$2"
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
+
+# read the body of the user script
+sbatch_body=$(<"$sbatch_script")
+
+# common SBATCH directives
+sbatch_common_opts="\
+#SBATCH -J shb-${sbatch_script%%.sh}-$device   # job name
+#SBATCH --account=gts-sbryngelson3             # account
+#SBATCH -N1                                    # nodes
+#SBATCH -t 02:00:00                            # walltime
+#SBATCH -q embers                              # QOS
+#SBATCH -o $job_slug.out                       # stdout+stderr
+#SBATCH --mem-per-cpu=2G                       # default mem (overridden below)
+"
+
+# CPU vs GPU overrides
+if [[ "$device" == "cpu" ]]; then
+    sbatch_device_opts="\
 #SBATCH -p cpu-small                           # partition
 #SBATCH --ntasks-per-node=24                   # Number of cores per node required
 #SBATCH --mem-per-cpu=2G                       # Memory per core\
 "
-
-sbatch_gpu_opts="\
+elif [[ "$device" == "gpu" ]]; then
+    sbatch_device_opts="\
 #SBATCH -CL40S
 #SBATCH --ntasks-per-node=4                    # Number of cores per node required
 #SBATCH -G2\
 "
-
-if [ "$2" = "cpu" ]; then
-    sbatch_device_opts="$sbatch_cpu_opts"
-elif [ "$2" = "gpu" ]; then
-    sbatch_device_opts="$sbatch_gpu_opts"
 else
-    usage
-    exit 1
+    usage
 fi

-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
+# submit and capture the JobID
+JOBID=$(sbatch <<-EOT | awk '{print $4}'
+#!/usr/bin/env bash
+${sbatch_common_opts}
+${sbatch_device_opts}
+
+export job_slug="${job_slug}"
+export device="${device}"

-sbatch <<EOT
-#!/bin/bash
-#SBATCH -Jshb-$job_slug                        # Job name
-#SBATCH --account=gts-sbryngelson3             # charge account
-#SBATCH -N1                                    # Number of nodes required
-$sbatch_device_opts
-#SBATCH -t 02:00:00                            # Duration of the job (Ex: 15 mins)
-#SBATCH -q embers                              # QOS Name
-#SBATCH -o$job_slug.out                        # Combined output and error messages file
-#SBATCH -W                                     # Do not exit until the submitted job terminates.
+echo "Job slug is:" $job_slug
+echo "Device is:" $device
+
+set -e -x

-set -e
-set -x
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in \$(pwd):"

-cd "\$SLURM_SUBMIT_DIR"
-echo "Running in $(pwd):"
+# load your modules & env
+. ./mfc.sh load -c p -m $device

-job_slug="$job_slug"
-job_device="$2"
+# user script contents
+${sbatch_body}
+EOT
+)

-. ./mfc.sh load -c p -m $2
+echo "🚀 Submitted SLURM job $JOBID"

-$sbatch_script_contents
+# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
+trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT

-EOT
+# ────────── Poll until SLURM job finishes ──────────
+while :; do
+    # Try sacct first
+    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
+
+    # Fallback to squeue if sacct is empty
+    if [[ -z "$STATE" ]]; then
+        STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
+    fi
+
+    # If it’s one of SLURM’s terminal states, break immediately
+    case "$STATE" in
+        COMPLETED|FAILED|CANCELLED|TIMEOUT)
+            echo "✅ SLURM job $JOBID reached terminal state: $STATE"
+            break
+            ;;
+        "")
+            echo "✅ SLURM job $JOBID no longer in queue; assuming finished"
+            break
+            ;;
+        *)
+            echo "⏳ SLURM job $JOBID state: $STATE"
+            sleep 10
+            ;;
+    esac
+done

+# Now retrieve the exit code and exit with it
+EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
+echo "🔚 SLURM job $JOBID exit code: $EXIT_CODE"
+exit "$EXIT_CODE"
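
Note: sacct reports the ExitCode field as exitcode:signal (for example 0:0), so the cut -d: -f1 above keeps only the numeric exit status. A minimal sketch of the same parsing on a hypothetical value:

line="1:0"                              # hypothetical ExitCode (exit status 1, no signal)
EXIT_CODE=$(echo "$line" | cut -d: -f1) # yields 1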

.github/workflows/phoenix/submit.sh

Lines changed: 79 additions & 43 deletions
@@ -1,64 +1,100 @@
-#!/bin/bash
-
-set -e
+#!/usr/bin/env bash
+set -euo pipefail

 usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu]"
+    exit 1
 }

-if [ ! -z "$1" ]; then
-    sbatch_script_contents=`cat $1`
-else
-    usage
-    exit 1
-fi
+[[ $# -eq 2 ]] || usage
+
+sbatch_script="$1"
+device="$2"
+
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
+
+# read the body of the user script
+sbatch_body=$(<"$sbatch_script")

-sbatch_cpu_opts="\
-#SBATCH -p cpu-small                           # partition
-#SBATCH --ntasks-per-node=24                   # Number of cores per node required
-#SBATCH --mem-per-cpu=2G                       # Memory per core\
+# common SBATCH directives
+sbatch_common_opts="\
+#SBATCH -J shb-${sbatch_script%%.sh}-$device   # job name
+#SBATCH --account=gts-sbryngelson3             # account
+#SBATCH -N1                                    # nodes
+#SBATCH -t 03:00:00                            # walltime
+#SBATCH -q embers                              # QOS
+#SBATCH -o $job_slug.out                       # stdout+stderr
+#SBATCH --mem-per-cpu=2G                       # default mem (overridden below)
 "

-sbatch_gpu_opts="\
+# CPU vs GPU overrides
+if [[ "$device" == "cpu" ]]; then
+    sbatch_device_opts="\
+#SBATCH -p cpu-small
+#SBATCH --ntasks-per-node=24
+"
+elif [[ "$device" == "gpu" ]]; then
+    sbatch_device_opts="\
 #SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
-#SBATCH --ntasks-per-node=4                    # Number of cores per node required
-#SBATCH -G2\
+#SBATCH --ntasks-per-node=4
+#SBATCH -G2
 "
-
-if [ "$2" = "cpu" ]; then
-    sbatch_device_opts="$sbatch_cpu_opts"
-elif [ "$2" = "gpu" ]; then
-    sbatch_device_opts="$sbatch_gpu_opts"
 else
-    usage
-    exit 1
+    usage
 fi

-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
+# submit and capture the JobID
+JOBID=$(sbatch <<-EOT | awk '{print $4}'
+#!/usr/bin/env bash
+${sbatch_common_opts}
+${sbatch_device_opts}

-sbatch <<EOT
-#!/bin/bash
-#SBATCH -Jshb-$job_slug                        # Job name
-#SBATCH --account=gts-sbryngelson3             # charge account
-#SBATCH -N1                                    # Number of nodes required
-$sbatch_device_opts
-#SBATCH -t 03:00:00                            # Duration of the job (Ex: 15 mins)
-#SBATCH -q embers                              # QOS Name
-#SBATCH -o$job_slug.out                        # Combined output and error messages file
-#SBATCH -W                                     # Do not exit until the submitted job terminates.
+set -e -x

-set -e
-set -x
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in \$(pwd):"

-cd "\$SLURM_SUBMIT_DIR"
-echo "Running in $(pwd):"
+# load your modules & env
+. ./mfc.sh load -c p -m $device

-job_slug="$job_slug"
-job_device="$2"
+# user script contents
+${sbatch_body}
+EOT
+)

-. ./mfc.sh load -c p -m $2
+echo "🚀 Submitted SLURM job $JOBID"

-$sbatch_script_contents
+# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
+trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT

-EOT
+# ────────── Poll until SLURM job finishes ──────────
+while :; do
+    # Try sacct first
+    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
+
+    # Fallback to squeue if sacct is empty
+    if [[ -z "$STATE" ]]; then
+        STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
+    fi
+
+    # If it’s one of SLURM’s terminal states, break immediately
+    case "$STATE" in
+        COMPLETED|FAILED|CANCELLED|TIMEOUT)
+            echo "✅ SLURM job $JOBID reached terminal state: $STATE"
+            break
+            ;;
+        "")
+            echo "✅ SLURM job $JOBID no longer in queue; assuming finished"
+            break
+            ;;
+        *)
+            echo "⏳ SLURM job $JOBID state: $STATE"
+            sleep 10
+            ;;
+    esac
+done

+# Now retrieve the exit code and exit with it
+EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
+echo "🔚 SLURM job $JOBID exit code: $EXIT_CODE"
+exit "$EXIT_CODE"
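
For reference, the wrapper takes a user script and a device label; one plausible invocation from a workflow step, with the script path assumed rather than taken from this diff:

bash .github/workflows/phoenix/submit.sh .github/workflows/phoenix/test.sh gpu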

.github/workflows/phoenix/test.sh

Lines changed: 12 additions & 3 deletions
@@ -1,13 +1,19 @@
 #!/bin/bash

+tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
+currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
+mkdir -p $tmpbuild
+mkdir -p $currentdir
+export TMPDIR=$currentdir
+
+n_test_threads=8
+
 build_opts=""
 if [ "$job_device" = "gpu" ]; then
     build_opts="--gpu"
 fi

-./mfc.sh test --dry-run -j 8 $build_opts
-
-n_test_threads=8
+./mfc.sh test --dry-run -j $n_test_threads $build_opts

 if [ "$job_device" = "gpu" ]; then
     gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node
@@ -18,4 +24,7 @@ fi

 ./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix

+sleep 10
+rm -rf "$currentdir" || true

+unset TMPDIR
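
Note: pointing TMPDIR at a per-run scratch directory makes standard temp-file helpers write under it, so the later rm -rf cleans everything up. A small sketch with an example run directory matching the pattern above:

export TMPDIR=/storage/scratch1/6/sbryngelson3/mytmp_build/run-123  # example run dir
mktemp   # creates a temp file under $TMPDIR instead of /tmp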

.github/workflows/test.yml

Lines changed: 2 additions & 1 deletion
@@ -101,6 +101,7 @@ jobs:
       group: phoenix
       labels: ${{ matrix.lbl }}
     env:
+      NODE_OPTIONS: ${{ matrix.lbl == 'gt' && '--max-old-space-size=2048' || '' }}
       ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
       ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
@@ -125,7 +126,7 @@

       - name: Archive Logs
         uses: actions/upload-artifact@v4
-        if: always()
+        if: matrix.lbl == 'frontier'
         with:
           name: logs-${{ strategy.job-index }}-${{ matrix.device }}
           path: test-${{ matrix.device }}.out
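
Note: the NODE_OPTIONS entry caps the V8 old-space heap at 2048 MB for node-based actions on the 'gt' runners; the && / || pair is the usual GitHub Actions expression idiom for a conditional value. A quick sketch to inspect the heap limit a given NODE_OPTIONS value imposes (it prints a figure close to the configured cap), runnable anywhere node is available:

NODE_OPTIONS=--max-old-space-size=2048 node -e 'console.log((require("v8").getHeapStatistics().heap_size_limit/1048576).toFixed(0), "MB")'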
