Skip to content

Commit 6f58eec

Browse files
Malmahrouqi3, mohdsaid497566, sbryngelson, wilfonba
authored
Frontier Benchmarking (#453) (#881)
Co-authored-by: mohdsaid497566 <mohdsaid497566@gmail.com> Co-authored-by: Spencer Bryngelson <sbryngelson@gmail.com> Co-authored-by: Spencer Bryngelson <shb@gatech.edu> Co-authored-by: wilfonba <bwilfong3@gatech.edu>
1 parent da8ae1c commit 6f58eec

File tree

5 files changed

+119
-13
lines changed

5 files changed

+119
-13
lines changed

.github/workflows/bench.yml

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,40 @@ jobs:
2323
filters: ".github/file-filter.yml"
2424

2525
self:
26-
name: Georgia Tech | Phoenix (NVHPC)
26+
name: "${{ matrix.name }} (${{ matrix.device }})"
2727
if: ${{ github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && (
2828
(github.event_name == 'pull_request_review' && github.event.review.state == 'approved') ||
2929
(github.event_name == 'pull_request' && github.event.pull_request.user.login == 'sbryngelson')
30-
) }}
30+
) }}
3131
needs: file-changes
3232
strategy:
33-
matrix:
34-
device: ['cpu', 'gpu']
3533
fail-fast: false
34+
matrix:
35+
include:
36+
- cluster: phoenix
37+
name: Georgia Tech | Phoenix (NVHPC)
38+
group: phoenix
39+
labels: gt
40+
flag: p
41+
device: cpu
42+
build_script: ""
43+
- cluster: phoenix
44+
name: Georgia Tech | Phoenix (NVHPC)
45+
group: phoenix
46+
labels: gt
47+
flag: p
48+
device: gpu
49+
build_script: ""
50+
- cluster: frontier
51+
name: Oak Ridge | Frontier (CCE)
52+
group: phoenix
53+
labels: frontier
54+
flag: f
55+
device: gpu
56+
build_script: "bash .github/workflows/frontier/build.sh gpu bench"
3657
runs-on:
37-
group: phoenix
38-
labels: gt
58+
group: ${{ matrix.group }}
59+
labels: ${{ matrix.labels }}
3960
timeout-minutes: 1400
4061
env:
4162
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
@@ -53,15 +74,22 @@ jobs:
5374
ref: master
5475
path: master
5576

77+
- name: Setup & Build
78+
if: matrix.build_script != ''
79+
run: |
80+
(cd pr && ${{ matrix.build_script }}) &
81+
(cd master && ${{ matrix.build_script }}) &
82+
wait %1 && wait %2
83+
5684
- name: Bench (Master v. PR)
5785
run: |
58-
(cd pr && bash .github/workflows/phoenix/submit-bench.sh .github/workflows/phoenix/bench.sh ${{ matrix.device }}) &
59-
(cd master && bash .github/workflows/phoenix/submit-bench.sh .github/workflows/phoenix/bench.sh ${{ matrix.device }}) &
86+
(cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) &
87+
(cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) &
6088
wait %1 && wait %2
6189
6290
- name: Generate & Post Comment
6391
run: |
64-
(cd pr && . ./mfc.sh load -c p -m g)
92+
(cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g)
6593
(cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}.yaml ../pr/bench-${{ matrix.device }}.yaml)
6694
6795
- name: Print Logs
@@ -72,9 +100,9 @@ jobs:
72100
73101
- name: Archive Logs
74102
uses: actions/upload-artifact@v4
75-
if: always()
103+
if: always()
76104
with:
77-
name: logs-${{ matrix.device }}
105+
name: ${{ matrix.cluster }}-${{ matrix.device }}
78106
path: |
79107
pr/bench-${{ matrix.device }}.*
80108
pr/build/benchmarks/*

.github/workflows/frontier/bench.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
# Benchmark payload for Frontier.
#
# Reads two variables that this script does not set itself:
#   job_device  "gpu" selects the GPU path; any other value takes the CPU path
#   job_slug    stem of the YAML results file written by `./mfc.sh bench`

n_ranks=12       # rank count used when not running on GPUs
device_opts=""   # start empty so a value inherited from the environment is never forwarded

if [ "$job_device" = "gpu" ]; then
    # Take the first column of `rocm-smi --showid`, keep only the numeric part,
    # and de-duplicate. `sort -un` also removes repeats that are not adjacent,
    # which a bare `uniq` would miss.
    gpu_ids=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | sort -un | tr '\n' ' ' | sed 's/ $//')
    n_ranks=$(echo "$gpu_ids" | wc -w)   # one rank per detected GPU

    # Fail early with a clear message instead of launching with `-n 0` and an
    # empty `-g` list.
    if [ "$n_ranks" -eq 0 ]; then
        echo "Error: rocm-smi reported no GPUs; cannot run the GPU benchmark." >&2
        exit 1
    fi

    device_opts="--gpu -g $gpu_ids"

    # $device_opts and $n_ranks are intentionally unquoted: they must split
    # into separate command-line words.
    ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
else
    ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
fi

.github/workflows/frontier/build.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,12 @@ if [ "$1" = "gpu" ]; then
66
fi
77

88
. ./mfc.sh load -c f -m g
9-
./mfc.sh test --dry-run -j 8 $build_opts
9+
10+
if [ "$2" == "bench" ]; then
11+
for dir in benchmarks/*/; do
12+
dirname=$(basename "$dir")
13+
./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts
14+
done
15+
else
16+
./mfc.sh test --dry-run -j 8 $build_opts
17+
fi
.github/workflows/frontier/submit-bench.sh

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/bin/bash
# Submit a script to Slurm as a single-node batch job and wait for it to end.
#
# Arguments:
#   $1  path to the script whose contents become the body of the batch job
#   $2  "cpu" or "gpu"; selects the task count requested from Slurm and is
#       exposed to the job body as $job_device
#
# The job body also sees $job_slug, derived from the script's file name and $2.
# Exits 1 (after printing usage) when an argument is missing or invalid.

set -e

usage() {
    echo "Usage: $0 [script.sh] [cpu|gpu]"
}

if [ -z "$1" ]; then
    usage
    exit 1
fi

# Report an unreadable payload explicitly rather than relying on cat's error.
if [ ! -r "$1" ]; then
    echo "Error: cannot read script '$1'." >&2
    usage
    exit 1
fi

sbatch_script_contents=$(cat "$1")

case "$2" in
    cpu)
        sbatch_device_opts="#SBATCH -n 32 # Number of cores required"
        ;;
    gpu)
        sbatch_device_opts="#SBATCH -n 8 # Number of cores required"
        ;;
    *)
        usage
        exit 1
        ;;
esac

# File name without its .sh suffix, every non-alphanumeric character replaced
# by '-', followed by the device name.
job_slug="$(basename "$1" | sed -e 's/\.sh$//' -e 's/[^a-zA-Z0-9]/-/g')-$2"

# The heredoc delimiter is unquoted, so every unescaped expansion below is
# resolved now, at submission time. Anything that must be evaluated inside the
# running job is escaped with a backslash. The payload text is inserted
# verbatim: expansion results are not re-scanned.
sbatch <<EOT
#!/bin/bash
#SBATCH -JMFC-$job_slug # Job name
#SBATCH -A CFD154 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 03:59:00 # Duration of the job (Ex: 15 mins)
#SBATCH -o$job_slug.out # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
#SBATCH -W # Do not exit until the submitted job terminates.

set -e
set -x

cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"

job_slug="$job_slug"
job_device="$2"

. ./mfc.sh load -c f -m g

$sbatch_script_contents

EOT
54+

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,4 +128,4 @@ jobs:
128128
if: always()
129129
with:
130130
name: logs-${{ strategy.job-index }}-${{ matrix.device }}
131-
path: test-${{ matrix.device }}.out
131+
path: test-${{ matrix.device }}.out

0 commit comments

Comments
 (0)