Skip to content

Commit 2c97b68

Browse files
Merge pull request #3632 from CliMA/sk/add_ss_to_cpu_scaling_pipeline
Add MBW strong scaling runs to CPU scaling pipeline.
2 parents 506be5b + 661211e commit 2c97b68

File tree

3 files changed

+112
-9
lines changed

3 files changed

+112
-9
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Instructions to trigger CPU scaling build
2+
3+
1). Visit URL: https://buildkite.com/clima/climaatmos-dot-jl-cpu-scaling
4+
5+
2). Click on "New Build"
6+
7+
3). Set branch to the PR branch
8+
9+
4). Click on "Create Build"
10+
11+
The CPU scaling build is useful for understanding the CPU scaling performance implications of merging a PR.
12+
For higher-resolution simulations, with horizontal resolutions of 13 km or finer, memory footprint constraints
13+
dictated the use of at most 16 Message Passing Interface (MPI) ranks per node. To maintain consistency
14+
in the CPU scaling studies, we use 16 MPI ranks per node for all simulations, although this results in
15+
under-utilization of the available computing resources at lower resolutions.

.buildkite/cpu_scaling_pipeline/generate_pipeline.jl

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,16 @@
22
# To run: `julia --project=.buildkite .buildkite/cpu_scaling_pipeline/generate_pipeline.jl`
33
# nodes = (1, 2, 4, 8, 16, 32)
44
# helems = (30, 42, 60, 84, 120, 170)
5-
nodes = (1, 2, 4)
6-
helems = (30, 42, 60)
5+
6+
# strong scaling
7+
ss_nodes = (1, 2, 4) # number of nodes for strong scaling runs
8+
ss_helems = (30, 60, 120) # helems for strong scaling runs
9+
ss_procspernode = 16 # number of MPI processes per node
10+
11+
# weak scaling
12+
ws_nodes = (1, 2, 4) # number of nodes for weak scaling runs
13+
ws_helems = (30, 42, 60) # helems for weak scaling runs
14+
ws_procspernode = 16 # number of MPI processes per node
715

816
import YAML
917

@@ -36,19 +44,41 @@ init_step = Dict(
3644
),
3745
)
3846

39-
function generate_step(nodes::Int, helems::Int)
47+
function generate_step_ws(nodes::Int, helems::Int, procspernode::Int)
4048
return Dict(
41-
"label" => ":computer: $nodes node, 16 processes per node, helem = $helems",
49+
"label" => ":computer: MBW weak scaling, $nodes nodes, $procspernode processes per node, helem = $helems",
4250
"command" => "srun julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $MBW_SCALING_CONFIG_PATH/moist_baroclinic_wave_helem_$(helems)_0M_ws.yml --job_id moist_baroclinic_wave_helem_$(helems)_0M_ws",
4351
"artifact_paths" => "moist_baroclinic_wave_helem_$(helems)_0M_ws/output_active/*",
52+
"key" => "ws_$(nodes)_nodes",
4453
"agents" => Dict(
4554
"slurm_constraint" => CPU_TYPE,
4655
"queue" => "new-central",
4756
"slurm_nodes" => nodes,
48-
"slurm_tasks_per_node" => 16,
57+
"slurm_tasks_per_node" => procspernode,
4958
"slurm_cpus_per_task" => 1,
5059
"slurm_mem" => 0,
5160
"slurm_time" => "1:00:00",
61+
"slurm_reservation" => "false",
62+
"slurm_exclusive" => true,
63+
),
64+
)
65+
end
66+
67+
function generate_step_ss(nodes::Int, helems::Int, procspernode::Int)
68+
return Dict(
69+
"label" => ":computer: MBW strong scaling, $nodes nodes, $procspernode processes per node, helem = $helems",
70+
"command" => "srun julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $MBW_SCALING_CONFIG_PATH/moist_baroclinic_wave_helem_$(helems)_0M_ss.yml --job_id moist_baroclinic_wave_helem_$(helems)_0M_ss",
71+
"artifact_paths" => "moist_baroclinic_wave_helem_$(helems)_0M_ss/output_active/*",
72+
"key" => "ss_$(nodes)_nodes",
73+
"agents" => Dict(
74+
"slurm_constraint" => CPU_TYPE,
75+
"queue" => "new-central",
76+
"slurm_nodes" => nodes,
77+
"slurm_tasks_per_node" => procspernode,
78+
"slurm_cpus_per_task" => 1,
79+
"slurm_mem" => 0,
80+
"slurm_time" => "1:00:00",
81+
"slurm_reservation" => "false",
5282
"slurm_exclusive" => true,
5383
),
5484
)
@@ -62,7 +92,15 @@ pipeline = Dict(
6292
"wait",
6393
Dict(
6494
"group" => "Moist Baroclinic Wave, weak scaling",
65-
"steps" => [generate_step.(nodes, helems)...],
95+
"steps" => [
96+
generate_step_ws.(ws_nodes, ws_helems, ws_procspernode)...,
97+
],
98+
),
99+
Dict(
100+
"group" => "Moist Baroclinic Wave, strong scaling",
101+
"steps" => [
102+
generate_step_ss.(ss_nodes, ss_helems, ss_procspernode)...,
103+
],
66104
),
67105
],
68106
)

.buildkite/cpu_scaling_pipeline/pipeline.yml

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ steps:
1818
JULIA_MAX_NUM_PRECOMPILE_FILES: 50
1919
- "wait"
2020
- steps:
21-
- label: ":computer: 1 node, 16 processes per node, helem = 30"
21+
- label: ":computer: MBW weak scaling, 1 nodes, 16 processes per node, helem = 30"
22+
key: "ws_1_nodes"
2223
agents:
2324
slurm_time: "1:00:00"
2425
slurm_cpus_per_task: 1
@@ -28,9 +29,11 @@ steps:
2829
slurm_exclusive: true
2930
slurm_mem: 0
3031
slurm_constraint: "icelake"
32+
slurm_reservation: "false"
3133
command: "srun julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file config/mbw_scaling_configs/moist_baroclinic_wave_helem_30_0M_ws.yml --job_id moist_baroclinic_wave_helem_30_0M_ws"
3234
artifact_paths: "moist_baroclinic_wave_helem_30_0M_ws/output_active/*"
33-
- label: ":computer: 2 node, 16 processes per node, helem = 42"
35+
- label: ":computer: MBW weak scaling, 2 nodes, 16 processes per node, helem = 42"
36+
key: "ws_2_nodes"
3437
agents:
3538
slurm_time: "1:00:00"
3639
slurm_cpus_per_task: 1
@@ -40,9 +43,11 @@ steps:
4043
slurm_exclusive: true
4144
slurm_mem: 0
4245
slurm_constraint: "icelake"
46+
slurm_reservation: "false"
4347
command: "srun julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file config/mbw_scaling_configs/moist_baroclinic_wave_helem_42_0M_ws.yml --job_id moist_baroclinic_wave_helem_42_0M_ws"
4448
artifact_paths: "moist_baroclinic_wave_helem_42_0M_ws/output_active/*"
45-
- label: ":computer: 4 node, 16 processes per node, helem = 60"
49+
- label: ":computer: MBW weak scaling, 4 nodes, 16 processes per node, helem = 60"
50+
key: "ws_4_nodes"
4651
agents:
4752
slurm_time: "1:00:00"
4853
slurm_cpus_per_task: 1
@@ -52,9 +57,54 @@ steps:
5257
slurm_exclusive: true
5358
slurm_mem: 0
5459
slurm_constraint: "icelake"
60+
slurm_reservation: "false"
5561
command: "srun julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file config/mbw_scaling_configs/moist_baroclinic_wave_helem_60_0M_ws.yml --job_id moist_baroclinic_wave_helem_60_0M_ws"
5662
artifact_paths: "moist_baroclinic_wave_helem_60_0M_ws/output_active/*"
5763
group: "Moist Baroclinic Wave, weak scaling"
64+
- steps:
65+
- label: ":computer: MBW strong scaling, 1 nodes, 16 processes per node, helem = 30"
66+
key: "ss_1_nodes"
67+
agents:
68+
slurm_time: "1:00:00"
69+
slurm_cpus_per_task: 1
70+
slurm_tasks_per_node: 16
71+
queue: "new-central"
72+
slurm_nodes: 1
73+
slurm_exclusive: true
74+
slurm_mem: 0
75+
slurm_constraint: "icelake"
76+
slurm_reservation: "false"
77+
command: "srun julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file config/mbw_scaling_configs/moist_baroclinic_wave_helem_30_0M_ss.yml --job_id moist_baroclinic_wave_helem_30_0M_ss"
78+
artifact_paths: "moist_baroclinic_wave_helem_30_0M_ss/output_active/*"
79+
- label: ":computer: MBW strong scaling, 2 nodes, 16 processes per node, helem = 60"
80+
key: "ss_2_nodes"
81+
agents:
82+
slurm_time: "1:00:00"
83+
slurm_cpus_per_task: 1
84+
slurm_tasks_per_node: 16
85+
queue: "new-central"
86+
slurm_nodes: 2
87+
slurm_exclusive: true
88+
slurm_mem: 0
89+
slurm_constraint: "icelake"
90+
slurm_reservation: "false"
91+
command: "srun julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file config/mbw_scaling_configs/moist_baroclinic_wave_helem_60_0M_ss.yml --job_id moist_baroclinic_wave_helem_60_0M_ss"
92+
artifact_paths: "moist_baroclinic_wave_helem_60_0M_ss/output_active/*"
93+
- label: ":computer: MBW strong scaling, 4 nodes, 16 processes per node, helem = 120"
94+
key: "ss_4_nodes"
95+
agents:
96+
slurm_time: "1:00:00"
97+
slurm_cpus_per_task: 1
98+
slurm_tasks_per_node: 16
99+
queue: "new-central"
100+
slurm_nodes: 4
101+
slurm_exclusive: true
102+
slurm_mem: 0
103+
slurm_constraint: "icelake"
104+
slurm_reservation: "false"
105+
command: "srun julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file config/mbw_scaling_configs/moist_baroclinic_wave_helem_120_0M_ss.yml --job_id moist_baroclinic_wave_helem_120_0M_ss"
106+
artifact_paths: "moist_baroclinic_wave_helem_120_0M_ss/output_active/*"
107+
group: "Moist Baroclinic Wave, strong scaling"
58108
env:
59109
SLURM_KILL_BAD_EXIT: 1
60110
OPENBLAS_NUM_THREADS: 1

0 commit comments

Comments
 (0)