Skip to content

Commit 65ba134

Browse files
committed
Add batching to AutoSparseJacobian
1 parent d13f24c commit 65ba134

File tree

9 files changed: +313 additions, -251 deletions (lines changed)

.buildkite/Manifest-v1.11.toml

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -370,7 +370,9 @@ version = "0.31.0"
370370

371371
[[deps.ClimaComms]]
372372
deps = ["Adapt", "Logging", "LoggingExtras"]
373-
git-tree-sha1 = "75b9d1a3b4e3efa2cbbae2eb7b52f14c0b38ccf0"
373+
git-tree-sha1 = "d341c3fc97a98dbecd6b635f34638b2d9771f94e"
374+
repo-rev = "dy/memory_api"
375+
repo-url = "https://github.com/CliMA/ClimaComms.jl.git"
374376
uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
375377
version = "0.6.8"
376378
weakdeps = ["CUDA", "MPI"]

.buildkite/ci_driver.jl

Lines changed: 34 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -66,60 +66,54 @@ if (
6666
scalar_names = CA.scalar_field_names(Y_end)
6767
block_keys = Iterators.product(scalar_names, scalar_names)
6868

69-
@info "Debugging Jacobian in first column of final state"
70-
rms(block) = sqrt(mean(abs2.(block)))
71-
highlighters = (
72-
Highlighter((d, i, j) -> d[i, j] < 1e-12; foreground = :dark_gray),
73-
Highlighter((d, i, j) -> 1e-12 <= d[i, j] < 1e-6; foreground = :green),
74-
Highlighter((d, i, j) -> 1e-6 <= d[i, j] < 1e-2; foreground = :blue),
75-
Highlighter((d, i, j) -> 1e-2 <= d[i, j] < 1e-1; foreground = :cyan),
76-
Highlighter((d, i, j) -> 1e-1 <= d[i, j] < 1; foreground = :light_cyan),
77-
Highlighter((d, i, j) -> d[i, j] == 1; foreground = :yellow),
78-
Highlighter((d, i, j) -> d[i, j] > 1; foreground = :red),
79-
)
69+
exact_jacobian_alg = CA.AutoDenseJacobian()
70+
all_approx_jacobian_algs =
71+
jacobian.alg isa CA.AutoSparseJacobian ?
72+
(; manual = jacobian.alg.sparse_jacobian_alg, auto = jacobian.alg) :
73+
(; manual = jacobian.alg)
74+
exact_blocks =
75+
CA.first_column_block_arrays(exact_jacobian_alg, Y_end, p, dtγ, t_end)
76+
all_approx_blocks = map(all_approx_jacobian_algs) do jacobian_alg
77+
CA.first_column_block_arrays(jacobian_alg, Y_end, p, dtγ, t_end)
78+
end
79+
block_rescalings = CA.first_column_rescaling_arrays(Y_end, p, t_end)
80+
8081
table_kwargs = (;
8182
columns_width = 5,
8283
crop = :none,
8384
formatters = ft_printf("%1.0e"),
84-
highlighters,
8585
row_labels = collect(scalar_names),
8686
show_header = false,
8787
tf = tf_matrix,
8888
vlines = [1],
8989
)
90+
highlighters = (
91+
Highlighter((d, i, j) -> d[i, j] < 1e-12; foreground = :dark_gray),
92+
Highlighter((d, i, j) -> 1e-12 <= d[i, j] < 1e-6; foreground = :blue),
93+
Highlighter((d, i, j) -> 1e-6 <= d[i, j] < 1e-3; foreground = :cyan),
94+
Highlighter((d, i, j) -> 1e-3 <= d[i, j] < 1e-1; foreground = :green),
95+
Highlighter((d, i, j) -> 1e-1 <= d[i, j] < 1; foreground = :yellow),
96+
Highlighter((d, i, j) -> d[i, j] == 1; foreground = :light_red),
97+
Highlighter((d, i, j) -> d[i, j] > 1; foreground = :light_magenta),
98+
)
99+
rms(block) = sqrt(mean(abs2.(block)))
90100

91-
block_rescalings = CA.first_column_rescaling_arrays(Y_end, p, t_end)
101+
@info "Debugging Jacobian in first column of final state"
92102

93-
exact_jacobian_alg = CA.AutoDenseJacobian()
94-
exact_blocks =
95-
CA.first_column_block_arrays(exact_jacobian_alg, Y_end, p, dtγ, t_end)
103+
exact_table_kwargs = (; table_kwargs..., highlighters = highlighters[1])
96104
exact_rms_values = map(block_keys) do block_key
97105
rms(exact_blocks[block_key])
98106
end
99107
exact_rescaled_rms_values = map(block_keys) do block_key
100108
rms(exact_blocks[block_key] .* block_rescalings[block_key])
101109
end
102-
@info "exact, RMS per block [inconsistent units]:"
103-
pretty_table(
104-
exact_rms_values;
105-
table_kwargs...,
106-
highlighters = highlighters[1],
107-
)
110+
@info "exact, RMS per block [unnormalized]:"
111+
pretty_table(exact_rms_values; exact_table_kwargs...)
108112
@info "exact, rescaled RMS per block [s^-1]:"
109-
pretty_table(
110-
exact_rescaled_rms_values;
111-
table_kwargs...,
112-
highlighters = highlighters[1],
113-
)
114-
println("<$('='^70)>")
113+
pretty_table(exact_rescaled_rms_values; exact_table_kwargs...)
114+
println("<$('='^70)>\n")
115115

116-
all_approx_jacobian_algs =
117-
jacobian.alg isa CA.AutoSparseJacobian ?
118-
(; manual = jacobian.alg.sparse_jacobian_alg, auto = jacobian.alg) :
119-
(; manual = jacobian.alg)
120-
all_approx_blocks = map(all_approx_jacobian_algs) do jacobian_alg
121-
CA.first_column_block_arrays(jacobian_alg, Y_end, p, dtγ, t_end)
122-
end
116+
approx_table_kwargs = (; table_kwargs..., highlighters)
123117
if jacobian.alg isa CA.AutoSparseJacobian
124118
approx_diff_rescaled_rms_values = map(block_keys) do block_key
125119
(; manual, auto) = all_approx_blocks
@@ -129,7 +123,7 @@ if (
129123
rms((manual[block_key] - auto[block_key]) .* rescaling) : FT(0)
130124
end
131125
@info "manual approx - auto approx, rescaled RMS per block [s^-1]:"
132-
pretty_table(approx_diff_rescaled_rms_values; table_kwargs...)
126+
pretty_table(approx_diff_rescaled_rms_values; approx_table_kwargs...)
133127
end
134128
for (approx_name, approx_blocks) in pairs(all_approx_blocks)
135129
approx_error_rescaled_rms_values = map(block_keys) do block_key
@@ -141,9 +135,9 @@ if (
141135
rms(approx_error .* rescaling)
142136
end
143137
@info "$approx_name approx - exact, rescaled RMS per block [s^-1]:"
144-
pretty_table(approx_error_rescaled_rms_values; table_kwargs...)
138+
pretty_table(approx_error_rescaled_rms_values; approx_table_kwargs...)
145139
end
146-
println("<$('='^70)>")
140+
println("<$('='^70)>\n")
147141
if jacobian.alg isa CA.AutoSparseJacobian
148142
approx_diff_relative_rms_values = map(block_keys) do block_key
149143
(; manual, auto) = all_approx_blocks
@@ -156,7 +150,7 @@ if (
156150
approx_diff_rms_value / rms(exact_blocks[block_key])
157151
end
158152
@info "manual approx - auto approx, relative RMS per block [unitless]:"
159-
pretty_table(approx_diff_relative_rms_values; table_kwargs...)
153+
pretty_table(approx_diff_relative_rms_values; approx_table_kwargs...)
160154
end
161155
for (approx_name, approx_blocks) in pairs(all_approx_blocks)
162156
approx_error_relative_rms_values = map(block_keys) do block_key
@@ -168,7 +162,7 @@ if (
168162
approx_error_rms_value / rms(exact_blocks[block_key])
169163
end
170164
@info "$approx_name approx - exact, relative RMS per block [unitless]:"
171-
pretty_table(approx_error_relative_rms_values; table_kwargs...)
165+
pretty_table(approx_error_relative_rms_values; approx_table_kwargs...)
172166
end
173167
end
174168

.buildkite/gpu_pipeline/pipeline.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ steps:
2424
- julia --project=.buildkite -e 'using Pkg; Pkg.precompile()'
2525
- julia --project=.buildkite -e 'using CUDA; CUDA.precompile_runtime()'
2626
- julia --project=.buildkite -e 'using Pkg; Pkg.status()'
27+
- julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))'
2728
- julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))'
2829

2930
agents:

.buildkite/longruns_gpu/pipeline.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ steps:
2525
- julia --project=.buildkite -e 'using Pkg; Pkg.precompile()'
2626
- julia --project=.buildkite -e 'using CUDA; CUDA.precompile_runtime()'
2727
- julia --project=.buildkite -e 'using Pkg; Pkg.status()'
28+
- julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))'
2829
- julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))'
2930

3031
agents:

.buildkite/pipeline.yml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,8 @@ steps:
2828

2929
- echo "--- Instantiate .buildkite"
3030
- "julia --project=.buildkite -e 'using Pkg; Pkg.instantiate(;verbose=true); Pkg.precompile(;strict=true); using CUDA; CUDA.precompile_runtime(); Pkg.status()'"
31-
- "julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name=\"ClimaCore\", rev=\"main\"))'"
31+
- julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))'
32+
- julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))'
3233

3334
agents:
3435
slurm_cpus_per_task: 8

src/prognostic_equations/implicit/auto_dense_jacobian.jl

Lines changed: 34 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -34,12 +34,12 @@ very large memory requirements at higher vertical resolutions.
3434
3535
When the number of values in each column is very large, computing the entire
3636
dense matrix in a single evaluation of `implicit_tendency!` can be too expensive
37-
to compile and run. So, the dual number components are split into batches with a
38-
maximum size of `max_simultaneous_derivatives`, and we call `implicit_tendency!`
39-
once for each batch. That is, if the batch size is ``s``, then the first batch
40-
evaluates the coefficients of ``ε₁`` through ``εₛ``, the second evaluates the
41-
coefficients of ``εₛ₊₁`` through ``ε₂ₛ``, and so on until ``εₙ``. The default
42-
batch size is 32.
37+
to compile and run. So, the dual number components are split into partitions
38+
with a maximum size of `max_simultaneous_derivatives`, and we call
39+
`implicit_tendency!` once for each partition. That is, if the partition size is
40+
``s``, then the first partition evaluates the coefficients of ``ε₁`` through
41+
``εₛ``, the second evaluates the coefficients of ``εₛ₊₁`` through ``ε₂ₛ``, and so
42+
on until ``εₙ``. The default partition size is 32.
4343
"""
4444
struct AutoDenseJacobian{S} <: JacobianAlgorithm end
4545
AutoDenseJacobian(max_simultaneous_derivatives = 32) =
@@ -53,11 +53,11 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos)
5353
DA = ClimaComms.array_type(Y)
5454

5555
FT_dual = ForwardDiff.Dual{Jacobian, FT, max_simultaneous_derivatives(alg)}
56-
Y_dual = replace_parent_type(Y, FT_dual)
57-
Yₜ_dual = similar(Y_dual)
5856
precomputed_dual =
5957
replace_parent_type(implicit_precomputed_quantities(Y, atmos), FT_dual)
6058
scratch_dual = replace_parent_type(temporary_quantities(Y, atmos), FT_dual)
59+
Y_dual = replace_parent_type(Y, FT_dual)
60+
Yₜ_dual = similar(Y_dual)
6161

6262
N = length(Fields.column(Y, 1, 1, 1))
6363
n_columns = Fields.ncolumns(Y.c)
@@ -72,10 +72,10 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos)
7272
I_matrix = reshape(I_column_matrix, N, N, 1)
7373

7474
return (;
75-
Y_dual,
76-
Yₜ_dual,
7775
precomputed_dual,
7876
scratch_dual,
77+
Y_dual,
78+
Yₜ_dual,
7979
column_matrices,
8080
column_lu_factors,
8181
column_lu_vectors,
@@ -85,7 +85,7 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos)
8585
end
8686

8787
function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, t)
88-
(; Y_dual, Yₜ_dual, precomputed_dual, scratch_dual, column_matrices) = cache
88+
(; precomputed_dual, scratch_dual, Y_dual, Yₜ_dual, column_matrices) = cache
8989
device = ClimaComms.device(Y.c)
9090
column_indices = column_index_iterator(Y)
9191
scalar_names = scalar_field_names(Y)
@@ -99,48 +99,47 @@ function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, t)
9999
for jacobian_index_to_Y_index_map_partition in
100100
ClimaComms.threadable(device, jacobian_index_to_Y_index_map_partitions)
101101

102-
# Add a unique ε to Y for each Jacobian column index in this batch. With
102+
# Add a unique ε to each value in Y that is part of this partition. With
103103
# Y_col and Yᴰ_col denoting the columns of Y and Y_dual at column_index,
104-
# set Yᴰ_col to Y_col + I[:, jacobian_indices_in_partition] * εs, where
105-
# I is the identity matrix for Y_col (i.e., the value of ∂Y_col/∂Y_col),
106-
# εs is a vector of max_simultaneous_derivatives(alg) dual number
107-
# components, and jacobian_indices_in_partition is equal to
104+
# set Yᴰ_col to Y_col + I[:, jacobian_column_indices] * εs, where I is
105+
# the identity matrix for Y_col (i.e., the value of ∂Y_col/∂Y_col), εs
106+
# is a vector of max_simultaneous_derivatives(alg) dual number
107+
# components, and jacobian_column_indices is equal to
108108
# first.(jacobian_index_to_Y_index_map_partition).
109109
Y_dual .= Y
110110
ClimaComms.@threaded device begin
111111
# On multithreaded devices, assign one thread to each combination of
112-
# spatial column index and Jacobian index in this batch.
112+
# spatial column index and Jacobian index in this partition.
113113
for column_index in column_indices,
114-
(ε_index, (_, (scalar_index, level_index))) in
114+
(diagonal_ε_index, (_, (scalar_index, level_index))) in
115115
enumerate(jacobian_index_to_Y_index_map_partition)
116116

117-
Y_partials =
118-
ntuple(==(ε_index), Val(max_simultaneous_derivatives(alg)))
119-
Y_dual_εs_value = ForwardDiff.Dual{Jacobian}(0, Y_partials)
117+
n_εs_val = Val(max_simultaneous_derivatives(alg))
118+
Y_dual_ε_coefficients = ntuple(==(diagonal_ε_index), n_εs_val)
120119
unrolled_applyat(scalar_index, scalar_names) do name
121120
field = MatrixFields.get_field(Y_dual, name)
122121
@inbounds point(field, level_index, column_index...)[] +=
123-
Y_dual_εs_value
122+
ForwardDiff.Dual{Jacobian}(0, Y_dual_ε_coefficients)
124123
end
125124
end
126125
end
127126

128-
# Compute this batch's portions of ∂p/∂Y and ∂Yₜ/∂Y.
127+
# Compute this partition of ∂p/∂Y and ∂Yₜ/∂Y.
129128
set_implicit_precomputed_quantities!(Y_dual, p_dual, t)
130129
implicit_tendency!(Yₜ_dual, Y_dual, p_dual, t)
131130

132-
# Copy this batch's portion of ∂Yₜ/∂Y into column_matrices. With Yₜ_col
133-
# and Yₜᴰ_col denoting the columns of Yₜ and Yₜ_dual at column_index, and
131+
# Copy this partition of ∂Yₜ/∂Y into column_matrices. With Yₜ_col and
132+
# Yₜᴰ_col denoting the columns of Yₜ and Yₜ_dual at column_index, and
134133
# with col_matrix denoting the matrix at the corresponding matrix_index
135134
# in column_matrices, copy the coefficients of the εs in Yₜᴰ_col into
136135
# col_matrix, where the previous steps have set Yₜᴰ_col to
137-
# Yₜ_col + (∂Yₜ_col/∂Y_col)[:, jacobian_indices_in_batch] * εs. In other
138-
# words, set col_matrix[jacobian_row_index, jacobian_column_index] to
139-
# ∂Yₜ_col[jacobian_row_index]/∂Y_col[jacobian_column_index], obtaining
140-
# this derivative from the coefficient of εs[ε_index] in
141-
# Yₜᴰ_col[jacobian_row_index], where ε_index is the index of
142-
# jacobian_column_index in jacobian_indices_in_batch. After all batches
143-
# are processed, col_matrix contains the full Jacobian ∂Yₜ_col/∂Y_col.
136+
# Yₜ_col + (∂Yₜ_col/∂Y_col)[:, jacobian_column_indices] * εs. In
137+
# other words, set col_matrix[jacobian_row_index, jacobian_column_index]
138+
# to ∂Yₜ_col[jacobian_row_index]/∂Y_col[jacobian_column_index],
139+
# obtaining this derivative from the coefficient of
140+
# εs[jacobian_column_ε_index] in Yₜᴰ_col[jacobian_row_index], where
141+
# jacobian_column_ε_index is the index of jacobian_column_index in
142+
# jacobian_column_indices.
144143
ClimaComms.@threaded device begin
145144
# On multithreaded devices, assign one thread to each combination of
146145
# spatial column index and scalar level index.
@@ -153,16 +152,16 @@ function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, t)
153152
field = MatrixFields.get_field(Yₜ_dual, name)
154153
@inbounds point(field, level_index, column_index...)[]
155154
end
156-
Yₜ_partials = ForwardDiff.partials(Yₜ_dual_value)
157-
for (ε_index, (jacobian_column_index, _)) in
155+
Yₜ_dual_ε_coefficients = ForwardDiff.partials(Yₜ_dual_value)
156+
for (jacobian_column_ε_index, (jacobian_column_index, _)) in
158157
enumerate(jacobian_index_to_Y_index_map_partition)
159158
cartesian_index = (
160159
jacobian_row_index,
161160
jacobian_column_index,
162161
matrix_index,
163162
)
164163
@inbounds column_matrices[cartesian_index...] =
165-
Yₜ_partials[ε_index]
164+
Yₜ_dual_ε_coefficients[jacobian_column_ε_index]
166165
end
167166
end
168167
end

0 commit comments

Comments
 (0)