Skip to content

Commit 2923b52

Browse files
committed
Try using specialized padding
1 parent 3cd387f commit 2923b52

File tree

6 files changed

+114
-52
lines changed

6 files changed

+114
-52
lines changed

.buildkite/ci_driver.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ if (
4949
config.parsed_args["debug_approximate_jacobian"] &&
5050
!config.parsed_args["use_dense_jacobian"]
5151
)
52+
@info "Debugging Jacobian in first column of final state"
53+
5254
Y_end = integrator.u
5355
t_end = integrator.t
5456
(; p, dt) = integrator
@@ -106,8 +108,6 @@ if (
106108
any(!iszero, diag(block, band_index))
107109
end
108110

109-
@info "Debugging Jacobian in first column of final state"
110-
111111
bandwidth_error_values = map(block_keys) do block_key
112112
approx_blocks = first(all_approx_blocks)
113113
approx_bandwidth =

config/default_configs/default_config.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,6 @@ use_dense_jacobian:
9292
use_auto_jacobian:
9393
help: "Whether to populate the entries of the sparse Jacobian matrix using forward-mode automatic differentiation with sparse matrix coloring (only used when `use_dense_jacobian` is `false`) [`true`, `false` (default)]"
9494
value: true # TODO: Change this to false
95-
auto_jacobian_padding_bands:
96-
help: "Minimum number of bands to add in every block of the sparse Jacobian matrix, eliminating errors from Jacobian entries that lie outside of the default sparsity pattern (only used when `use_auto_jacobian` is `true`; default is `0`)"
97-
value: 2 # TODO: Change this to 0
9895
debug_approximate_jacobian:
9996
help: "Whether to compare approximations of the Jacobian matrix in the first column of the final state against the exact Jacobian [`true`, `false` (default)]"
10097
value: true # TODO: Change this to false

src/cache/temporary_quantities.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,20 @@ using ClimaCore.Utilities: half
55
# but cannot be computed on the fly. Unlike the precomputed quantities, these
66
# can be modified at any point, so they should never be assumed to be unchanged
77
# between function calls.
8+
function implicit_temporary_quantities(Y, atmos)
9+
center_space, face_space = axes(Y.c), axes(Y.f)
10+
11+
FT = Spaces.undertype(center_space)
12+
uvw_vec = UVW(FT(0), FT(0), FT(0))
13+
return (;
14+
ᶠtemp_scalar = Fields.Field(FT, face_space), # ᶠρaK_h, ᶠρaK_u
15+
ᶜtemp_scalar = Fields.Field(FT, center_space), # ᶜρχₜ_diffusion, ᶜa_scalar
16+
ᶜtemp_scalar_2 = Fields.Field(FT, center_space), # ᶜKᵥʲ, ᶜK_h_scaled
17+
ᶜtemp_C3 = Fields.Field(C3{FT}, center_space), # ᶜu₃ʲ
18+
ᶠtemp_CT3 = Fields.Field(CT3{FT}, face_space), # ᶠuₕ³, ᶠu³_diff
19+
ᶠtemp_UVWxUVW = Fields.Field(typeof(uvw_vec * uvw_vec'), face_space), # ᶠstrain_rate
20+
)
21+
end
822
function temporary_quantities(Y, atmos)
923
center_space, face_space = axes(Y.c), axes(Y.f)
1024

src/prognostic_equations/implicit/auto_dense_jacobian.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos)
5353
DA = ClimaComms.array_type(Y)
5454

5555
precomputed = implicit_precomputed_quantities(Y, atmos)
56-
scratch = temporary_quantities(Y, atmos)
56+
scratch = implicit_temporary_quantities(Y, atmos)
5757

5858
FT_dual = ForwardDiff.Dual{Jacobian, FT, max_simultaneous_derivatives(alg)}
5959
precomputed_dual = replace_parent_eltype(precomputed, FT_dual)
@@ -112,11 +112,11 @@ function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, t)
112112
ClimaComms.@threaded device begin
113113
# On multithreaded devices, use one thread for each dual number.
114114
for column_index in column_indices,
115-
(diagonal_ε_index, (_, (scalar_index, level_index))) in
115+
(diagonal_entry_ε_index, (_, (scalar_index, level_index))) in
116116
enumerate(jacobian_index_to_Y_index_map_partition)
117117

118118
n_εs_val = Val(max_simultaneous_derivatives(alg))
119-
ε_coefficients = ntuple(==(diagonal_ε_index), n_εs_val)
119+
ε_coefficients = ntuple(==(diagonal_entry_ε_index), n_εs_val)
120120
unrolled_applyat(scalar_index, scalar_names) do name
121121
field = MatrixFields.get_field(Y_dual, name)
122122
@inbounds point(field, level_index, column_index...)[] +=

src/prognostic_equations/implicit/auto_sparse_jacobian.jl

Lines changed: 92 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import SparseMatrixColorings
22

33
"""
4-
AutoSparseJacobian(sparse_jacobian_alg, max_padding_bands_per_block)
4+
AutoSparseJacobian(sparse_jacobian_alg, [max_padding_bands_per_block])
55
66
A [`JacobianAlgorithm`](@ref) that computes the Jacobian using forward-mode
77
automatic differentiation, assuming that the Jacobian's sparsity structure is
@@ -11,10 +11,12 @@ introduce errors to the updated entries.
1111
1212
TODO: Add a short explanation of how this algorithm works.
1313
"""
14-
struct AutoSparseJacobian{A <: SparseJacobian} <: SparseJacobian
14+
struct AutoSparseJacobian{A <: SparseJacobian, M} <: SparseJacobian
1515
sparse_jacobian_alg::A
16-
max_padding_bands_per_block::Int
16+
max_padding_bands_per_block::M
1717
end
18+
AutoSparseJacobian(sparse_jacobian_alg) =
19+
AutoSparseJacobian(sparse_jacobian_alg, nothing)
1820

1921
function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
2022
(; sparse_jacobian_alg, max_padding_bands_per_block) = alg
@@ -28,7 +30,7 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
2830
scalar_names = scalar_field_names(Y) # iterator of names corresponding to f
2931

3032
precomputed = implicit_precomputed_quantities(Y, atmos)
31-
scratch = temporary_quantities(Y, atmos)
33+
scratch = implicit_temporary_quantities(Y, atmos)
3234

3335
# Allocate ∂R/∂Y and its corresponding linear solver.
3436
# TODO: Add FieldNameTree(Y) to the matrix in FieldMatrixWithSolver. The
@@ -85,7 +87,9 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
8587
sparsity_mask = Array{Bool}(undef, N, N)
8688
sparsity_mask .= false
8789
padded_sparsity_mask = copy(sparsity_mask)
88-
for block_row_name in scalar_names, block_column_name in scalar_names
90+
for block_key in Iterators.product(scalar_names, scalar_names)
91+
(block_row_name, block_column_name) = block_key
92+
8993
# Get a view of this block's sparsity masks with its row/column indices.
9094
block_jacobian_row_index_to_Yₜ_index_map =
9195
Iterators.filter(enumerate(field_vector_indices)) do index_pair
@@ -109,23 +113,70 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
109113
# blocks corresponding to index ranges whose length is -1 (centered
110114
# around 0 for square blocks and around ±1/2 for non-square blocks).
111115
(n_rows_in_block, n_columns_in_block) = size(block_sparsity_mask)
112-
if (block_row_name, block_column_name) in keys(autodiff_matrix)
113-
matrix_field = autodiff_matrix[block_row_name, block_column_name]
116+
if block_key in keys(autodiff_matrix)
114117
(_, _, lower_band, upper_band) =
115-
MatrixFields.band_matrix_info(matrix_field)
118+
MatrixFields.band_matrix_info(autodiff_matrix[block_key])
116119
else
117120
(lower_band, upper_band) =
118121
n_rows_in_block == n_columns_in_block ? (1 / 2, -1 / 2) :
119122
(n_rows_in_block < n_columns_in_block ? (1, 0) : (0, -1))
120123
end
121124

122-
# Expand the range of band indices by up to max_padding_bands_per_block.
123-
n_padding_bands_per_side = max_padding_bands_per_block / 2
124-
lower_padding_band = ceil(Int, lower_band - n_padding_bands_per_side)
125-
upper_padding_band = floor(Int, upper_band + n_padding_bands_per_side)
125+
# Symmetrically expand the range of band indices, with the number of
126+
# new bands either limited by max_padding_bands_per_block, or hardcoded
127+
# for each block when max_padding_bands_per_block is not specified.
128+
max_padding_bands = if !isnothing(max_padding_bands_per_block)
129+
max_padding_bands_per_block
130+
elseif (
131+
!(block_key in keys(autodiff_matrix)) &&
132+
(block_row_name, block_row_name) in keys(autodiff_matrix)
133+
)
134+
if block_key in (
135+
(@name(c.uₕ.components.data.:(1)), @name(c.ρ)),
136+
(@name(c.uₕ.components.data.:(2)), @name(c.ρ)),
137+
)
138+
# ∂ᶜuₕₜ/∂ᶜρ and ∂ᶜuₕₜ/∂ᶜuₕ can have similar unnormalized entries
139+
3
140+
elseif block_key in (
141+
(@name(c.ρe_tot), @name(c.ρ)),
142+
(@name(c.ρe_tot), @name(c.ρq_liq)),
143+
(@name(c.ρe_tot), @name(c.ρq_ice)),
144+
(@name(c.ρe_tot), @name(c.ρq_rai)),
145+
(@name(c.ρe_tot), @name(c.ρq_sno)),
146+
(@name(c.ρe_tot), @name(c.sgsʲs.:(1).q_tot)),
147+
)
148+
# ∂ᶜρe_totₜ/∂ᶜχ and ∂ᶜρe_totₜ/∂ᶜρe_tot can have similar
149+
# unnormalized entries for several different variables ᶜχ
150+
3
151+
elseif (
152+
block_row_name == @name(f.sgsʲs.:(1).u₃.components.data.:(1)) &&
153+
block_column_name in (
154+
@name(c.uₕ.components.data.:(1)),
155+
@name(c.uₕ.components.data.:(2)),
156+
)
157+
)
158+
# ∂ᶠu₃ʲₜ/∂ᶜuₕ and ∂ᶠu₃ʲₜ/∂ᶠu₃ʲ can have similar unnormalized
159+
# entries (and ∂ᶠu₃ʲₜ/∂ᶜmseʲ can also have similar entries)
160+
2
161+
else
162+
0
163+
end
164+
else
165+
0
166+
end
167+
padded_lower_band = ceil(Int, lower_band - max_padding_bands / 2)
168+
padded_upper_band = floor(Int, upper_band + max_padding_bands / 2)
169+
170+
if verbose
171+
n_padding_bands =
172+
length(padded_lower_band:padded_upper_band) -
173+
length(lower_band:upper_band)
174+
n_padding_bands > 0 &&
175+
@info "Adding $n_padding_bands padding bands for $block_key"
176+
end
126177

127178
# Update the sparsity mask entries corresponding to bands in this block.
128-
for band in lower_padding_band:upper_padding_band
179+
for band in padded_lower_band:padded_upper_band
129180
is_not_padding_band = band in lower_band:upper_band
130181
level_index_min = band < 0 ? 1 - band : 1
131182
level_index_max =
@@ -152,12 +203,12 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
152203
n_colors = SparseMatrixColorings.ncolors(best_jacobian_column_coloring)
153204

154205
# When running on GPU devices, divide n_colors into partitions that are each
155-
# guaranteed to fit in 70% of the memory that is currently free.
206+
# guaranteed to fit in 90% of the memory that is currently free.
156207
n_partitions = if device isa ClimaComms.AbstractCPUDevice
157208
1
158209
else
159210
free_memory = ClimaComms.free_memory(device)
160-
max_memory = free_memory * 7 ÷ 10
211+
max_memory = free_memory * 9 ÷ 10
161212
memory_for_I_matrix = n_colors * parent_memory(Y)
162213
memory_per_ε =
163214
(parent_memory(precomputed) + parent_memory(scratch)) +
@@ -191,7 +242,7 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
191242
# FieldVectors and cached fields with dual numbers instead of real numbers,
192243
# with dual numbers using the tag "Jacobian" for specialized dispatch
193244
# TODO: Refactor FieldVector broadcasting so that performance does not
194-
# deteriorate if we only store one column of each I_matrix_partition_εs.
245+
# deteriorate if we only store one column of each partition_εs.
195246
FT_dual = ForwardDiff.Dual{Jacobian, FT, n_εs}
196247
precomputed_dual = replace_parent_eltype(precomputed, FT_dual)
197248
scratch_dual = replace_parent_eltype(scratch, FT_dual)
@@ -219,29 +270,34 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
219270
Y_index_to_diagonal_color_map =
220271
zip(field_vector_indices, jacobian_column_colors)
221272

222-
# Set the dual numbers in each FieldVector I_matrix_partition_εs so that the
223-
# ε components correspond to partitions of the N × N identity matrix ∂Y/∂Y.
224-
# Specifically, every column of I_matrix_partition_εs is a vector of N dual
225-
# numbers, each of which is stored as a combination of a value and n_εs
226-
# partial derivatives. The ε components can be interpreted as representing
227-
# N × n_εs slices of a sparse N × n_colors representation of ∂Y/∂Y.
228-
# Y_index_to_diagonal_color_map is converted to a DA for GPU compatibility.
273+
# Set the dual numbers in each FieldVector partition_εs so that the ε
274+
# components correspond to partitions of the N × N identity matrix ∂Y/∂Y.
275+
# Specifically, every column of partition_εs is a vector of N dual numbers,
276+
# each of which is stored as a combination of a value and n_εs partial
277+
# derivatives. The ε components can be interpreted as representing N × n_εs
278+
# slices of a sparse N × n_colors representation of ∂Y/∂Y. Convert n_εs to
279+
# a Val and Y_index_to_diagonal_color_map to a DA for GPU compatibility, and
280+
# drop spatial information from every Field to ensure that this kernel stays
281+
# below the GPU parameter memory limit.
282+
n_εs_val = Val(n_εs)
283+
I_matrix_partitions_data = unrolled_map(I_matrix_partitions) do partition_εs
284+
unrolled_map(Fields.field_values, Fields._values(partition_εs))
285+
end
229286
ClimaComms.@threaded device begin
230287
# On multithreaded devices, use one thread for each dual number.
231-
for (partition, I_matrix_partition_εs) in
232-
enumerate(I_matrix_partitions),
288+
for (partition_index, partition_εs_data) in
289+
enumerate(I_matrix_partitions_data),
233290
column_index in column_indices,
234291
index_pair in DA(collect(Y_index_to_diagonal_color_map))
235292

236293
((scalar_index, level_index), diagonal_entry_color) = index_pair
237-
ε_offset = (partition - 1) * n_εs
238-
diagonal_ε_index =
294+
ε_offset = (partition_index - 1) * n_εs
295+
diagonal_entry_ε_index =
239296
ε_offset < diagonal_entry_color <= ε_offset + n_εs ?
240297
diagonal_entry_color - ε_offset : 0
241-
n_εs_val = Val(ForwardDiff.npartials(eltype(I_matrix_partition_εs)))
242-
ε_coefficients = ntuple(==(diagonal_ε_index), n_εs_val)
298+
ε_coefficients = ntuple(==(diagonal_entry_ε_index), n_εs_val)
243299
unrolled_applyat(scalar_index, scalar_names) do name
244-
field = MatrixFields.get_field(I_matrix_partition_εs, name)
300+
field = MatrixFields.get_field(partition_εs_data, name)
245301
@inbounds point(field, level_index, column_index...)[] =
246302
ForwardDiff.Dual{Jacobian}(0, ε_coefficients)
247303
end
@@ -323,15 +379,15 @@ function update_jacobian!(::AutoSparseJacobian, cache, Y, p, dtγ, t)
323379
scalar_names = scalar_field_names(Y)
324380
p_dual = append_to_atmos_cache(p, precomputed_dual, scratch_dual)
325381

326-
for (partition, I_matrix_partition_εs) in enumerate(I_matrix_partitions)
382+
for (partition_index, partition_εs) in enumerate(I_matrix_partitions)
327383
# Set the εs in Y_dual to represent a partition of the identity matrix.
328-
Y_dual .= Y .+ I_matrix_partition_εs
384+
Y_dual .= Y .+ partition_εs
329385

330-
# Compute ∂p/∂Y * I_matrix_partition and ∂Yₜ/∂Y * I_matrix_partition.
386+
# Compute ∂p/∂Y * partition_εs and ∂Yₜ/∂Y * partition_εs.
331387
set_implicit_precomputed_quantities!(Y_dual, p_dual, t)
332388
implicit_tendency!(Yₜ_dual, Y_dual, p_dual, t)
333389

334-
# Move the entries of ∂Yₜ/∂Y * I_matrix_partition from Yₜ_dual into the
390+
# Move the entries of ∂Yₜ/∂Y * partition_εs from Yₜ_dual into the
335391
# blocks of autodiff_matrix. Drop spatial information from every Field
336392
# to ensure that this kernel stays below the GPU parameter memory limit.
337393
Yₜ_dual_data =
@@ -353,7 +409,7 @@ function update_jacobian!(::AutoSparseJacobian, cache, Y, p, dtγ, t)
353409
end
354410
ε_coefficients = ForwardDiff.partials(dual_number)
355411
n_εs = length(ε_coefficients)
356-
ε_offset = (partition - 1) * n_εs
412+
ε_offset = (partition_index - 1) * n_εs
357413
unrolled_applyat(block_index, matrix_fields_data) do block_data
358414
@inbounds entries_data =
359415
point(block_data, level_index, column_index...).entries
@@ -365,7 +421,7 @@ function update_jacobian!(::AutoSparseJacobian, cache, Y, p, dtγ, t)
365421
ε_offset < entry_color <= ε_offset + n_εs ?
366422
(@inbounds ε_coefficients[entry_color - ε_offset]) :
367423
entry
368-
end
424+
end # TODO: Why does unrolled_map break GPU compilation?
369425
end
370426
end
371427
end

src/solver/type_getters.jl

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -466,14 +466,9 @@ function get_jacobian(ode_algo, Y, atmos, parsed_args)
466466
DerivativeFlag(atmos.sgs_nh_pressure_mode),
467467
parsed_args["approximate_linear_solve_iters"],
468468
)
469-
if parsed_args["use_auto_jacobian"]
470-
AutoSparseJacobian(
471-
manual_jacobian_algorithm,
472-
parsed_args["auto_jacobian_padding_bands"],
473-
)
474-
else
475-
manual_jacobian_algorithm
476-
end
469+
parsed_args["use_auto_jacobian"] ?
470+
AutoSparseJacobian(manual_jacobian_algorithm) :
471+
manual_jacobian_algorithm
477472
end
478473
@info "Jacobian algorithm: $(summary_string(jacobian_algorithm))"
479474
return Jacobian(jacobian_algorithm, Y, atmos)

0 commit comments

Comments
 (0)