1
1
import SparseMatrixColorings
2
2
3
3
"""
4
- AutoSparseJacobian(sparse_jacobian_alg, max_padding_bands_per_block)
4
+ AutoSparseJacobian(sparse_jacobian_alg, [ max_padding_bands_per_block] )
5
5
6
6
A [`JacobianAlgorithm`](@ref) that computes the Jacobian using forward-mode
7
7
automatic differentiation, assuming that the Jacobian's sparsity structure is
@@ -11,10 +11,12 @@ introduce errors to the updated entries.
11
11
12
12
TODO: Add a short explanation of how this algorithm works.
13
13
"""
14
- struct AutoSparseJacobian{A <: SparseJacobian } <: SparseJacobian
14
+ struct AutoSparseJacobian{A <: SparseJacobian , M } <: SparseJacobian
15
15
sparse_jacobian_alg:: A
16
- max_padding_bands_per_block:: Int
16
+ max_padding_bands_per_block:: M
17
17
end
18
+ AutoSparseJacobian (sparse_jacobian_alg) =
19
+ AutoSparseJacobian (sparse_jacobian_alg, nothing )
18
20
19
21
function jacobian_cache (alg:: AutoSparseJacobian , Y, atmos; verbose = true )
20
22
(; sparse_jacobian_alg, max_padding_bands_per_block) = alg
@@ -28,7 +30,7 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
28
30
scalar_names = scalar_field_names (Y) # iterator of names corresponding to f
29
31
30
32
precomputed = implicit_precomputed_quantities (Y, atmos)
31
- scratch = temporary_quantities (Y, atmos)
33
+ scratch = implicit_temporary_quantities (Y, atmos)
32
34
33
35
# Allocate ∂R/∂Y and its corresponding linear solver.
34
36
# TODO : Add FieldNameTree(Y) to the matrix in FieldMatrixWithSolver. The
@@ -85,7 +87,9 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
85
87
sparsity_mask = Array {Bool} (undef, N, N)
86
88
sparsity_mask .= false
87
89
padded_sparsity_mask = copy (sparsity_mask)
88
- for block_row_name in scalar_names, block_column_name in scalar_names
90
+ for block_key in Iterators. product (scalar_names, scalar_names)
91
+ (block_row_name, block_column_name) = block_key
92
+
89
93
# Get a view of this block's sparsity masks with its row/column indices.
90
94
block_jacobian_row_index_to_Yₜ_index_map =
91
95
Iterators. filter (enumerate (field_vector_indices)) do index_pair
@@ -109,23 +113,70 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
109
113
# blocks corresponding to index ranges whose length is -1 (centered
110
114
# around 0 for square blocks and around ±1/2 for non-square blocks).
111
115
(n_rows_in_block, n_columns_in_block) = size (block_sparsity_mask)
112
- if (block_row_name, block_column_name) in keys (autodiff_matrix)
113
- matrix_field = autodiff_matrix[block_row_name, block_column_name]
116
+ if block_key in keys (autodiff_matrix)
114
117
(_, _, lower_band, upper_band) =
115
- MatrixFields. band_matrix_info (matrix_field )
118
+ MatrixFields. band_matrix_info (autodiff_matrix[block_key] )
116
119
else
117
120
(lower_band, upper_band) =
118
121
n_rows_in_block == n_columns_in_block ? (1 / 2 , - 1 / 2 ) :
119
122
(n_rows_in_block < n_columns_in_block ? (1 , 0 ) : (0 , - 1 ))
120
123
end
121
124
122
- # Expand the range of band indices by up to max_padding_bands_per_block.
123
- n_padding_bands_per_side = max_padding_bands_per_block / 2
124
- lower_padding_band = ceil (Int, lower_band - n_padding_bands_per_side)
125
- upper_padding_band = floor (Int, upper_band + n_padding_bands_per_side)
125
+ # Symmetrically expand the range of band indices, with the number of
126
+ # new bands either limited by max_padding_bands_per_block, or hardcoded
127
+ # for each block when max_padding_bands_per_block is not specified.
128
+ max_padding_bands = if ! isnothing (max_padding_bands_per_block)
129
+ max_padding_bands_per_block
130
+ elseif (
131
+ ! (block_key in keys (autodiff_matrix)) &&
132
+ (block_row_name, block_row_name) in keys (autodiff_matrix)
133
+ )
134
+ if block_key in (
135
+ (@name (c. uₕ. components. data.:(1 )), @name (c. ρ)),
136
+ (@name (c. uₕ. components. data.:(2 )), @name (c. ρ)),
137
+ )
138
+ # ∂ᶜuₕₜ/∂ᶜρ and ∂ᶜuₕₜ/∂ᶜuₕ can have similar unnormalized entries
139
+ 3
140
+ elseif block_key in (
141
+ (@name (c. ρe_tot), @name (c. ρ)),
142
+ (@name (c. ρe_tot), @name (c. ρq_liq)),
143
+ (@name (c. ρe_tot), @name (c. ρq_ice)),
144
+ (@name (c. ρe_tot), @name (c. ρq_rai)),
145
+ (@name (c. ρe_tot), @name (c. ρq_sno)),
146
+ (@name (c. ρe_tot), @name (c. sgsʲs.:(1 ). q_tot)),
147
+ )
148
+ # ∂ᶜρe_totₜ/∂ᶜχ and ∂ᶜρe_totₜ/∂ᶜρe_tot can have similar
149
+ # unnormalized entries for several different variables ᶜχ
150
+ 3
151
+ elseif (
152
+ block_row_name == @name (f. sgsʲs.:(1 ). u₃. components. data.:(1 )) &&
153
+ block_column_name in (
154
+ @name (c. uₕ. components. data.:(1 )),
155
+ @name (c. uₕ. components. data.:(2 )),
156
+ )
157
+ )
158
+ # ∂ᶠu₃ʲₜ/∂ᶜuₕ and ∂ᶠu₃ʲₜ/∂ᶠu₃ʲ can have similar unnormalized
159
+ # entries (and ∂ᶠu₃ʲₜ/∂ᶜmseʲ can also have similar entries)
160
+ 2
161
+ else
162
+ 0
163
+ end
164
+ else
165
+ 0
166
+ end
167
+ padded_lower_band = ceil (Int, lower_band - max_padding_bands / 2 )
168
+ padded_upper_band = floor (Int, upper_band + max_padding_bands / 2 )
169
+
170
+ if verbose
171
+ n_padding_bands =
172
+ length (padded_lower_band: padded_upper_band) -
173
+ length (lower_band: upper_band)
174
+ n_padding_bands > 0 &&
175
+ @info " Adding $n_padding_bands padding bands for $block_key "
176
+ end
126
177
127
178
# Update the sparsity mask entries corresponding to bands in this block.
128
- for band in lower_padding_band : upper_padding_band
179
+ for band in padded_lower_band : padded_upper_band
129
180
is_not_padding_band = band in lower_band: upper_band
130
181
level_index_min = band < 0 ? 1 - band : 1
131
182
level_index_max =
@@ -152,12 +203,12 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
152
203
n_colors = SparseMatrixColorings. ncolors (best_jacobian_column_coloring)
153
204
154
205
# When running on GPU devices, divide n_colors into partitions that are each
155
- # guaranteed to fit in 70 % of the memory that is currently free.
206
+ # guaranteed to fit in 90 % of the memory that is currently free.
156
207
n_partitions = if device isa ClimaComms. AbstractCPUDevice
157
208
1
158
209
else
159
210
free_memory = ClimaComms. free_memory (device)
160
- max_memory = free_memory * 7 ÷ 10
211
+ max_memory = free_memory * 9 ÷ 10
161
212
memory_for_I_matrix = n_colors * parent_memory (Y)
162
213
memory_per_ε =
163
214
(parent_memory (precomputed) + parent_memory (scratch)) +
@@ -191,7 +242,7 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
191
242
# FieldVectors and cached fields with dual numbers instead of real numbers,
192
243
# with dual numbers using the tag "Jacobian" for specialized dispatch
193
244
# TODO : Refactor FieldVector broadcasting so that performance does not
194
- # deteriorate if we only store one column of each I_matrix_partition_εs .
245
+ # deteriorate if we only store one column of each partition_εs .
195
246
FT_dual = ForwardDiff. Dual{Jacobian, FT, n_εs}
196
247
precomputed_dual = replace_parent_eltype (precomputed, FT_dual)
197
248
scratch_dual = replace_parent_eltype (scratch, FT_dual)
@@ -219,29 +270,34 @@ function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true)
219
270
Y_index_to_diagonal_color_map =
220
271
zip (field_vector_indices, jacobian_column_colors)
221
272
222
- # Set the dual numbers in each FieldVector I_matrix_partition_εs so that the
223
- # ε components correspond to partitions of the N × N identity matrix ∂Y/∂Y.
224
- # Specifically, every column of I_matrix_partition_εs is a vector of N dual
225
- # numbers, each of which is stored as a combination of a value and n_εs
226
- # partial derivatives. The ε components can be interpreted as representing
227
- # N × n_εs slices of a sparse N × n_colors representation of ∂Y/∂Y.
228
- # Y_index_to_diagonal_color_map is converted to a DA for GPU compatibility.
273
+ # Set the dual numbers in each FieldVector partition_εs so that the ε
274
+ # components correspond to partitions of the N × N identity matrix ∂Y/∂Y.
275
+ # Specifically, every column of partition_εs is a vector of N dual numbers,
276
+ # each of which is stored as a combination of a value and n_εs partial
277
+ # derivatives. The ε components can be interpreted as representing N × n_εs
278
+ # slices of a sparse N × n_colors representation of ∂Y/∂Y. Convert n_εs to
279
+ # a Val and Y_index_to_diagonal_color_map to a DA for GPU compatibility, and
280
+ # drop spatial information from every Field to ensure that this kernel stays
281
+ # below the GPU parameter memory limit.
282
+ n_εs_val = Val (n_εs)
283
+ I_matrix_partitions_data = unrolled_map (I_matrix_partitions) do partition_εs
284
+ unrolled_map (Fields. field_values, Fields. _values (partition_εs))
285
+ end
229
286
ClimaComms. @threaded device begin
230
287
# On multithreaded devices, use one thread for each dual number.
231
- for (partition, I_matrix_partition_εs ) in
232
- enumerate (I_matrix_partitions ),
288
+ for (partition_index, partition_εs_data ) in
289
+ enumerate (I_matrix_partitions_data ),
233
290
column_index in column_indices,
234
291
index_pair in DA (collect (Y_index_to_diagonal_color_map))
235
292
236
293
((scalar_index, level_index), diagonal_entry_color) = index_pair
237
- ε_offset = (partition - 1 ) * n_εs
238
- diagonal_ε_index =
294
+ ε_offset = (partition_index - 1 ) * n_εs
295
+ diagonal_entry_ε_index =
239
296
ε_offset < diagonal_entry_color <= ε_offset + n_εs ?
240
297
diagonal_entry_color - ε_offset : 0
241
- n_εs_val = Val (ForwardDiff. npartials (eltype (I_matrix_partition_εs)))
242
- ε_coefficients = ntuple (== (diagonal_ε_index), n_εs_val)
298
+ ε_coefficients = ntuple (== (diagonal_entry_ε_index), n_εs_val)
243
299
unrolled_applyat (scalar_index, scalar_names) do name
244
- field = MatrixFields. get_field (I_matrix_partition_εs , name)
300
+ field = MatrixFields. get_field (partition_εs_data , name)
245
301
@inbounds point (field, level_index, column_index... )[] =
246
302
ForwardDiff. Dual {Jacobian} (0 , ε_coefficients)
247
303
end
@@ -323,15 +379,15 @@ function update_jacobian!(::AutoSparseJacobian, cache, Y, p, dtγ, t)
323
379
scalar_names = scalar_field_names (Y)
324
380
p_dual = append_to_atmos_cache (p, precomputed_dual, scratch_dual)
325
381
326
- for (partition, I_matrix_partition_εs ) in enumerate (I_matrix_partitions)
382
+ for (partition_index, partition_εs ) in enumerate (I_matrix_partitions)
327
383
# Set the εs in Y_dual to represent a partition of the identity matrix.
328
- Y_dual .= Y .+ I_matrix_partition_εs
384
+ Y_dual .= Y .+ partition_εs
329
385
330
- # Compute ∂p/∂Y * I_matrix_partition and ∂Yₜ/∂Y * I_matrix_partition .
386
+ # Compute ∂p/∂Y * partition_εs and ∂Yₜ/∂Y * partition_εs .
331
387
set_implicit_precomputed_quantities! (Y_dual, p_dual, t)
332
388
implicit_tendency! (Yₜ_dual, Y_dual, p_dual, t)
333
389
334
- # Move the entries of ∂Yₜ/∂Y * I_matrix_partition from Yₜ_dual into the
390
+ # Move the entries of ∂Yₜ/∂Y * partition_εs from Yₜ_dual into the
335
391
# blocks of autodiff_matrix. Drop spatial information from every Field
336
392
# to ensure that this kernel stays below the GPU parameter memory limit.
337
393
Yₜ_dual_data =
@@ -353,7 +409,7 @@ function update_jacobian!(::AutoSparseJacobian, cache, Y, p, dtγ, t)
353
409
end
354
410
ε_coefficients = ForwardDiff. partials (dual_number)
355
411
n_εs = length (ε_coefficients)
356
- ε_offset = (partition - 1 ) * n_εs
412
+ ε_offset = (partition_index - 1 ) * n_εs
357
413
unrolled_applyat (block_index, matrix_fields_data) do block_data
358
414
@inbounds entries_data =
359
415
point (block_data, level_index, column_index... ). entries
@@ -365,7 +421,7 @@ function update_jacobian!(::AutoSparseJacobian, cache, Y, p, dtγ, t)
365
421
ε_offset < entry_color <= ε_offset + n_εs ?
366
422
(@inbounds ε_coefficients[entry_color - ε_offset]) :
367
423
entry
368
- end
424
+ end # TODO : Why does unrolled_map break GPU compilation?
369
425
end
370
426
end
371
427
end
0 commit comments