Add batching to AutoSparseJacobian

dennisYatunin · dennisYatunin · commit 65ba13490361 · 2025-07-22T22:08:09.000-07:00
diff --git a/.buildkite/Manifest-v1.11.toml b/.buildkite/Manifest-v1.11.toml
@@ -370,7 +370,9 @@ version = "0.31.0"
 
 [[deps.ClimaComms]]
 deps = ["Adapt", "Logging", "LoggingExtras"]
-git-tree-sha1 = "75b9d1a3b4e3efa2cbbae2eb7b52f14c0b38ccf0"
+git-tree-sha1 = "d341c3fc97a98dbecd6b635f34638b2d9771f94e"
+repo-rev = "dy/memory_api"
+repo-url = "https://github.com/CliMA/ClimaComms.jl.git"
 uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
 version = "0.6.8"
 weakdeps = ["CUDA", "MPI"]
diff --git a/.buildkite/ci_driver.jl b/.buildkite/ci_driver.jl
@@ -66,60 +66,54 @@ if (
     scalar_names = CA.scalar_field_names(Y_end)
     block_keys = Iterators.product(scalar_names, scalar_names)
 
-    @info "Debugging Jacobian in first column of final state"
-    rms(block) = sqrt(mean(abs2.(block)))
-    highlighters = (
-        Highlighter((d, i, j) -> d[i, j] < 1e-12; foreground = :dark_gray),
-        Highlighter((d, i, j) -> 1e-12 <= d[i, j] < 1e-6; foreground = :green),
-        Highlighter((d, i, j) -> 1e-6 <= d[i, j] < 1e-2; foreground = :blue),
-        Highlighter((d, i, j) -> 1e-2 <= d[i, j] < 1e-1; foreground = :cyan),
-        Highlighter((d, i, j) -> 1e-1 <= d[i, j] < 1; foreground = :light_cyan),
-        Highlighter((d, i, j) -> d[i, j] == 1; foreground = :yellow),
-        Highlighter((d, i, j) -> d[i, j] > 1; foreground = :red),
-    )
+    exact_jacobian_alg = CA.AutoDenseJacobian()
+    all_approx_jacobian_algs =
+        jacobian.alg isa CA.AutoSparseJacobian ?
+        (; manual = jacobian.alg.sparse_jacobian_alg, auto = jacobian.alg) :
+        (; manual = jacobian.alg)
+    exact_blocks =
+        CA.first_column_block_arrays(exact_jacobian_alg, Y_end, p, dtγ, t_end)
+    all_approx_blocks = map(all_approx_jacobian_algs) do jacobian_alg
+        CA.first_column_block_arrays(jacobian_alg, Y_end, p, dtγ, t_end)
+    end
+    block_rescalings = CA.first_column_rescaling_arrays(Y_end, p, t_end)
+
     table_kwargs = (;
         columns_width = 5,
         crop = :none,
         formatters = ft_printf("%1.0e"),
-        highlighters,
         row_labels = collect(scalar_names),
         show_header = false,
         tf = tf_matrix,
         vlines = [1],
     )
+    highlighters = (
+        Highlighter((d, i, j) -> d[i, j] < 1e-12; foreground = :dark_gray),
+        Highlighter((d, i, j) -> 1e-12 <= d[i, j] < 1e-6; foreground = :blue),
+        Highlighter((d, i, j) -> 1e-6 <= d[i, j] < 1e-3; foreground = :cyan),
+        Highlighter((d, i, j) -> 1e-3 <= d[i, j] < 1e-1; foreground = :green),
+        Highlighter((d, i, j) -> 1e-1 <= d[i, j] < 1; foreground = :yellow),
+        Highlighter((d, i, j) -> d[i, j] == 1; foreground = :light_red),
+        Highlighter((d, i, j) -> d[i, j] > 1; foreground = :light_magenta),
+    )
+    rms(block) = sqrt(mean(abs2.(block)))
 
-    block_rescalings = CA.first_column_rescaling_arrays(Y_end, p, t_end)
+    @info "Debugging Jacobian in first column of final state"
 
-    exact_jacobian_alg = CA.AutoDenseJacobian()
-    exact_blocks =
-        CA.first_column_block_arrays(exact_jacobian_alg, Y_end, p, dtγ, t_end)
+    exact_table_kwargs = (; table_kwargs..., highlighters = highlighters[1])
     exact_rms_values = map(block_keys) do block_key
         rms(exact_blocks[block_key])
     end
     exact_rescaled_rms_values = map(block_keys) do block_key
         rms(exact_blocks[block_key] .* block_rescalings[block_key])
     end
-    @info "exact, RMS per block [inconsistent units]:"
-    pretty_table(
-        exact_rms_values;
-        table_kwargs...,
-        highlighters = highlighters[1],
-    )
+    @info "exact, RMS per block [unnormalized]:"
+    pretty_table(exact_rms_values; exact_table_kwargs...)
     @info "exact, rescaled RMS per block [s^-1]:"
-    pretty_table(
-        exact_rescaled_rms_values;
-        table_kwargs...,
-        highlighters = highlighters[1],
-    )
-    println("<$('='^70)>")
+    pretty_table(exact_rescaled_rms_values; exact_table_kwargs...)
+    println("<$('='^70)>\n")
 
-    all_approx_jacobian_algs =
-        jacobian.alg isa CA.AutoSparseJacobian ?
-        (; manual = jacobian.alg.sparse_jacobian_alg, auto = jacobian.alg) :
-        (; manual = jacobian.alg)
-    all_approx_blocks = map(all_approx_jacobian_algs) do jacobian_alg
-        CA.first_column_block_arrays(jacobian_alg, Y_end, p, dtγ, t_end)
-    end
+    approx_table_kwargs = (; table_kwargs..., highlighters)
     if jacobian.alg isa CA.AutoSparseJacobian
         approx_diff_rescaled_rms_values = map(block_keys) do block_key
             (; manual, auto) = all_approx_blocks
@@ -129,7 +123,7 @@ if (
             rms((manual[block_key] - auto[block_key]) .* rescaling) : FT(0)
         end
         @info "manual approx - auto approx, rescaled RMS per block [s^-1]:"
-        pretty_table(approx_diff_rescaled_rms_values; table_kwargs...)
+        pretty_table(approx_diff_rescaled_rms_values; approx_table_kwargs...)
     end
     for (approx_name, approx_blocks) in pairs(all_approx_blocks)
         approx_error_rescaled_rms_values = map(block_keys) do block_key
@@ -141,9 +135,9 @@ if (
             rms(approx_error .* rescaling)
         end
         @info "$approx_name approx - exact, rescaled RMS per block [s^-1]:"
-        pretty_table(approx_error_rescaled_rms_values; table_kwargs...)
+        pretty_table(approx_error_rescaled_rms_values; approx_table_kwargs...)
     end
-    println("<$('='^70)>")
+    println("<$('='^70)>\n")
     if jacobian.alg isa CA.AutoSparseJacobian
         approx_diff_relative_rms_values = map(block_keys) do block_key
             (; manual, auto) = all_approx_blocks
@@ -156,7 +150,7 @@ if (
             approx_diff_rms_value / rms(exact_blocks[block_key])
         end
         @info "manual approx - auto approx, relative RMS per block [unitless]:"
-        pretty_table(approx_diff_relative_rms_values; table_kwargs...)
+        pretty_table(approx_diff_relative_rms_values; approx_table_kwargs...)
     end
     for (approx_name, approx_blocks) in pairs(all_approx_blocks)
         approx_error_relative_rms_values = map(block_keys) do block_key
@@ -168,7 +162,7 @@ if (
             approx_error_rms_value / rms(exact_blocks[block_key])
         end
         @info "$approx_name approx - exact, relative RMS per block [unitless]:"
-        pretty_table(approx_error_relative_rms_values; table_kwargs...)
+        pretty_table(approx_error_relative_rms_values; approx_table_kwargs...)
     end
 end
 
diff --git a/.buildkite/gpu_pipeline/pipeline.yml b/.buildkite/gpu_pipeline/pipeline.yml
@@ -24,6 +24,7 @@ steps:
       - julia --project=.buildkite -e 'using Pkg; Pkg.precompile()'
       - julia --project=.buildkite -e 'using CUDA; CUDA.precompile_runtime()'
       - julia --project=.buildkite -e 'using Pkg; Pkg.status()'
+      - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))'
       - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))'
 
     agents:
diff --git a/.buildkite/longruns_gpu/pipeline.yml b/.buildkite/longruns_gpu/pipeline.yml
@@ -25,6 +25,7 @@ steps:
       - julia --project=.buildkite -e 'using Pkg; Pkg.precompile()'
       - julia --project=.buildkite -e 'using CUDA; CUDA.precompile_runtime()'
       - julia --project=.buildkite -e 'using Pkg; Pkg.status()'
+      - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))'
       - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))'
 
     agents:
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -28,7 +28,8 @@ steps:
 
       - echo "--- Instantiate .buildkite"
       - "julia --project=.buildkite -e 'using Pkg; Pkg.instantiate(;verbose=true); Pkg.precompile(;strict=true); using CUDA; CUDA.precompile_runtime(); Pkg.status()'"
-      - "julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name=\"ClimaCore\", rev=\"main\"))'"
+      - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))'
+      - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))'
 
     agents:
       slurm_cpus_per_task: 8
diff --git a/src/prognostic_equations/implicit/auto_dense_jacobian.jl b/src/prognostic_equations/implicit/auto_dense_jacobian.jl
@@ -34,12 +34,12 @@ very large memory requirements at higher vertical resolutions.
 
 When the number of values in each column is very large, computing the entire
 dense matrix in a single evaluation of `implicit_tendency!` can be too expensive
-to compile and run. So, the dual number components are split into batches with a
-maximum size of `max_simultaneous_derivatives`, and we call `implicit_tendency!`
-once for each batch. That is, if the batch size is ``s``, then the first batch
-evaluates the coefficients of ``ε₁`` through ``εₛ``, the second evaluates the
-coefficients of ``εₛ₊₁`` through ``ε₂ₛ``, and so on until ``εₙ``. The default
-batch size is 32.
+to compile and run. So, the dual number components are split into partitions
+with a maximum size of `max_simultaneous_derivatives`, and we call
+`implicit_tendency!` once for each partition. That is, if the partition size is
+``s``, then the first partition evaluates the coefficients of ``ε₁`` through
+``εₛ``, the second evaluates the coefficients of ``εₛ₊₁`` through ``ε₂ₛ``, and so
+on until ``εₙ``. The default partition size is 32.
 """
 struct AutoDenseJacobian{S} <: JacobianAlgorithm end
 AutoDenseJacobian(max_simultaneous_derivatives = 32) =
@@ -53,11 +53,11 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos)
     DA = ClimaComms.array_type(Y)
 
     FT_dual = ForwardDiff.Dual{Jacobian, FT, max_simultaneous_derivatives(alg)}
-    Y_dual = replace_parent_type(Y, FT_dual)
-    Yₜ_dual = similar(Y_dual)
     precomputed_dual =
         replace_parent_type(implicit_precomputed_quantities(Y, atmos), FT_dual)
     scratch_dual = replace_parent_type(temporary_quantities(Y, atmos), FT_dual)
+    Y_dual = replace_parent_type(Y, FT_dual)
+    Yₜ_dual = similar(Y_dual)
 
     N = length(Fields.column(Y, 1, 1, 1))
     n_columns = Fields.ncolumns(Y.c)
@@ -72,10 +72,10 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos)
     I_matrix = reshape(I_column_matrix, N, N, 1)
 
     return (;
-        Y_dual,
-        Yₜ_dual,
         precomputed_dual,
         scratch_dual,
+        Y_dual,
+        Yₜ_dual,
         column_matrices,
         column_lu_factors,
         column_lu_vectors,
@@ -85,7 +85,7 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos)
 end
 
 function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, t)
-    (; Y_dual, Yₜ_dual, precomputed_dual, scratch_dual, column_matrices) = cache
+    (; precomputed_dual, scratch_dual, Y_dual, Yₜ_dual, column_matrices) = cache
     device = ClimaComms.device(Y.c)
     column_indices = column_index_iterator(Y)
     scalar_names = scalar_field_names(Y)
@@ -99,48 +99,47 @@ function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, t)
     for jacobian_index_to_Y_index_map_partition in
         ClimaComms.threadable(device, jacobian_index_to_Y_index_map_partitions)
 
-        # Add a unique ε to Y for each Jacobian column index in this batch. With
+        # Add a unique ε to each value in Y that is part of this partition. With
         # Y_col and Yᴰ_col denoting the columns of Y and Y_dual at column_index,
-        # set Yᴰ_col to Y_col + I[:, jacobian_indices_in_partition] * εs, where
-        # I is the identity matrix for Y_col (i.e., the value of ∂Y_col/∂Y_col),
-        # εs is a vector of max_simultaneous_derivatives(alg) dual number
-        # components, and jacobian_indices_in_partition is equal to
+        # set Yᴰ_col to Y_col + I[:, jacobian_column_indices] * εs, where I is
+        # the identity matrix for Y_col (i.e., the value of ∂Y_col/∂Y_col), εs
+        # is a vector of max_simultaneous_derivatives(alg) dual number
+        # components, and jacobian_column_indices is equal to
         # first.(jacobian_index_to_Y_index_map_partition).
         Y_dual .= Y
         ClimaComms.@threaded device begin
             # On multithreaded devices, assign one thread to each combination of
-            # spatial column index and Jacobian index in this batch.
+            # spatial column index and Jacobian index in this partition.
             for column_index in column_indices,
-                (ε_index, (_, (scalar_index, level_index))) in
+                (diagonal_ε_index, (_, (scalar_index, level_index))) in
                 enumerate(jacobian_index_to_Y_index_map_partition)
 
-                Y_partials =
-                    ntuple(==(ε_index), Val(max_simultaneous_derivatives(alg)))
-                Y_dual_εs_value = ForwardDiff.Dual{Jacobian}(0, Y_partials)
+                n_εs_val = Val(max_simultaneous_derivatives(alg))
+                Y_dual_ε_coefficients = ntuple(==(diagonal_ε_index), n_εs_val)
                 unrolled_applyat(scalar_index, scalar_names) do name
                     field = MatrixFields.get_field(Y_dual, name)
                     @inbounds point(field, level_index, column_index...)[] +=
-                        Y_dual_εs_value
+                        ForwardDiff.Dual{Jacobian}(0, Y_dual_ε_coefficients)
                 end
             end
         end
 
-        # Compute this batch's portions of ∂p/∂Y and ∂Yₜ/∂Y.
+        # Compute this partition's columns of ∂p/∂Y and ∂Yₜ/∂Y.
         set_implicit_precomputed_quantities!(Y_dual, p_dual, t)
         implicit_tendency!(Yₜ_dual, Y_dual, p_dual, t)
 
-        # Copy this batch's portion of ∂Yₜ/∂Y into column_matrices. With Yₜ_col
-        # and Yₜᴰ_col denoting the columns of Yₜ and Yₜ_dual at column_index, and
+        # Copy this partition of ∂Yₜ/∂Y into column_matrices. With Yₜ_col and
+        # Yₜᴰ_col denoting the columns of Yₜ and Yₜ_dual at column_index, and
         # with col_matrix denoting the matrix at the corresponding matrix_index
         # in column_matrices, copy the coefficients of the εs in Yₜᴰ_col into
         # col_matrix, where the previous steps have set Yₜᴰ_col to
-        # Yₜ_col + (∂Yₜ_col/∂Y_col)[:, jacobian_indices_in_batch] * εs. In other
-        # words, set col_matrix[jacobian_row_index, jacobian_column_index] to
-        # ∂Yₜ_col[jacobian_row_index]/∂Y_col[jacobian_column_index], obtaining
-        # this derivative from the coefficient of εs[ε_index] in
-        # Yₜᴰ_col[jacobian_row_index], where ε_index is the index of
-        # jacobian_column_index in jacobian_indices_in_batch. After all batches
-        # are processed, col_matrix contains the full Jacobian ∂Yₜ_col/∂Y_col.
+        # Yₜ_col + (∂Yₜ_col/∂Y_col)[:, jacobian_column_indices] * εs. In
+        # other words, set col_matrix[jacobian_row_index, jacobian_column_index]
+        # to ∂Yₜ_col[jacobian_row_index]/∂Y_col[jacobian_column_index],
+        # obtaining this derivative from the coefficient of
+        # εs[jacobian_column_ε_index] in Yₜᴰ_col[jacobian_row_index], where
+        # jacobian_column_ε_index is the index of jacobian_column_index in
+        # jacobian_column_indices.
         ClimaComms.@threaded device begin
             # On multithreaded devices, assign one thread to each combination of
             # spatial column index and scalar level index.
@@ -153,16 +152,16 @@ function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, t)
                         field = MatrixFields.get_field(Yₜ_dual, name)
                         @inbounds point(field, level_index, column_index...)[]
                     end
-                Yₜ_partials = ForwardDiff.partials(Yₜ_dual_value)
-                for (ε_index, (jacobian_column_index, _)) in
+                Yₜ_dual_ε_coefficients = ForwardDiff.partials(Yₜ_dual_value)
+                for (jacobian_column_ε_index, (jacobian_column_index, _)) in
                     enumerate(jacobian_index_to_Y_index_map_partition)
                     cartesian_index = (
                         jacobian_row_index,
                         jacobian_column_index,
                         matrix_index,
                     )
                     @inbounds column_matrices[cartesian_index...] =
-                        Yₜ_partials[ε_index]
+                        Yₜ_dual_ε_coefficients[jacobian_column_ε_index]
                 end
             end
         end
diff --git a/src/prognostic_equations/implicit/auto_sparse_jacobian.jl b/src/prognostic_equations/implicit/auto_sparse_jacobian.jl
diff --git a/src/prognostic_equations/implicit/autodiff_utils.jl b/src/prognostic_equations/implicit/autodiff_utils.jl
diff --git a/src/prognostic_equations/implicit/jacobian.jl b/src/prognostic_equations/implicit/jacobian.jl