diff --git a/.buildkite/Manifest-v1.11.toml b/.buildkite/Manifest-v1.11.toml index 891d84bf96..196882e50b 100644 --- a/.buildkite/Manifest-v1.11.toml +++ b/.buildkite/Manifest-v1.11.toml @@ -363,14 +363,16 @@ version = "0.5.18" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" [[deps.ClimaAtmos]] -deps = ["Adapt", "ArgParse", "Artifacts", "AtmosphericProfilesLibrary", "ClimaComms", "ClimaCore", "ClimaDiagnostics", "ClimaInterpolations", "ClimaParams", "ClimaTimeSteppers", "ClimaUtilities", "CloudMicrophysics", "Dates", "ForwardDiff", "Insolation", "Interpolations", "LazyArtifacts", "LazyBroadcast", "LinearAlgebra", "Logging", "NCDatasets", "NVTX", "NullBroadcasts", "RRTMGP", "Random", "SciMLBase", "StaticArrays", "Statistics", "SurfaceFluxes", "Thermodynamics", "UnrolledUtilities", "YAML"] +deps = ["Adapt", "ArgParse", "Artifacts", "AtmosphericProfilesLibrary", "ClimaComms", "ClimaCore", "ClimaDiagnostics", "ClimaInterpolations", "ClimaParams", "ClimaTimeSteppers", "ClimaUtilities", "CloudMicrophysics", "Dates", "ForwardDiff", "Insolation", "Interpolations", "LazyArtifacts", "LazyBroadcast", "LinearAlgebra", "Logging", "NCDatasets", "NVTX", "NullBroadcasts", "RRTMGP", "Random", "SciMLBase", "SparseMatrixColorings", "StaticArrays", "Statistics", "SurfaceFluxes", "Thermodynamics", "UnrolledUtilities", "YAML"] path = ".." uuid = "b2c96348-7fb7-4fe0-8da9-78d88439e717" version = "0.31.0" [[deps.ClimaComms]] deps = ["Adapt", "Logging", "LoggingExtras"] -git-tree-sha1 = "75b9d1a3b4e3efa2cbbae2eb7b52f14c0b38ccf0" +git-tree-sha1 = "d341c3fc97a98dbecd6b635f34638b2d9771f94e" +repo-rev = "dy/memory_api" +repo-url = "https://github.com/CliMA/ClimaComms.jl.git" uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" version = "0.6.8" weakdeps = ["CUDA", "MPI"] @@ -381,9 +383,11 @@ weakdeps = ["CUDA", "MPI"] [[deps.ClimaCore]] deps = ["Adapt", "BandedMatrices", "BlockArrays", "ClimaComms", "CubedSphere", "DataStructures", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LazyBroadcast", "LinearAlgebra", "MultiBroadcastFusion", "NVTX", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "StaticArrays", "Statistics", "UnrolledUtilities"] -git-tree-sha1 = "14d3d5810ce1e3c990450a2ce7abc6a1e162855f" +git-tree-sha1 = "a273452127dbb052f2963e3c1095730a996d49a6" +repo-rev = "main" +repo-url = "https://github.com/CliMA/ClimaCore.jl.git" uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.14.35" +version = "0.14.36" weakdeps = ["CUDA", "Krylov"] [deps.ClimaCore.extensions] @@ -2318,6 +2322,20 @@ deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" version = "1.11.0" +[[deps.SparseMatrixColorings]] +deps = ["ADTypes", "DocStringExtensions", "LinearAlgebra", "PrecompileTools", "Random", "SparseArrays"] +git-tree-sha1 = "ab958b4fec46d1f1d057bb8e2a99bfdb90744646" +uuid = "0a514795-09f3-496d-8182-132a7b665d35" +version = "0.4.20" + + [deps.SparseMatrixColorings.extensions] + SparseMatrixColoringsCliqueTreesExt = "CliqueTrees" + SparseMatrixColoringsColorsExt = "Colors" + + [deps.SparseMatrixColorings.weakdeps] + CliqueTrees = "60701a23-6482-424a-84db-faee86b9b1f8" + Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" + [[deps.SpecialFunctions]] deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] git-tree-sha1 = "41852b8679f78c8d8961eeadc8f62cef861a52e3" diff --git a/.buildkite/ci_driver.jl b/.buildkite/ci_driver.jl index 3207bcd3a0..af04fe79f4 100644 
--- a/.buildkite/ci_driver.jl +++ b/.buildkite/ci_driver.jl @@ -39,12 +39,152 @@ using Test import Tar import Base.Filesystem: rm import Statistics: mean -import LinearAlgebra: norm_sqr +import LinearAlgebra: norm_sqr, diag, UniformScaling include(joinpath(pkgdir(CA), "post_processing", "ci_plots.jl")) ref_job_id = config.parsed_args["reference_job_id"] reference_job_id = isnothing(ref_job_id) ? simulation.job_id : ref_job_id +if ( + config.parsed_args["debug_approximate_jacobian"] && + !config.parsed_args["use_dense_jacobian"] +) + @info "Debugging Jacobian in first column of final state" + + Y_end = integrator.u + t_end = integrator.t + (; p, dt) = integrator + timestepper_alg = integrator.alg + tableau_coefficients = + timestepper_alg isa CA.CTS.RosenbrockAlgorithm ? + timestepper_alg.tableau.Γ : timestepper_alg.tableau.a_imp + jacobian = + timestepper_alg isa CA.CTS.RosenbrockAlgorithm ? integrator.cache.W : + integrator.cache.newtons_method_cache.j + + FT = eltype(Y_end) + γs = filter(!iszero, CA.LinearAlgebra.diag(tableau_coefficients)) + dtγ = FT(float(dt) * γs[end]) + scalar_names = CA.scalar_field_names(Y_end) + block_keys = Iterators.product(scalar_names, scalar_names) + + exact_jacobian_alg = CA.AutoDenseJacobian() + all_approx_jacobian_algs = + jacobian.alg isa CA.AutoSparseJacobian ? + (; manual = jacobian.alg.sparse_jacobian_alg, auto = jacobian.alg) : + (; manual = jacobian.alg) + exact_blocks = + CA.first_column_block_arrays(exact_jacobian_alg, Y_end, p, dtγ, t_end) + all_approx_blocks = map(all_approx_jacobian_algs) do jacobian_alg + CA.first_column_block_arrays(jacobian_alg, Y_end, p, dtγ, t_end) + end + block_rescalings = CA.first_column_rescaling_arrays(Y_end, p, t_end) + + highlighters = ( + Highlighter((d, i, j) -> d[i, j] < 1e-12; foreground = :dark_gray), + Highlighter((d, i, j) -> 1e-12 <= d[i, j] < 1e-6; foreground = :blue), + Highlighter((d, i, j) -> 1e-6 <= d[i, j] < 1e-3; foreground = :cyan), + Highlighter((d, i, j) -> 1e-3 <= d[i, j] < 1e-1; foreground = :green), + Highlighter((d, i, j) -> 1e-1 <= d[i, j] < 1; foreground = :yellow), + Highlighter((d, i, j) -> d[i, j] == 1; foreground = :light_red), + Highlighter((d, i, j) -> d[i, j] > 1; foreground = :light_magenta), + ) + table_kwargs = (; + columns_width = 5, + crop = :none, + formatters = ft_printf("%1.0e"), + highlighters, + row_labels = collect(scalar_names), + show_header = false, + tf = tf_matrix, + vlines = [1], + ) + exact_table_kwargs = (; table_kwargs..., highlighters = highlighters[1]) + bandwidth_table_kwargs = (; exact_table_kwargs..., formatters = nothing) + + rms(block) = sqrt(mean(abs2.(block))) + bandwidth(block) = + count((1 - size(block, 1)):(size(block, 2) - 1)) do band_index + any(!iszero, diag(block, band_index)) + end + + bandwidth_error_values = map(block_keys) do block_key + approx_blocks = first(all_approx_blocks) + approx_bandwidth = + haskey(approx_blocks, block_key) ? + ( + approx_blocks[block_key] isa UniformScaling ? 1 : + bandwidth(approx_blocks[block_key]) + ) : 0 + max(bandwidth(exact_blocks[block_key]) - approx_bandwidth, 0) + end + @info "exact - approx, number of missing bands per block:" + pretty_table(bandwidth_error_values; bandwidth_table_kwargs...) + + exact_rms_values = map(block_keys) do block_key + rms(exact_blocks[block_key]) + end + normalized_exact_rms_values = map(block_keys) do block_key + rms(exact_blocks[block_key] .* block_rescalings[block_key]) + end + @info "unnormalized exact, RMS per block:" + pretty_table(exact_rms_values; exact_table_kwargs...) 
+ @info "normalized exact, RMS per block [s^-1]:" + pretty_table(normalized_exact_rms_values; exact_table_kwargs...) + + println("<$('='^70)>\n") + if jacobian.alg isa CA.AutoSparseJacobian + normalized_approx_difference_rms_values = map(block_keys) do block_key + (; manual, auto) = all_approx_blocks + rescaling = block_rescalings[block_key] + haskey(manual, block_key) && + !(manual[block_key] isa UniformScaling) ? + rms((manual[block_key] - auto[block_key]) .* rescaling) : FT(0) + end + @info "normalized manual approx - auto approx, RMS per block [s^-1]:" + pretty_table(normalized_approx_difference_rms_values; table_kwargs...) + end + for (approx_name, approx_blocks) in pairs(all_approx_blocks) + normalized_approx_error_rms_values = map(block_keys) do block_key + approx_error = + haskey(approx_blocks, block_key) ? + exact_blocks[block_key] - approx_blocks[block_key] : + exact_blocks[block_key] + rescaling = block_rescalings[block_key] + rms(approx_error .* rescaling) + end + @info "normalized exact - $approx_name approx, RMS per block [s^-1]:" + pretty_table(normalized_approx_error_rms_values; table_kwargs...) + end + println("<$('='^70)>\n") + if jacobian.alg isa CA.AutoSparseJacobian + approx_difference_relative_rms_values = map(block_keys) do block_key + (; manual, auto) = all_approx_blocks + rescaling = block_rescalings[block_key] + approx_difference_rms_value = + haskey(manual, block_key) && + !(manual[block_key] isa UniformScaling) ? + rms((manual[block_key] - auto[block_key])) : FT(0) + approx_difference_rms_value == 0 ? FT(0) : + approx_difference_rms_value / rms(exact_blocks[block_key]) + end + @info "manual approx - auto approx, relative RMS per block [unitless]:" + pretty_table(approx_difference_relative_rms_values; table_kwargs...) + end + for (approx_name, approx_blocks) in pairs(all_approx_blocks) + approx_error_relative_rms_values = map(block_keys) do block_key + approx_error_rms_value = + haskey(approx_blocks, block_key) ? + rms(exact_blocks[block_key] - approx_blocks[block_key]) : + rms(exact_blocks[block_key]) + approx_error_rms_value == 0 ? FT(0) : + approx_error_rms_value / rms(exact_blocks[block_key]) + end + @info "exact - $approx_name approx, relative RMS per block [unitless]:" + pretty_table(approx_error_relative_rms_values; table_kwargs...) + end +end + if sol_res.ret_code == :simulation_crashed error( "The ClimaAtmos simulation has crashed. 
See the stack trace for details.", diff --git a/.buildkite/gpu_pipeline/pipeline.yml b/.buildkite/gpu_pipeline/pipeline.yml index cf04a4db22..81544498be 100644 --- a/.buildkite/gpu_pipeline/pipeline.yml +++ b/.buildkite/gpu_pipeline/pipeline.yml @@ -24,6 +24,8 @@ steps: - julia --project=.buildkite -e 'using Pkg; Pkg.precompile()' - julia --project=.buildkite -e 'using CUDA; CUDA.precompile_runtime()' - julia --project=.buildkite -e 'using Pkg; Pkg.status()' + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))' + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))' agents: slurm_gpus: 1 diff --git a/.buildkite/longruns_gpu/pipeline.yml b/.buildkite/longruns_gpu/pipeline.yml index 1cc951ecf0..60ca8d188e 100644 --- a/.buildkite/longruns_gpu/pipeline.yml +++ b/.buildkite/longruns_gpu/pipeline.yml @@ -25,6 +25,8 @@ steps: - julia --project=.buildkite -e 'using Pkg; Pkg.precompile()' - julia --project=.buildkite -e 'using CUDA; CUDA.precompile_runtime()' - julia --project=.buildkite -e 'using Pkg; Pkg.status()' + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))' + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))' agents: slurm_gpus: 1 diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 274e3767b0..766005dd3b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -28,6 +28,8 @@ steps: - echo "--- Instantiate .buildkite" - "julia --project=.buildkite -e 'using Pkg; Pkg.instantiate(;verbose=true); Pkg.precompile(;strict=true); using CUDA; CUDA.precompile_runtime(); Pkg.status()'" + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))' + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))' agents: slurm_cpus_per_task: 8 @@ -631,7 +633,7 @@ steps: --job_id amip_target_edonly_nonequil artifact_paths: "amip_target_edonly_nonequil/output_active/*" agents: - slurm_mem: 20GB + slurm_mem: 64GB - group: "Diagnostic EDMFX" steps: diff --git a/Project.toml b/Project.toml index bdefca8f3f..bab2077834 100644 --- a/Project.toml +++ b/Project.toml @@ -30,6 +30,7 @@ NullBroadcasts = "0d71be07-595a-4f89-9529-4065a4ab43a6" RRTMGP = "a01a1ee8-cea4-48fc-987c-fc7878d79da1" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" +SparseMatrixColorings = "0a514795-09f3-496d-8182-132a7b665d35" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" SurfaceFluxes = "49b00bb7-8bd4-4f2b-b78c-51cd0450215f" @@ -64,6 +65,7 @@ NullBroadcasts = "0.1" RRTMGP = "0.21.3" Random = "1" SciMLBase = "2.12" +SparseMatrixColorings = "0.4.20" StaticArrays = "1.7" Statistics = "1" SurfaceFluxes = "0.11, 0.12" diff --git a/config/default_configs/default_config.yml b/config/default_configs/default_config.yml index 871b8baeb4..da960c6331 100644 --- a/config/default_configs/default_config.yml +++ b/config/default_configs/default_config.yml @@ -89,6 +89,12 @@ jvp_step_adjustment: use_dense_jacobian: help: "Whether to use a dense Jacobian matrix that is computed using forward-mode automatic differentiation and inverted using LU factorization [`true`, `false` (default)]" value: false +use_auto_jacobian: + help: "Whether to populate the entries of the sparse Jacobian matrix using forward-mode automatic 
differentiation with sparse matrix coloring (only used when `use_dense_jacobian` is `false`) [`true`, `false` (default)]" + value: true # TODO: Change this to false +debug_approximate_jacobian: + help: "Whether to compare approximations of the Jacobian matrix in the first column of the final state against the exact Jacobian [`true`, `false` (default)]" + value: true # TODO: Change this to false update_jacobian_every: help: "Frequency at which the Jacobian matrix should be updated (once per timestep, once per timestepper stage, or once per linear solve) [`dt`, `stage`, `solve` (default)]" value: "solve" diff --git a/reproducibility_tests/ref_counter.jl b/reproducibility_tests/ref_counter.jl index 6491e2a62e..ac99383505 100644 --- a/reproducibility_tests/ref_counter.jl +++ b/reproducibility_tests/ref_counter.jl @@ -1,4 +1,4 @@ -248 +249 # **README** # diff --git a/src/ClimaAtmos.jl b/src/ClimaAtmos.jl index 7ed4d5a5fc..ae521764de 100644 --- a/src/ClimaAtmos.jl +++ b/src/ClimaAtmos.jl @@ -62,6 +62,7 @@ include( joinpath("prognostic_equations", "implicit", "manual_sparse_jacobian.jl"), ) include(joinpath("prognostic_equations", "implicit", "auto_dense_jacobian.jl")) +include(joinpath("prognostic_equations", "implicit", "auto_sparse_jacobian.jl")) include(joinpath("prognostic_equations", "implicit", "autodiff_utils.jl")) include(joinpath("prognostic_equations", "water_advection.jl")) diff --git a/src/cache/temporary_quantities.jl b/src/cache/temporary_quantities.jl index 505effd70c..5477e177f2 100644 --- a/src/cache/temporary_quantities.jl +++ b/src/cache/temporary_quantities.jl @@ -5,6 +5,20 @@ using ClimaCore.Utilities: half # but cannot be computed on the fly. Unlike the precomputed quantities, these # can be modified at any point, so they should never be assumed to be unchanged # between function calls. +function implicit_temporary_quantities(Y, atmos) + center_space, face_space = axes(Y.c), axes(Y.f) + + FT = Spaces.undertype(center_space) + uvw_vec = UVW(FT(0), FT(0), FT(0)) + return (; + ᶠtemp_scalar = Fields.Field(FT, face_space), # ᶠρaK_h, ᶠρaK_u + ᶜtemp_scalar = Fields.Field(FT, center_space), # ᶜρχₜ_diffusion, ᶜa_scalar + ᶜtemp_scalar_2 = Fields.Field(FT, center_space), # ᶜKᵥʲ, ᶜK_h_scaled + ᶜtemp_C3 = Fields.Field(C3{FT}, center_space), # ᶜu₃ʲ + ᶠtemp_CT3 = Fields.Field(CT3{FT}, face_space), # ᶠuₕ³, ᶠu³_diff + ᶠtemp_UVWxUVW = Fields.Field(typeof(uvw_vec * uvw_vec'), face_space), # ᶠstrain_rate + ) +end function temporary_quantities(Y, atmos) center_space, face_space = axes(Y.c), axes(Y.f) diff --git a/src/prognostic_equations/implicit/auto_dense_jacobian.jl b/src/prognostic_equations/implicit/auto_dense_jacobian.jl index f7e11400d5..9a119f5c2f 100644 --- a/src/prognostic_equations/implicit/auto_dense_jacobian.jl +++ b/src/prognostic_equations/implicit/auto_dense_jacobian.jl @@ -34,12 +34,12 @@ very large memory requirements at higher vertical resolutions. When the number of values in each column is very large, computing the entire dense matrix in a single evaluation of `implicit_tendency!` can be too expensive -to compile and run. So, the dual number components are split into batches with a -maximum size of `max_simultaneous_derivatives`, and we call `implicit_tendency!` -once for each batch. That is, if the batch size is ``s``, then the first batch -evaluates the coefficients of ``ε₁`` through ``εₛ``, the second evaluates the -coefficients of ``εₛ₊₁`` through ``ε₂ₛ``, and so on until ``εₙ``. The default -batch size is 32. +to compile and run. 
So, the dual number components are split into partitions +with a maximum size of `max_simultaneous_derivatives`, and we call +`implicit_tendency!` once for each partition. That is, if the partition size is +``s``, then the first partition evaluates the coefficients of ``ε₁`` through +``εₛ``, the second evaluates the coefficients of ``εₛ₊₁`` through ``ε₂ₛ``, and so +on until ``εₙ``. The default partition size is 32. """ struct AutoDenseJacobian{S} <: JacobianAlgorithm end AutoDenseJacobian(max_simultaneous_derivatives = 32) = @@ -52,12 +52,14 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos) FT = eltype(Y) DA = ClimaComms.array_type(Y) + precomputed = implicit_precomputed_quantities(Y, atmos) + scratch = implicit_temporary_quantities(Y, atmos) + FT_dual = ForwardDiff.Dual{Jacobian, FT, max_simultaneous_derivatives(alg)} - Y_dual = replace_parent_type(Y, FT_dual) + precomputed_dual = replace_parent_eltype(precomputed, FT_dual) + scratch_dual = replace_parent_eltype(scratch, FT_dual) + Y_dual = replace_parent_eltype(Y, FT_dual) Yₜ_dual = similar(Y_dual) - precomputed_dual = - replace_parent_type(implicit_precomputed_quantities(Y, atmos), FT_dual) - scratch_dual = replace_parent_type(temporary_quantities(Y, atmos), FT_dual) N = length(Fields.column(Y, 1, 1, 1)) n_columns = Fields.ncolumns(Y.c) @@ -72,10 +74,10 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos) I_matrix = reshape(I_column_matrix, N, N, 1) return (; - Y_dual, - Yₜ_dual, precomputed_dual, scratch_dual, + Y_dual, + Yₜ_dual, column_matrices, column_lu_factors, column_lu_vectors, @@ -84,89 +86,82 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos) ) end -function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, dtγ, t) - (; Y_dual, Yₜ_dual, precomputed_dual, scratch_dual, column_matrices) = cache +function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, t) + (; precomputed_dual, scratch_dual, Y_dual, Yₜ_dual, column_matrices) = cache device = ClimaComms.device(Y.c) column_indices = column_index_iterator(Y) scalar_names = scalar_field_names(Y) - scalar_level_indices = scalar_level_index_pairs(Y) - batch_size = max_simultaneous_derivatives(alg) - batch_size_val = Val(batch_size) - - p_dual_args = ntuple(Val(fieldcount(typeof(p)))) do cache_field_index - cache_field_name = fieldname(typeof(p), cache_field_index) - if cache_field_name == :precomputed - (; p.precomputed..., precomputed_dual...) - elseif cache_field_name == :scratch - scratch_dual - else - getfield(p, cache_field_index) - end - end - p_dual = AtmosCache(p_dual_args...) + field_vector_indices = field_vector_index_iterator(Y) + p_dual = append_to_atmos_cache(p, precomputed_dual, scratch_dual) - batches = Iterators.partition(scalar_level_indices, batch_size) - for batch_scalar_level_indices in ClimaComms.threadable(device, batches) - Y_dual .= Y + jacobian_index_to_Y_index_map_partitions = Iterators.partition( + enumerate(field_vector_indices), + max_simultaneous_derivatives(alg), + ) + for jacobian_index_to_Y_index_map_partition in + ClimaComms.threadable(device, jacobian_index_to_Y_index_map_partitions) - # Add a unique ε to Y for each scalar level index in this batch. With + # Add a unique ε to each value in Y that is part of this partition. 
With # Y_col and Yᴰ_col denoting the columns of Y and Y_dual at column_index, - # set Yᴰ_col to Y_col + I[:, batch_scalar_level_indices] * εs, where I - # is the identity matrix for Y_col (i.e., the value of ∂Y_col/∂Y_col), - # εs is a vector of batch_size dual number components, and - # batch_scalar_level_indices are the batch's indices into Y_col. + # set Yᴰ_col to Y_col + I[:, jacobian_column_indices] * εs, where I is + # the identity matrix for Y_col (i.e., the value of ∂Y_col/∂Y_col), εs + # is a vector of max_simultaneous_derivatives(alg) dual number + # components, and jacobian_column_indices is equal to + # first.(jacobian_index_to_Y_index_map_partition). + Y_dual .= Y ClimaComms.@threaded device begin - # On multithreaded devices, assign one thread to each combination of - # spatial column index and scalar level index in this batch. + # On multithreaded devices, use one thread for each dual number. for column_index in column_indices, - (ε_index, (_, (scalar_index, level_index))) in - enumerate(batch_scalar_level_indices) + (diagonal_entry_ε_index, (_, (scalar_index, level_index))) in + enumerate(jacobian_index_to_Y_index_map_partition) - Y_partials = ntuple(i -> i == ε_index ? 1 : 0, batch_size_val) - Y_dual_increment = ForwardDiff.Dual{Jacobian}(0, Y_partials...) + n_εs_val = Val(max_simultaneous_derivatives(alg)) + ε_coefficients = ntuple(==(diagonal_entry_ε_index), n_εs_val) unrolled_applyat(scalar_index, scalar_names) do name field = MatrixFields.get_field(Y_dual, name) @inbounds point(field, level_index, column_index...)[] += - Y_dual_increment + ForwardDiff.Dual{Jacobian}(0, ε_coefficients) end end end - # Compute this batch's portions of ∂p/∂Y and ∂Yₜ/∂Y. + # Compute this partition of ∂p/∂Y and ∂Yₜ/∂Y. set_implicit_precomputed_quantities!(Y_dual, p_dual, t) implicit_tendency!(Yₜ_dual, Y_dual, p_dual, t) - # Copy this batch's portion of ∂Yₜ/∂Y into column_matrices. With Yₜ_col - # and Yₜᴰ_col denoting the columns of Yₜ and Yₜ_dual at column_index, and + # Copy this partition of ∂Yₜ/∂Y into column_matrices. With Yₜ_col and + # Yₜᴰ_col denoting the columns of Yₜ and Yₜ_dual at column_index, and # with col_matrix denoting the matrix at the corresponding matrix_index # in column_matrices, copy the coefficients of the εs in Yₜᴰ_col into # col_matrix, where the previous steps have set Yₜᴰ_col to - # Yₜ_col + (∂Yₜ_col/∂Y_col)[:, batch_scalar_level_indices] * εs. - # Specifically, set col_matrix[scalar_level_index1, scalar_level_index2] - # to ∂Yₜ_col[scalar_level_index1]/∂Y_col[scalar_level_index2], obtaining - # this derivative from the coefficient of εs[ε_index] in - # Yₜᴰ_col[scalar_level_index1], where ε_index is the index of - # scalar_level_index2 in batch_scalar_level_indices. After all batches - # have been processed, col_matrix is the full Jacobian ∂Yₜ_col/∂Y_col. + # Yₜ_col + (∂Yₜ_col/∂Y_col)[:, jacobian_column_indices] * εs. In + # other words, set col_matrix[jacobian_row_index, jacobian_column_index] + # to ∂Yₜ_col[jacobian_row_index]/∂Y_col[jacobian_column_index], + # obtaining this derivative from the coefficient of + # εs[jacobian_column_ε_index] in Yₜᴰ_col[jacobian_row_index], where + # jacobian_column_ε_index is the index of jacobian_column_index in + # jacobian_column_indices. ClimaComms.@threaded device begin - # On multithreaded devices, assign one thread to each combination of - # spatial column index and scalar level index. + # On multithreaded devices, use one thread for each dual number. 
for (matrix_index, column_index) in enumerate(column_indices), - (scalar_level_index1, (scalar_index1, level_index1)) in - scalar_level_indices + (jacobian_row_index, (scalar_index, level_index)) in + enumerate(field_vector_indices) - Yₜ_dual_value = - unrolled_applyat(scalar_index1, scalar_names) do name + dual_number = + unrolled_applyat(scalar_index, scalar_names) do name field = MatrixFields.get_field(Yₜ_dual, name) - @inbounds point(field, level_index1, column_index...)[] + @inbounds point(field, level_index, column_index...)[] end - Yₜ_partials = ForwardDiff.partials(Yₜ_dual_value) - for (ε_index, (scalar_level_index2, _)) in - enumerate(batch_scalar_level_indices) - cartesian_index = - (scalar_level_index1, scalar_level_index2, matrix_index) + ε_coefficients = ForwardDiff.partials(dual_number) + for (jacobian_column_ε_index, (jacobian_column_index, _)) in + enumerate(jacobian_index_to_Y_index_map_partition) + cartesian_index = ( + jacobian_row_index, + jacobian_column_index, + matrix_index, + ) @inbounds column_matrices[cartesian_index...] = - Yₜ_partials[ε_index] + ε_coefficients[jacobian_column_ε_index] end end end @@ -178,7 +173,7 @@ function update_jacobian!(alg::AutoDenseJacobian, cache, Y, p, dtγ, t) device = ClimaComms.device(Y.c) # Set column_matrices to ∂Yₜ/∂Y. - update_column_matrices!(alg, cache, Y, p, dtγ, t) + update_column_matrices!(alg, cache, Y, p, t) # Set column_lu_factors to ∂R/∂Y = dtγ * ∂Yₜ/∂Y - I, where R is the residual # of the implicit equation. @@ -193,21 +188,22 @@ function invert_jacobian!(::AutoDenseJacobian, cache, ΔY, R) device = ClimaComms.device(ΔY.c) column_indices = column_index_iterator(ΔY) scalar_names = scalar_field_names(ΔY) - scalar_level_indices = scalar_level_index_pairs(ΔY) + vector_index_to_field_vector_index_map = + enumerate(field_vector_index_iterator(ΔY)) # Copy all scalar values from R into column_lu_vectors. ClimaComms.@threaded device begin - # On multithreaded devices, assign one thread to each index into R. + # On multithreaded devices, use one thread for each number. for (vector_index, column_index) in enumerate(column_indices), (scalar_level_index, (scalar_index, level_index)) in - scalar_level_indices + vector_index_to_field_vector_index_map - value = unrolled_applyat(scalar_index, scalar_names) do name + number = unrolled_applyat(scalar_index, scalar_names) do name field = MatrixFields.get_field(R, name) @inbounds point(field, level_index, column_index...)[] end @inbounds column_lu_vectors[scalar_level_index, vector_index] = - value + number end end @@ -216,21 +212,53 @@ function invert_jacobian!(::AutoDenseJacobian, cache, ΔY, R) # Copy all scalar values from column_lu_vectors into ΔY. ClimaComms.@threaded device begin - # On multithreaded devices, assign one thread to each index into ΔY. + # On multithreaded devices, use one thread for each number. 
for (vector_index, column_index) in enumerate(column_indices), (scalar_level_index, (scalar_index, level_index)) in - scalar_level_indices + vector_index_to_field_vector_index_map - @inbounds value = + @inbounds number = column_lu_vectors[scalar_level_index, vector_index] unrolled_applyat(scalar_index, scalar_names) do name field = MatrixFields.get_field(ΔY, name) - @inbounds point(field, level_index, column_index...)[] = value + @inbounds point(field, level_index, column_index...)[] = number end end end end +function first_column_block_arrays(alg::AutoDenseJacobian, Y, p, dtγ, t) + scalar_names = scalar_field_names(Y) + field_vector_indices = field_vector_index_iterator(Y) + column_Y = first_column_view(Y) + column_p = first_column_view(p) + column_cache = jacobian_cache(alg, column_Y, p.atmos) + + update_column_matrices!(alg, column_cache, column_Y, column_p, t) + column_∂R_∂Y = dtγ .* column_cache.column_matrices .- column_cache.I_matrix + + block_arrays = Dict() + for block_key in Iterators.product(scalar_names, scalar_names) + block_jacobian_row_index_to_Yₜ_index_map = + Iterators.filter(enumerate(field_vector_indices)) do index_pair + (_, (scalar_index, _)) = index_pair + scalar_names[scalar_index] == block_key[1] + end + block_jacobian_column_index_to_Y_index_map = + Iterators.filter(enumerate(field_vector_indices)) do index_pair + (_, (scalar_index, _)) = index_pair + scalar_names[scalar_index] == block_key[2] + end + block_view_indices = ( + map(first, block_jacobian_row_index_to_Yₜ_index_map), + map(first, block_jacobian_column_index_to_Y_index_map), + ) + block_arrays[block_key] = + Array(view(column_∂R_∂Y, block_view_indices..., 1)) + end + return block_arrays +end + """ parallel_lu_factorize!(device, matrices, ::Val{N}) diff --git a/src/prognostic_equations/implicit/auto_sparse_jacobian.jl b/src/prognostic_equations/implicit/auto_sparse_jacobian.jl new file mode 100644 index 0000000000..08f24d9c07 --- /dev/null +++ b/src/prognostic_equations/implicit/auto_sparse_jacobian.jl @@ -0,0 +1,435 @@ +import SparseMatrixColorings + +""" + AutoSparseJacobian(sparse_jacobian_alg, [max_padding_bands_per_block]) + +A [`JacobianAlgorithm`](@ref) that computes the Jacobian using forward-mode +automatic differentiation, assuming that the Jacobian's sparsity structure is +given by `sparse_jacobian_alg`. Only entries that should be nonzero according to +the sparsity structure are updated, but any other entries that are nonzero can +introduce errors to the updated entries. + +TODO: Add a short explanation of how this algorithm works. +""" +struct AutoSparseJacobian{A <: SparseJacobian, M} <: SparseJacobian + sparse_jacobian_alg::A + max_padding_bands_per_block::M +end +AutoSparseJacobian(sparse_jacobian_alg) = + AutoSparseJacobian(sparse_jacobian_alg, nothing) + +function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true) + (; sparse_jacobian_alg, max_padding_bands_per_block) = alg + + FT = eltype(Y) + DA = ClimaComms.array_type(Y) + device = ClimaComms.device(Y.c) + + column_indices = column_index_iterator(Y) # iterator of (i, j, h) + field_vector_indices = field_vector_index_iterator(Y) # iterator of (f, v) + scalar_names = scalar_field_names(Y) # iterator of names corresponding to f + + precomputed = implicit_precomputed_quantities(Y, atmos) + scratch = implicit_temporary_quantities(Y, atmos) + + # Allocate ∂R/∂Y and its corresponding linear solver. + # TODO: Add FieldNameTree(Y) to the matrix in FieldMatrixWithSolver. 
The + # tree is needed to evaluate scalar_tendency_matrix[autodiff_matrix_keys]. + # (; matrix) = jacobian_cache(sparse_jacobian_alg, Y, atmos) + matrix_without_tree = jacobian_cache(sparse_jacobian_alg, Y, atmos).matrix + tree = MatrixFields.FieldNameTree(Y) + matrix = MatrixFields.FieldMatrixWithSolver( + MatrixFields.replace_name_tree(matrix_without_tree.matrix, tree), + matrix_without_tree.solver, + ) + + # Allocate ∂Yₜ/∂Y and create a view of the scalar components of its blocks. + tendency_matrix = matrix .+ one(matrix) + scalar_tendency_matrix = MatrixFields.scalar_field_matrix(tendency_matrix) + + # Find scalar keys that correspond to the scalar components of the blocks. + # When we approximate a tensor derivative as a scalar multiple of the + # identity tensor, we compute the scalar quantity using the top-left + # component of the tensor. For example, the derivative of Yₜ.c.uₕ with + # respect to Y.c.uₕ is computed as the derivative of + # Yₜ.c.uₕ.components.data.:1 with respect to Y.c.uₕ.components.data.:1. + scalar_block_keys = + map(keys(scalar_tendency_matrix)) do (block_row_name, block_column_name) + scalar_block_row_name = + block_row_name in scalar_names ? block_row_name : + unrolled_argfirst(scalar_names) do scalar_name + MatrixFields.is_child_name(scalar_name, block_row_name) + end + scalar_block_column_name = + block_column_name in scalar_names ? block_column_name : + unrolled_argfirst(scalar_names) do scalar_name + MatrixFields.is_child_name(scalar_name, block_column_name) + end + (scalar_block_row_name, scalar_block_column_name) + end + + # Find keys of non-constant blocks, which are represented by matrix fields. + non_constant_scalar_block_keys = + unrolled_filter(scalar_block_keys) do scalar_block_key + scalar_tendency_matrix[scalar_block_key] isa Fields.Field + end + + # Create a new view of ∂Yₜ/∂Y that has scalar keys and non-constant blocks. + autodiff_matrix_keys = + MatrixFields.FieldMatrixKeys(non_constant_scalar_block_keys) + autodiff_matrix = scalar_tendency_matrix[autodiff_matrix_keys] + + # Construct a mask for nonzero entries in autodiff_matrix as a dense array, + # and a similar mask with additional padding bands in each block. + # TODO: Improve performance by only adding bands where they are necessary, + # or by rescaling blocks instead of adding new bands. + N = length(Fields.column(Y, 1, 1, 1)) + sparsity_mask = Array{Bool}(undef, N, N) + sparsity_mask .= false + padded_sparsity_mask = copy(sparsity_mask) + for block_key in Iterators.product(scalar_names, scalar_names) + (block_row_name, block_column_name) = block_key + + # Get a view of this block's sparsity masks with its row/column indices. + block_jacobian_row_index_to_Yₜ_index_map = + Iterators.filter(enumerate(field_vector_indices)) do index_pair + (_, (scalar_index, _)) = index_pair + scalar_names[scalar_index] == block_row_name + end + block_jacobian_column_index_to_Y_index_map = + Iterators.filter(enumerate(field_vector_indices)) do index_pair + (_, (scalar_index, _)) = index_pair + scalar_names[scalar_index] == block_column_name + end + block_view_indices = ( + map(first, block_jacobian_row_index_to_Yₜ_index_map), + map(first, block_jacobian_column_index_to_Y_index_map), + ) + block_sparsity_mask = view(sparsity_mask, block_view_indices...) + padded_block_sparsity_mask = + view(padded_sparsity_mask, block_view_indices...) 
+ + # Compute the lower and upper band indices of this block, with empty + # blocks corresponding to index ranges whose length is -1 (centered + # around 0 for square blocks and around ±1/2 for non-square blocks). + (n_rows_in_block, n_columns_in_block) = size(block_sparsity_mask) + if block_key in keys(autodiff_matrix) + (_, _, lower_band, upper_band) = + MatrixFields.band_matrix_info(autodiff_matrix[block_key]) + else + (lower_band, upper_band) = + n_rows_in_block == n_columns_in_block ? (1 / 2, -1 / 2) : + (n_rows_in_block < n_columns_in_block ? (1, 0) : (0, -1)) + end + + # Symmetrically expand the range of band indices, with the number of + # new bands either limited by max_padding_bands_per_block, or hardcoded + # for each block when max_padding_bands_per_block is not specified. + max_padding_bands = if !isnothing(max_padding_bands_per_block) + max_padding_bands_per_block + elseif ( + !(block_key in keys(autodiff_matrix)) && + (block_row_name, block_row_name) in keys(autodiff_matrix) + ) + if block_key in ( + (@name(c.uₕ.components.data.:(1)), @name(c.ρ)), + (@name(c.uₕ.components.data.:(2)), @name(c.ρ)), + ) + # ∂ᶜuₕₜ/∂ᶜρ and ∂ᶜuₕₜ/∂ᶜuₕ can have similar unnormalized entries + 3 + elseif block_key in ( + (@name(c.ρe_tot), @name(c.ρ)), + (@name(c.ρe_tot), @name(c.ρq_liq)), + (@name(c.ρe_tot), @name(c.ρq_ice)), + (@name(c.ρe_tot), @name(c.ρq_rai)), + (@name(c.ρe_tot), @name(c.ρq_sno)), + (@name(c.ρe_tot), @name(c.sgsʲs.:(1).q_tot)), + ) + # ∂ᶜρe_totₜ/∂ᶜχ and ∂ᶜρe_totₜ/∂ᶜρe_tot can have similar + # unnormalized entries for several different variables ᶜχ + 3 + elseif ( + block_row_name == @name(f.sgsʲs.:(1).u₃.components.data.:(1)) && + block_column_name in ( + @name(c.uₕ.components.data.:(1)), + @name(c.uₕ.components.data.:(2)), + ) + ) + # ∂ᶠu₃ʲₜ/∂ᶜuₕ and ∂ᶠu₃ʲₜ/∂ᶠu₃ʲ can have similar unnormalized + # entries (and ∂ᶠu₃ʲₜ/∂ᶜmseʲ can also have similar entries) + 2 + else + 0 + end + else + 0 + end + padded_lower_band = ceil(Int, lower_band - max_padding_bands / 2) + padded_upper_band = floor(Int, upper_band + max_padding_bands / 2) + + if verbose + n_padding_bands = + length(padded_lower_band:padded_upper_band) - + length(lower_band:upper_band) + n_padding_bands > 0 && + @info "Adding $n_padding_bands padding bands for $block_key" + end + + # Update the sparsity mask entries corresponding to bands in this block. + for band in padded_lower_band:padded_upper_band + is_not_padding_band = band in lower_band:upper_band + level_index_min = band < 0 ? 1 - band : 1 + level_index_max = + band < n_columns_in_block - n_rows_in_block ? n_rows_in_block : + n_columns_in_block - band + for level_index in level_index_min:level_index_max + block_mask_index = (level_index, level_index + band) + block_sparsity_mask[block_mask_index...] = is_not_padding_band + padded_block_sparsity_mask[block_mask_index...] = true + end + end + end + + # Find a coloring that minimizes the number of required colors. 
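+    # Two Jacobian columns may share a color (and therefore a single ε
+    # component) only if they have no nonzero entries in a common row, so each
+    # entry of the compressed Jacobian can be attributed to exactly one column.
+    # Every column ordering provided by SparseMatrixColorings is tried below,
+    # and the greedy coloring that uses the fewest colors is kept.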
+ jacobian_column_colorings = map(SparseMatrixColorings.all_orders()) do order + sparse_matrix = SparseMatrixColorings.sparse(padded_sparsity_mask) + problem = SparseMatrixColorings.ColoringProblem() + greedy_algorithm = + SparseMatrixColorings.GreedyColoringAlgorithm(order) + SparseMatrixColorings.coloring(sparse_matrix, problem, greedy_algorithm) + end + best_jacobian_column_coloring = + argmin(SparseMatrixColorings.ncolors, jacobian_column_colorings) + n_colors = SparseMatrixColorings.ncolors(best_jacobian_column_coloring) + + # When running on GPU devices, divide n_colors into partitions that are each + # guaranteed to fit in 90% of the memory that is currently free. + n_partitions = if device isa ClimaComms.AbstractCPUDevice + 1 + else + free_memory = ClimaComms.free_memory(device) + max_memory = free_memory * 9 ÷ 10 + memory_for_I_matrix = n_colors * parent_memory(Y) + memory_per_ε = + (parent_memory(precomputed) + parent_memory(scratch)) + + 2 * parent_memory(Y) + # Find the smallest possible integer n_partitions and some other integer + # n_εs such that n_partitions * n_εs >= n_colors and + # (n_εs + 1) * memory_per_ε + memory_for_I_matrix <= max_memory, where + # (n_εs + 1) * memory_per_ε is the memory required to store + # precomputed_dual, scratch_dual, Y_dual, and Yₜ_dual, and where + # memory_for_I_matrix is an approximation of the memory required to + # store I_matrix_partitions. The actual memory_for_I_matrix is given by + # (n_colors + n_partitions) * parent_memory(Y), but we can ignore the + # value of n_partitions if it is negligible compared to n_colors. When + # max_memory is too small to fit any εs, try using one ε per partition. + # TODO: Replace the fields in I_matrix_partitions with column fields, + # making memory_for_I_matrix negligible compared to max_memory. + n_εs_max = (max_memory - memory_for_I_matrix) ÷ memory_per_ε - 1 + cld(n_colors, max(n_εs_max, 1)) + end + n_εs = cld(n_colors, n_partitions) + + if verbose + if n_partitions == 1 + @info "Updating Jacobian using $n_εs ε components per dual number" + else + @info "Updating Jacobian using $n_partitions partitions of \ + $n_colors colors, with $n_εs ε components per dual number" + end + end + + # FieldVectors and cached fields with dual numbers instead of real numbers, + # with dual numbers using the tag "Jacobian" for specialized dispatch + # TODO: Refactor FieldVector broadcasting so that performance does not + # deteriorate if we only store one column of each partition_εs. 
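+    # Illustration with hypothetical values: if n_colors = 38 and there is
+    # room for n_εs_max = 16 ε components, the GPU branch above gives
+    # n_partitions = cld(38, 16) = 3 and n_εs = cld(38, 3) = 13, so the dual
+    # numbers allocated below carry 13 ε components each, and three implicit
+    # tendency evaluations cover all 38 colors.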
+ FT_dual = ForwardDiff.Dual{Jacobian, FT, n_εs} + precomputed_dual = replace_parent_eltype(precomputed, FT_dual) + scratch_dual = replace_parent_eltype(scratch, FT_dual) + Y_dual = replace_parent_eltype(Y, FT_dual) + Yₜ_dual = similar(Y_dual) + I_matrix_partitions = ntuple(_ -> similar(Y_dual), n_partitions) + + # iterator of colors for each of the values in Y that require autodiff, and + # 0 for values that do not require it (by not initializing dual number + # components when they are unneeded, we avoid some of the errors introduced + # by our sparsity approximation) + jacobian_column_colors = + SparseMatrixColorings.column_colors(best_jacobian_column_coloring) + for Y_index_and_sparsity_mask in enumerate(eachcol(sparsity_mask)) + (Y_index, jacobian_column_sparsity_mask) = Y_index_and_sparsity_mask + if !any(jacobian_column_sparsity_mask) + jacobian_column_colors[Y_index] = 0 + end + end + + # iterator of pairs ((f, v), c), where the color c identifies a component + # of the dual number in row (f, v) of Y_dual that corresponds to the + # diagonal entry in the same row of the matrix ∂Y/∂Y (or 0 if the + # corresponding value in Y does not require autodiff) + Y_index_to_diagonal_color_map = + zip(field_vector_indices, jacobian_column_colors) + + # Set the dual numbers in each FieldVector partition_εs so that the ε + # components correspond to partitions of the N × N identity matrix ∂Y/∂Y. + # Specifically, every column of partition_εs is a vector of N dual numbers, + # each of which is stored as a combination of a value and n_εs partial + # derivatives. The ε components can be interpreted as representing N × n_εs + # slices of a sparse N × n_colors representation of ∂Y/∂Y. Convert n_εs to + # a Val and Y_index_to_diagonal_color_map to a DA for GPU compatibility, and + # drop spatial information from every Field to ensure that this kernel stays + # below the GPU parameter memory limit. + n_εs_val = Val(n_εs) + I_matrix_partitions_data = unrolled_map(I_matrix_partitions) do partition_εs + unrolled_map(Fields.field_values, Fields._values(partition_εs)) + end + ClimaComms.@threaded device begin + # On multithreaded devices, use one thread for each dual number. + for (partition_index, partition_εs_data) in + enumerate(I_matrix_partitions_data), + column_index in column_indices, + index_pair in DA(collect(Y_index_to_diagonal_color_map)) + + ((scalar_index, level_index), diagonal_entry_color) = index_pair + ε_offset = (partition_index - 1) * n_εs + diagonal_entry_ε_index = + ε_offset < diagonal_entry_color <= ε_offset + n_εs ? 
+ diagonal_entry_color - ε_offset : 0 + ε_coefficients = ntuple(==(diagonal_entry_ε_index), n_εs_val) + unrolled_applyat(scalar_index, scalar_names) do name + field = MatrixFields.get_field(partition_εs_data, name) + @inbounds point(field, level_index, column_index...)[] = + ForwardDiff.Dual{Jacobian}(0, ε_coefficients) + end + end + end + + # number of colors needed to represent a band matrix row in any block + colors_per_band_matrix_row = + maximum(values(autodiff_matrix)) do matrix_field + (_, _, lower_band, upper_band) = + MatrixFields.band_matrix_info(matrix_field) + upper_band - lower_band + 1 + end + + # iterator of pairs ((b, v), (f, cs)), where (b, v) is the index of a band + # matrix row in autodiff_matrix, (f, v) is the index of a dual number in + # Yₜ_dual, and cs is a tuple that contains the colors of the band matrix row + # entries, which is padded to have a constant size for GPU compatibility + # TODO: Use an iterator of pairs ((f, v), ((b1, cs1), (b2, cs2), ...)), so + # that each pair corresponds to a dual number instead of a band matrix row. + band_matrix_row_index_to_colors_map = Iterators.flatmap( + enumerate(pairs(autodiff_matrix)), + ) do (block_index, (block_key, matrix_field)) + (block_row_name, block_column_name) = block_key + (n_rows_in_block, n_columns_in_block, lower_band, upper_band) = + MatrixFields.band_matrix_info(matrix_field) + + block_Yₜ_indices = + Iterators.filter(field_vector_indices) do (scalar_index, _) + scalar_names[scalar_index] == block_row_name + end + block_Y_index_to_color_map = + Iterators.filter(Y_index_to_diagonal_color_map) do index_pair + ((scalar_index, _), _) = index_pair + scalar_names[scalar_index] == block_column_name + end + block_colors = last.(block_Y_index_to_color_map) + + map(block_Yₜ_indices) do (scalar_index, level_index) + entry_colors = ntuple(colors_per_band_matrix_row) do band_index + band = lower_band + band_index - 1 + level_index_min = band < 0 ? 1 - band : 1 + level_index_max = + band < n_columns_in_block - n_rows_in_block ? + n_rows_in_block : n_columns_in_block - band + is_color_at_index = + band <= upper_band && + level_index_min <= level_index <= level_index_max + is_color_at_index ? block_colors[level_index + band] : 0 + end + ((block_index, level_index), (scalar_index, entry_colors)) + end + end + + # Convert the lazy iterator to a DA for GPU compatibility. + band_matrix_row_index_to_colors_map = + DA(collect(band_matrix_row_index_to_colors_map)) + + return (; + matrix, + tendency_matrix, + autodiff_matrix, + precomputed_dual, + scratch_dual, + Y_dual, + Yₜ_dual, + I_matrix_partitions, + band_matrix_row_index_to_colors_map, + ) +end + +function update_jacobian!(::AutoSparseJacobian, cache, Y, p, dtγ, t) + (; matrix, tendency_matrix, autodiff_matrix) = cache + (; precomputed_dual, scratch_dual, Y_dual, Yₜ_dual) = cache + (; I_matrix_partitions, band_matrix_row_index_to_colors_map) = cache + + device = ClimaComms.device(Y.c) + column_indices = column_index_iterator(Y) + scalar_names = scalar_field_names(Y) + p_dual = append_to_atmos_cache(p, precomputed_dual, scratch_dual) + + for (partition_index, partition_εs) in enumerate(I_matrix_partitions) + # Set the εs in Y_dual to represent a partition of the identity matrix. + Y_dual .= Y .+ partition_εs + + # Compute ∂p/∂Y * partition_εs and ∂Yₜ/∂Y * partition_εs. + set_implicit_precomputed_quantities!(Y_dual, p_dual, t) + implicit_tendency!(Yₜ_dual, Y_dual, p_dual, t) + + # Move the entries of ∂Yₜ/∂Y * partition_εs from Yₜ_dual into the + # blocks of autodiff_matrix. 
Drop spatial information from every Field + # to ensure that this kernel stays below the GPU parameter memory limit. + Yₜ_dual_data = + unrolled_map(Fields.field_values, Fields._values(Yₜ_dual)) + matrix_fields_data = + unrolled_map(Fields.field_values, values(autodiff_matrix)) + ClimaComms.@threaded device begin + # On multithreaded devices, use one thread for each band matrix row. + # TODO: Modify the map and use one thread for each dual number. + for column_index in column_indices, + index_pair in band_matrix_row_index_to_colors_map + + ((block_index, level_index), (scalar_index, entry_colors)) = + index_pair + dual_number = + unrolled_applyat(scalar_index, scalar_names) do name + data = MatrixFields.get_field(Yₜ_dual_data, name) + @inbounds point(data, level_index, column_index...)[] + end + ε_coefficients = ForwardDiff.partials(dual_number) + n_εs = length(ε_coefficients) + ε_offset = (partition_index - 1) * n_εs + unrolled_applyat(block_index, matrix_fields_data) do block_data + @inbounds entries_data = + point(block_data, level_index, column_index...).entries + entries_data[] = + map(entry_colors, entries_data[]) do entry_color, entry + # If the entry has a color in the current partition, + # set the entry to the ε coefficient for that color. + # Otherwise, keep the value from the block's data. + ε_offset < entry_color <= ε_offset + n_εs ? + (@inbounds ε_coefficients[entry_color - ε_offset]) : + entry + end # TODO: Why does unrolled_map break GPU compilation? + end + end + end + end + + # Update the matrix for ∂R/∂Y using the new values of ∂Yₜ/∂Y. + matrix .= dtγ .* tendency_matrix .- one(matrix) +end + +invert_jacobian!(alg::AutoSparseJacobian, cache, ΔY, R) = + invert_jacobian!(alg.sparse_jacobian_alg, cache, ΔY, R) diff --git a/src/prognostic_equations/implicit/autodiff_utils.jl b/src/prognostic_equations/implicit/autodiff_utils.jl index 4c0c5da81d..17300215ea 100644 --- a/src/prognostic_equations/implicit/autodiff_utils.jl +++ b/src/prognostic_equations/implicit/autodiff_utils.jl @@ -1,17 +1,67 @@ -# Replaces all numbers in Fields and DataLayouts with values of type T. -replace_parent_type(x::Fields.FieldVector, ::Type{T}) where {T} = similar(x, T) -replace_parent_type(x::Fields.Field, ::Type{T}) where {T} = - Fields.Field(replace_parent_type(Fields.field_values(x), T), axes(x)) -replace_parent_type(x::DataLayouts.AbstractData, ::Type{T}) where {T} = +# Tuple that contains the components of a generic object. +components(x) = ntuple(i -> getfield(x, i), Val(fieldcount(typeof(x)))) + +# Analogue of map that applies a function to each component of a generic object. +# Aside from some special cases, this assumes that the object's type has a +# default constructor (one that does not need any type parameters). +map_components(f::F, x::Union{Tuple, NamedTuple}) where {F} = unrolled_map(f, x) +map_components(f::F, x) where {F} = + typeof(x).name.wrapper(unrolled_map(f, components(x))...) + +# Determines whether there is at least one DataLayout within an object. +contains_data_layout(::DataLayouts.AbstractData) = true +contains_data_layout(x) = unrolled_any(contains_data_layout, components(x)) + +# Adds up the memory required to allocate all DataLayouts within an object. +parent_memory(x::DataLayouts.AbstractData) = sizeof(parent(x)) +parent_memory(x) = + contains_data_layout(x) ? unrolled_sum(parent_memory, components(x)) : 0 + +# Extracts a view of the first column from every DataLayout within an object, +# except for DataLayouts in DSSBuffers, which are not needed in a single column. 
+first_column_view(x::DataLayouts.AbstractData) = Fields.column(x, 1, 1, 1) +first_column_view(x::Fields.Field) = Fields.column(x, 1, 1, 1) +first_column_view(x::Fields.FieldVector) = Fields.column(x, 1, 1, 1) +first_column_view(x::Topologies.DSSBuffer) = nothing +first_column_view(x) = + contains_data_layout(x) ? map_components(first_column_view, x) : x + +# Makes T the parent array element type of every DataLayout within an object. +replace_parent_eltype(x::DataLayouts.AbstractData, ::Type{T}) where {T} = DataLayouts.replace_basetype(x, T) -replace_parent_type(x::Union{Tuple, NamedTuple}, ::Type{T}) where {T} = - unrolled_map(Base.Fix2(replace_parent_type, T), x) +replace_parent_eltype(x::Fields.Field, ::Type{T}) where {T} = + Fields.Field(replace_parent_eltype(Fields.field_values(x), T), axes(x)) +replace_parent_eltype(x::Fields.FieldVector, ::Type{T}) where {T} = + similar(x, T) +replace_parent_eltype(x, ::Type{T}) where {T} = + contains_data_layout(x) ? + map_components(Base.Fix2(replace_parent_eltype, T), x) : x + +# Appends values to the precomputed and scratch components of an AtmosCache. +append_to_atmos_cache(atmos_cache, precomputed, scratch) = AtmosCache( + unrolled_map(fieldnames(typeof(atmos_cache))) do cache_component_name + if cache_component_name == :precomputed + (; atmos_cache.precomputed..., precomputed...) + elseif cache_component_name == :scratch + (; atmos_cache.scratch..., scratch...) + else + getfield(atmos_cache, cache_component_name) + end + end..., +) + +# The horizontal SpectralElementSpace of the fields in a FieldVector. +function horizontal_space(field_vector) + all_values = Fields._values(field_vector) + space = axes(unrolled_argfirst(value -> value isa Fields.Field, all_values)) + return space isa Spaces.FiniteDifferenceSpace ? nothing : + Spaces.horizontal_space(space) +end # An iterator over the column indices of all fields in a FieldVector. function column_index_iterator(field_vector) - all_values = Fields._values(field_vector) - first_field = unrolled_argfirst(value -> value isa Fields.Field, all_values) - horz_space = Spaces.horizontal_space(axes(first_field)) + horz_space = horizontal_space(field_vector) + isnothing(horz_space) && return ((1, 1, 1),) qs = 1:Quadratures.degrees_of_freedom(Spaces.quadrature_style(horz_space)) hs = Spaces.eachslabindex(horz_space) return horz_space isa Spaces.SpectralElementSpace1D ? @@ -24,31 +74,33 @@ scalar_field_names(field_vector) = x isa Fields.Field && eltype(x) == eltype(field_vector) end -# An iterator with pairs of the form -# (scalar_level_index, (scalar_index, level_index)), where scalar_index is an -# index into scalar_field_names(field_vector), level_index is a vertical index -# into the scalar field with this name, and scalar_level_index is a unique -# linear index for each combination of scalar field name and vertical index. +# An iterator with tuples of the form (scalar_index, level_index), where +# scalar_index is an index into scalar_field_names(field_vector) and level_index +# is a vertical index into the scalar field with this name.
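+# For example, if the first scalar field in field_vector has 10 vertical
+# levels, the iterator begins with (1, 1), (1, 2), …, (1, 10), (2, 1), …;
+# fields without a vertical dimension contribute a single pair with
+# level_index = 1.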
+function field_vector_index_iterator(field_vector) scalar_names = scalar_field_names(field_vector) scalar_index_and_level_pairs = unrolled_map(enumerate(scalar_names)) do (scalar_index, name) field = MatrixFields.get_field(field_vector, name) - is_one_level = field isa Fields.SpectralElementField + is_one_level = + field isa Union{Fields.PointField, Fields.SpectralElementField} Iterators.map( level_index -> (scalar_index, level_index), Base.OneTo(is_one_level ? 1 : Spaces.nlevels(axes(field))), ) end - return enumerate(Iterators.flatten(scalar_index_and_level_pairs)) + return Iterators.flatten(scalar_index_and_level_pairs) end -# A view of the data for one point in a field, which can be used like a Ref. -Base.@propagate_inbounds function point(field, level_index, column_index...) - column_field = Fields.column(field, column_index...) - column_field isa Fields.PointField && return column_field - level_index_offset = Operators.left_idx(axes(column_field)) - 1 - return Fields.level(column_field, level_index + level_index_offset) +# A view of one point in a Field or DataLayout, which can be used like a Ref. +Base.@propagate_inbounds function point(value, level_index, column_index...) + column_value = Fields.column(value, column_index...) + column_value isa Union{Fields.PointField, DataLayouts.DataF} && + return column_value + level_index_offset = + column_value isa Fields.Field ? + Operators.left_idx(axes(column_value)) - 1 : 0 + return Fields.level(column_value, level_index + level_index_offset) end # TODO: This needs to be moved into ClimaCore to avoid breaking precompilation. @@ -90,3 +142,33 @@ end deriv = iszero(x) ? zero(x) : inv(2 * val) return ForwardDiff.dual_definition_retval(tag, val, deriv, partials) end + +# Ignore all derivative information when comparing Duals to other Numbers. This +# ensures that conditional statements are always equivalent for Duals and Reals. +for func in (:iszero,) + @eval @inline Base.$func(arg::ForwardDiff.Dual{Jacobian}) = + $func(ForwardDiff.value(arg)) +end +for func in (:isequal, :isless, :<, :>, :(==), :!=, :<=, :>=) + # These methods handle combinations of Duals without ambiguities. + @eval @inline Base.$func( + arg1::ForwardDiff.Dual{Jacobian}, + arg2::ForwardDiff.Dual{Jacobian}, + ) = $func(ForwardDiff.value(arg1), ForwardDiff.value(arg2)) + @eval @inline Base.$func( + arg1::ForwardDiff.Dual{Tx}, + arg2::ForwardDiff.Dual{Jacobian}, + ) where {Tx} = $func(ForwardDiff.value(arg1), ForwardDiff.value(arg2)) + @eval @inline Base.$func( + arg1::ForwardDiff.Dual{Jacobian}, + arg2::ForwardDiff.Dual{Ty}, + ) where {Ty} = $func(ForwardDiff.value(arg1), ForwardDiff.value(arg2)) + + # These methods handle combinations of Reals and Duals without ambiguities. + for R in (ForwardDiff.AMBIGUOUS_TYPES..., AbstractIrrational) + @eval @inline Base.$func(arg1::ForwardDiff.Dual{Jacobian}, arg2::$R) = + $func(ForwardDiff.value(arg1), arg2) + @eval @inline Base.$func(arg1::$R, arg2::ForwardDiff.Dual{Jacobian}) = + $func(arg1, ForwardDiff.value(arg2)) + end +end diff --git a/src/prognostic_equations/implicit/jacobian.jl b/src/prognostic_equations/implicit/jacobian.jl index fa4680dcb7..41fe2c956e 100644 --- a/src/prognostic_equations/implicit/jacobian.jl +++ b/src/prognostic_equations/implicit/jacobian.jl @@ -4,14 +4,21 @@ A description of how to compute the matrix ``∂R/∂Y``, where ``R(Y)`` denotes the residual of an implicit step with the state ``Y``. 
Concrete implementations of this abstract type should define 3 methods: - - `jacobian_cache(alg::JacobianAlgorithm, Y, atmos)` + - `jacobian_cache(alg::JacobianAlgorithm, Y, atmos; [verbose])` - `update_jacobian!(alg::JacobianAlgorithm, cache, Y, p, dtγ, t)` - `invert_jacobian!(alg::JacobianAlgorithm, cache, ΔY, R)` +To facilitate debugging, concrete implementations should also define + - `first_column_block_arrays(alg::JacobianAlgorithm, Y, p, dtγ, t)` See [Implicit Solver](@ref) for additional background information. """ abstract type JacobianAlgorithm end +abstract type SparseJacobian <: JacobianAlgorithm end + +# Ignore the verbose flag in jacobian_cache when it is not needed. +jacobian_cache(alg, Y, atmos; verbose) = jacobian_cache(alg, Y, atmos) + """ Jacobian(alg, Y, atmos) @@ -59,3 +66,45 @@ function LinearAlgebra.ldiv!( LinearAlgebra.ldiv!(ΔY_krylov, jacobian, R_krylov) ΔY .= ΔY_krylov end + +# This defines a standardized format for comparing different types of Jacobians. +function first_column_block_arrays(alg::SparseJacobian, Y, p, dtγ, t) + scalar_names = scalar_field_names(Y) + column_Y = first_column_view(Y) + column_p = first_column_view(p) + column_cache = jacobian_cache(alg, column_Y, p.atmos; verbose = false) + + update_jacobian!(alg, column_cache, column_Y, column_p, dtγ, t) + column_∂R_∂Y = column_cache.matrix + + block_arrays = Dict() + for block_key in Iterators.product(scalar_names, scalar_names) + block_key in keys(column_∂R_∂Y) || continue + block_value = Base.materialize(column_∂R_∂Y[block_key]) + block_arrays[block_key] = + block_value isa Fields.Field ? + MatrixFields.column_field2array(block_value) : block_value + end + return block_arrays +end +function first_column_rescaling_arrays(Y, p, t) + scalar_names = scalar_field_names(Y) + column_Y = first_column_view(Y) + column_p = first_column_view(p) + column_Yₜ = similar(column_Y) + + implicit_tendency!(column_Yₜ, column_Y, column_p, t) + + rescaling_arrays = Dict() + for block_key in Iterators.product(scalar_names, scalar_names) + (block_row_name, block_column_name) = block_key + block_row_Yₜ_values = + parent(MatrixFields.get_field(column_Yₜ, block_row_name)) + block_column_Yₜ_values = + parent(MatrixFields.get_field(column_Yₜ, block_column_name)) + safe_inverse = x -> iszero(x) || issubnormal(x) ? zero(x) : inv(x) + rescaling_arrays[block_key] = + Array(safe_inverse.(block_row_Yₜ_values) .* block_column_Yₜ_values') + end + return rescaling_arrays +end diff --git a/src/prognostic_equations/implicit/manual_sparse_jacobian.jl b/src/prognostic_equations/implicit/manual_sparse_jacobian.jl index 872a7185ea..01fed10db4 100644 --- a/src/prognostic_equations/implicit/manual_sparse_jacobian.jl +++ b/src/prognostic_equations/implicit/manual_sparse_jacobian.jl @@ -48,7 +48,7 @@ solver. 
Certain groups of derivatives can be toggled on or off by setting their - `approximate_solve_iters::Int`: number of iterations to take for the approximate linear solve required when the `diffusion_flag` is `UseDerivative` """ -struct ManualSparseJacobian{F1, F2, F3, F4, F5, F6} <: JacobianAlgorithm +struct ManualSparseJacobian{F1, F2, F3, F4, F5, F6} <: SparseJacobian topography_flag::F1 diffusion_flag::F2 sgs_advection_flag::F3 diff --git a/src/prognostic_equations/vertical_diffusion_boundary_layer.jl b/src/prognostic_equations/vertical_diffusion_boundary_layer.jl index bca3cd65c2..4bb7f743e9 100644 --- a/src/prognostic_equations/vertical_diffusion_boundary_layer.jl +++ b/src/prognostic_equations/vertical_diffusion_boundary_layer.jl @@ -102,16 +102,16 @@ function vertical_diffusion_boundary_layer_tendency!( ᶜρχₜ_diffusion = p.scratch.ᶜtemp_scalar ᶜK_h_scaled = p.scratch.ᶜtemp_scalar_2 - ᶜdivᵥ_ρχ = Operators.DivergenceF2C( - top = Operators.SetValue(C3(0)), - bottom = Operators.SetValue(C3(0)), - ) foreach_gs_tracer(Yₜ, Y) do ᶜρχₜ, ᶜρχ, ρχ_name if ρχ_name in (@name(ρq_rai), @name(ρq_sno), @name(ρn_rai)) @. ᶜK_h_scaled = α_vert_diff_tracer * ᶜK_h else @. ᶜK_h_scaled = ᶜK_h end + ᶜdivᵥ_ρχ = Operators.DivergenceF2C( + top = Operators.SetValue(C3(0)), + bottom = Operators.SetValue(C3(0)), + ) @. ᶜρχₜ_diffusion = ᶜdivᵥ_ρχ( -( ᶠinterp(Y.c.ρ) * diff --git a/src/solver/type_getters.jl b/src/solver/type_getters.jl index c4347fdbfe..1cd60d5211 100644 --- a/src/solver/type_getters.jl +++ b/src/solver/type_getters.jl @@ -451,24 +451,28 @@ function get_surface_setup(parsed_args) return getproperty(SurfaceConditions, Symbol(parsed_args["surface_setup"]))() end -get_jacobian(ode_algo, Y, atmos, parsed_args) = - if ode_algo isa Union{CTS.IMEXAlgorithm, CTS.RosenbrockAlgorithm} - jacobian_algorithm = - parsed_args["use_dense_jacobian"] ? AutoDenseJacobian() : - ManualSparseJacobian( - DerivativeFlag(has_topography(axes(Y.c))), - DerivativeFlag(atmos.diff_mode), - DerivativeFlag(atmos.sgs_adv_mode), - DerivativeFlag(atmos.sgs_entr_detr_mode), - DerivativeFlag(atmos.sgs_mf_mode), - DerivativeFlag(atmos.sgs_nh_pressure_mode), - parsed_args["approximate_linear_solve_iters"], - ) - @info "Jacobian algorithm: $(summary_string(jacobian_algorithm))" - Jacobian(jacobian_algorithm, Y, atmos) +function get_jacobian(ode_algo, Y, atmos, parsed_args) + ode_algo isa Union{CTS.IMEXAlgorithm, CTS.RosenbrockAlgorithm} || + return nothing + jacobian_algorithm = if parsed_args["use_dense_jacobian"] + AutoDenseJacobian() else - nothing + manual_jacobian_algorithm = ManualSparseJacobian( + DerivativeFlag(has_topography(axes(Y.c))), + DerivativeFlag(atmos.diff_mode), + DerivativeFlag(atmos.sgs_adv_mode), + DerivativeFlag(atmos.sgs_entr_detr_mode), + DerivativeFlag(atmos.sgs_mf_mode), + DerivativeFlag(atmos.sgs_nh_pressure_mode), + parsed_args["approximate_linear_solve_iters"], + ) + parsed_args["use_auto_jacobian"] ? 
+ AutoSparseJacobian(manual_jacobian_algorithm) : + manual_jacobian_algorithm end + @info "Jacobian algorithm: $(summary_string(jacobian_algorithm))" + return Jacobian(jacobian_algorithm, Y, atmos) +end #= ode_configuration(Y, parsed_args) diff --git a/test/dependencies.jl b/test/dependencies.jl index 45330c1ca0..3eb361b841 100644 --- a/test/dependencies.jl +++ b/test/dependencies.jl @@ -60,6 +60,8 @@ known_dependencies = Set([ # Random is used to reset seed for random number generator used for cloudy RRTMGP runs to enable bit-wise reproducibility for tests "Random", "SciMLBase", + # SparseMatrixColorings is used to generate the column coloring of the AutoSparseJacobian + "SparseMatrixColorings", "StaticArrays", # Statistics is used to call 'mean' on ClimaCore Fields "Statistics",