diff --git a/.buildkite/Manifest-v1.11.toml b/.buildkite/Manifest-v1.11.toml index 891d84bf96..196882e50b 100644 --- a/.buildkite/Manifest-v1.11.toml +++ b/.buildkite/Manifest-v1.11.toml @@ -363,14 +363,16 @@ version = "0.5.18" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" [[deps.ClimaAtmos]] -deps = ["Adapt", "ArgParse", "Artifacts", "AtmosphericProfilesLibrary", "ClimaComms", "ClimaCore", "ClimaDiagnostics", "ClimaInterpolations", "ClimaParams", "ClimaTimeSteppers", "ClimaUtilities", "CloudMicrophysics", "Dates", "ForwardDiff", "Insolation", "Interpolations", "LazyArtifacts", "LazyBroadcast", "LinearAlgebra", "Logging", "NCDatasets", "NVTX", "NullBroadcasts", "RRTMGP", "Random", "SciMLBase", "StaticArrays", "Statistics", "SurfaceFluxes", "Thermodynamics", "UnrolledUtilities", "YAML"] +deps = ["Adapt", "ArgParse", "Artifacts", "AtmosphericProfilesLibrary", "ClimaComms", "ClimaCore", "ClimaDiagnostics", "ClimaInterpolations", "ClimaParams", "ClimaTimeSteppers", "ClimaUtilities", "CloudMicrophysics", "Dates", "ForwardDiff", "Insolation", "Interpolations", "LazyArtifacts", "LazyBroadcast", "LinearAlgebra", "Logging", "NCDatasets", "NVTX", "NullBroadcasts", "RRTMGP", "Random", "SciMLBase", "SparseMatrixColorings", "StaticArrays", "Statistics", "SurfaceFluxes", "Thermodynamics", "UnrolledUtilities", "YAML"] path = ".." uuid = "b2c96348-7fb7-4fe0-8da9-78d88439e717" version = "0.31.0" [[deps.ClimaComms]] deps = ["Adapt", "Logging", "LoggingExtras"] -git-tree-sha1 = "75b9d1a3b4e3efa2cbbae2eb7b52f14c0b38ccf0" +git-tree-sha1 = "d341c3fc97a98dbecd6b635f34638b2d9771f94e" +repo-rev = "dy/memory_api" +repo-url = "https://github.com/CliMA/ClimaComms.jl.git" uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d" version = "0.6.8" weakdeps = ["CUDA", "MPI"] @@ -381,9 +383,11 @@ weakdeps = ["CUDA", "MPI"] [[deps.ClimaCore]] deps = ["Adapt", "BandedMatrices", "BlockArrays", "ClimaComms", "CubedSphere", "DataStructures", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LazyBroadcast", "LinearAlgebra", "MultiBroadcastFusion", "NVTX", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "StaticArrays", "Statistics", "UnrolledUtilities"] -git-tree-sha1 = "14d3d5810ce1e3c990450a2ce7abc6a1e162855f" +git-tree-sha1 = "a273452127dbb052f2963e3c1095730a996d49a6" +repo-rev = "main" +repo-url = "https://github.com/CliMA/ClimaCore.jl.git" uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.14.35" +version = "0.14.36" weakdeps = ["CUDA", "Krylov"] [deps.ClimaCore.extensions] @@ -2318,6 +2322,20 @@ deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" version = "1.11.0" +[[deps.SparseMatrixColorings]] +deps = ["ADTypes", "DocStringExtensions", "LinearAlgebra", "PrecompileTools", "Random", "SparseArrays"] +git-tree-sha1 = "ab958b4fec46d1f1d057bb8e2a99bfdb90744646" +uuid = "0a514795-09f3-496d-8182-132a7b665d35" +version = "0.4.20" + + [deps.SparseMatrixColorings.extensions] + SparseMatrixColoringsCliqueTreesExt = "CliqueTrees" + SparseMatrixColoringsColorsExt = "Colors" + + [deps.SparseMatrixColorings.weakdeps] + CliqueTrees = "60701a23-6482-424a-84db-faee86b9b1f8" + Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" + [[deps.SpecialFunctions]] deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] git-tree-sha1 = "41852b8679f78c8d8961eeadc8f62cef861a52e3" diff --git a/.buildkite/ci_driver.jl b/.buildkite/ci_driver.jl index 3207bcd3a0..af04fe79f4 100644 
--- a/.buildkite/ci_driver.jl +++ b/.buildkite/ci_driver.jl @@ -39,12 +39,152 @@ using Test import Tar import Base.Filesystem: rm import Statistics: mean -import LinearAlgebra: norm_sqr +import LinearAlgebra: norm_sqr, diag, UniformScaling include(joinpath(pkgdir(CA), "post_processing", "ci_plots.jl")) ref_job_id = config.parsed_args["reference_job_id"] reference_job_id = isnothing(ref_job_id) ? simulation.job_id : ref_job_id +if ( + config.parsed_args["debug_approximate_jacobian"] && + !config.parsed_args["use_dense_jacobian"] +) + @info "Debugging Jacobian in first column of final state" + + Y_end = integrator.u + t_end = integrator.t + (; p, dt) = integrator + timestepper_alg = integrator.alg + tableau_coefficients = + timestepper_alg isa CA.CTS.RosenbrockAlgorithm ? + timestepper_alg.tableau.Γ : timestepper_alg.tableau.a_imp + jacobian = + timestepper_alg isa CA.CTS.RosenbrockAlgorithm ? integrator.cache.W : + integrator.cache.newtons_method_cache.j + + FT = eltype(Y_end) + γs = filter(!iszero, CA.LinearAlgebra.diag(tableau_coefficients)) + dtγ = FT(float(dt) * γs[end]) + scalar_names = CA.scalar_field_names(Y_end) + block_keys = Iterators.product(scalar_names, scalar_names) + + exact_jacobian_alg = CA.AutoDenseJacobian() + all_approx_jacobian_algs = + jacobian.alg isa CA.AutoSparseJacobian ? + (; manual = jacobian.alg.sparse_jacobian_alg, auto = jacobian.alg) : + (; manual = jacobian.alg) + exact_blocks = + CA.first_column_block_arrays(exact_jacobian_alg, Y_end, p, dtγ, t_end) + all_approx_blocks = map(all_approx_jacobian_algs) do jacobian_alg + CA.first_column_block_arrays(jacobian_alg, Y_end, p, dtγ, t_end) + end + block_rescalings = CA.first_column_rescaling_arrays(Y_end, p, t_end) + + highlighters = ( + Highlighter((d, i, j) -> d[i, j] < 1e-12; foreground = :dark_gray), + Highlighter((d, i, j) -> 1e-12 <= d[i, j] < 1e-6; foreground = :blue), + Highlighter((d, i, j) -> 1e-6 <= d[i, j] < 1e-3; foreground = :cyan), + Highlighter((d, i, j) -> 1e-3 <= d[i, j] < 1e-1; foreground = :green), + Highlighter((d, i, j) -> 1e-1 <= d[i, j] < 1; foreground = :yellow), + Highlighter((d, i, j) -> d[i, j] == 1; foreground = :light_red), + Highlighter((d, i, j) -> d[i, j] > 1; foreground = :light_magenta), + ) + table_kwargs = (; + columns_width = 5, + crop = :none, + formatters = ft_printf("%1.0e"), + highlighters, + row_labels = collect(scalar_names), + show_header = false, + tf = tf_matrix, + vlines = [1], + ) + exact_table_kwargs = (; table_kwargs..., highlighters = highlighters[1]) + bandwidth_table_kwargs = (; exact_table_kwargs..., formatters = nothing) + + rms(block) = sqrt(mean(abs2.(block))) + bandwidth(block) = + count((1 - size(block, 1)):(size(block, 2) - 1)) do band_index + any(!iszero, diag(block, band_index)) + end + + bandwidth_error_values = map(block_keys) do block_key + approx_blocks = first(all_approx_blocks) + approx_bandwidth = + haskey(approx_blocks, block_key) ? + ( + approx_blocks[block_key] isa UniformScaling ? 1 : + bandwidth(approx_blocks[block_key]) + ) : 0 + max(bandwidth(exact_blocks[block_key]) - approx_bandwidth, 0) + end + @info "exact - approx, number of missing bands per block:" + pretty_table(bandwidth_error_values; bandwidth_table_kwargs...) + + exact_rms_values = map(block_keys) do block_key + rms(exact_blocks[block_key]) + end + normalized_exact_rms_values = map(block_keys) do block_key + rms(exact_blocks[block_key] .* block_rescalings[block_key]) + end + @info "unnormalized exact, RMS per block:" + pretty_table(exact_rms_values; exact_table_kwargs...) 
+ @info "normalized exact, RMS per block [s^-1]:" + pretty_table(normalized_exact_rms_values; exact_table_kwargs...) + + println("<$('='^70)>\n") + if jacobian.alg isa CA.AutoSparseJacobian + normalized_approx_difference_rms_values = map(block_keys) do block_key + (; manual, auto) = all_approx_blocks + rescaling = block_rescalings[block_key] + haskey(manual, block_key) && + !(manual[block_key] isa UniformScaling) ? + rms((manual[block_key] - auto[block_key]) .* rescaling) : FT(0) + end + @info "normalized manual approx - auto approx, RMS per block [s^-1]:" + pretty_table(normalized_approx_difference_rms_values; table_kwargs...) + end + for (approx_name, approx_blocks) in pairs(all_approx_blocks) + normalized_approx_error_rms_values = map(block_keys) do block_key + approx_error = + haskey(approx_blocks, block_key) ? + exact_blocks[block_key] - approx_blocks[block_key] : + exact_blocks[block_key] + rescaling = block_rescalings[block_key] + rms(approx_error .* rescaling) + end + @info "normalized exact - $approx_name approx, RMS per block [s^-1]:" + pretty_table(normalized_approx_error_rms_values; table_kwargs...) + end + println("<$('='^70)>\n") + if jacobian.alg isa CA.AutoSparseJacobian + approx_difference_relative_rms_values = map(block_keys) do block_key + (; manual, auto) = all_approx_blocks + rescaling = block_rescalings[block_key] + approx_difference_rms_value = + haskey(manual, block_key) && + !(manual[block_key] isa UniformScaling) ? + rms((manual[block_key] - auto[block_key])) : FT(0) + approx_difference_rms_value == 0 ? FT(0) : + approx_difference_rms_value / rms(exact_blocks[block_key]) + end + @info "manual approx - auto approx, relative RMS per block [unitless]:" + pretty_table(approx_difference_relative_rms_values; table_kwargs...) + end + for (approx_name, approx_blocks) in pairs(all_approx_blocks) + approx_error_relative_rms_values = map(block_keys) do block_key + approx_error_rms_value = + haskey(approx_blocks, block_key) ? + rms(exact_blocks[block_key] - approx_blocks[block_key]) : + rms(exact_blocks[block_key]) + approx_error_rms_value == 0 ? FT(0) : + approx_error_rms_value / rms(exact_blocks[block_key]) + end + @info "exact - $approx_name approx, relative RMS per block [unitless]:" + pretty_table(approx_error_relative_rms_values; table_kwargs...) + end +end + if sol_res.ret_code == :simulation_crashed error( "The ClimaAtmos simulation has crashed. 
See the stack trace for details.", diff --git a/.buildkite/gpu_pipeline/pipeline.yml b/.buildkite/gpu_pipeline/pipeline.yml index cf04a4db22..81544498be 100644 --- a/.buildkite/gpu_pipeline/pipeline.yml +++ b/.buildkite/gpu_pipeline/pipeline.yml @@ -24,6 +24,8 @@ steps: - julia --project=.buildkite -e 'using Pkg; Pkg.precompile()' - julia --project=.buildkite -e 'using CUDA; CUDA.precompile_runtime()' - julia --project=.buildkite -e 'using Pkg; Pkg.status()' + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))' + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))' agents: slurm_gpus: 1 diff --git a/.buildkite/longruns_gpu/pipeline.yml b/.buildkite/longruns_gpu/pipeline.yml index 1cc951ecf0..60ca8d188e 100644 --- a/.buildkite/longruns_gpu/pipeline.yml +++ b/.buildkite/longruns_gpu/pipeline.yml @@ -25,6 +25,8 @@ steps: - julia --project=.buildkite -e 'using Pkg; Pkg.precompile()' - julia --project=.buildkite -e 'using CUDA; CUDA.precompile_runtime()' - julia --project=.buildkite -e 'using Pkg; Pkg.status()' + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))' + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))' agents: slurm_gpus: 1 diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 274e3767b0..766005dd3b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -28,6 +28,8 @@ steps: - echo "--- Instantiate .buildkite" - "julia --project=.buildkite -e 'using Pkg; Pkg.instantiate(;verbose=true); Pkg.precompile(;strict=true); using CUDA; CUDA.precompile_runtime(); Pkg.status()'" + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaComms", rev="dy/memory_api"))' + - julia --project=.buildkite -e 'using Pkg; Pkg.add(Pkg.PackageSpec(;name="ClimaCore", rev="main"))' agents: slurm_cpus_per_task: 8 @@ -631,7 +633,7 @@ steps: --job_id amip_target_edonly_nonequil artifact_paths: "amip_target_edonly_nonequil/output_active/*" agents: - slurm_mem: 20GB + slurm_mem: 64GB - group: "Diagnostic EDMFX" steps: diff --git a/Project.toml b/Project.toml index bdefca8f3f..bab2077834 100644 --- a/Project.toml +++ b/Project.toml @@ -30,6 +30,7 @@ NullBroadcasts = "0d71be07-595a-4f89-9529-4065a4ab43a6" RRTMGP = "a01a1ee8-cea4-48fc-987c-fc7878d79da1" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" +SparseMatrixColorings = "0a514795-09f3-496d-8182-132a7b665d35" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" SurfaceFluxes = "49b00bb7-8bd4-4f2b-b78c-51cd0450215f" @@ -64,6 +65,7 @@ NullBroadcasts = "0.1" RRTMGP = "0.21.3" Random = "1" SciMLBase = "2.12" +SparseMatrixColorings = "0.4.20" StaticArrays = "1.7" Statistics = "1" SurfaceFluxes = "0.11, 0.12" diff --git a/config/default_configs/default_config.yml b/config/default_configs/default_config.yml index 871b8baeb4..da960c6331 100644 --- a/config/default_configs/default_config.yml +++ b/config/default_configs/default_config.yml @@ -89,6 +89,12 @@ jvp_step_adjustment: use_dense_jacobian: help: "Whether to use a dense Jacobian matrix that is computed using forward-mode automatic differentiation and inverted using LU factorization [`true`, `false` (default)]" value: false +use_auto_jacobian: + help: "Whether to populate the entries of the sparse Jacobian matrix using forward-mode automatic 
differentiation with sparse matrix coloring (only used when `use_dense_jacobian` is `false`) [`true`, `false` (default)]" + value: true # TODO: Change this to false +debug_approximate_jacobian: + help: "Whether to compare approximations of the Jacobian matrix in the first column of the final state against the exact Jacobian [`true`, `false` (default)]" + value: true # TODO: Change this to false update_jacobian_every: help: "Frequency at which the Jacobian matrix should be updated (once per timestep, once per timestepper stage, or once per linear solve) [`dt`, `stage`, `solve` (default)]" value: "solve" diff --git a/reproducibility_tests/ref_counter.jl b/reproducibility_tests/ref_counter.jl index 6491e2a62e..ac99383505 100644 --- a/reproducibility_tests/ref_counter.jl +++ b/reproducibility_tests/ref_counter.jl @@ -1,4 +1,4 @@ -248 +249 # **README** # diff --git a/src/ClimaAtmos.jl b/src/ClimaAtmos.jl index 7ed4d5a5fc..ae521764de 100644 --- a/src/ClimaAtmos.jl +++ b/src/ClimaAtmos.jl @@ -62,6 +62,7 @@ include( joinpath("prognostic_equations", "implicit", "manual_sparse_jacobian.jl"), ) include(joinpath("prognostic_equations", "implicit", "auto_dense_jacobian.jl")) +include(joinpath("prognostic_equations", "implicit", "auto_sparse_jacobian.jl")) include(joinpath("prognostic_equations", "implicit", "autodiff_utils.jl")) include(joinpath("prognostic_equations", "water_advection.jl")) diff --git a/src/cache/temporary_quantities.jl b/src/cache/temporary_quantities.jl index 505effd70c..5477e177f2 100644 --- a/src/cache/temporary_quantities.jl +++ b/src/cache/temporary_quantities.jl @@ -5,6 +5,20 @@ using ClimaCore.Utilities: half # but cannot be computed on the fly. Unlike the precomputed quantities, these # can be modified at any point, so they should never be assumed to be unchanged # between function calls. +function implicit_temporary_quantities(Y, atmos) + center_space, face_space = axes(Y.c), axes(Y.f) + + FT = Spaces.undertype(center_space) + uvw_vec = UVW(FT(0), FT(0), FT(0)) + return (; + ᶠtemp_scalar = Fields.Field(FT, face_space), # ᶠρaK_h, ᶠρaK_u + ᶜtemp_scalar = Fields.Field(FT, center_space), # ᶜρχₜ_diffusion, ᶜa_scalar + ᶜtemp_scalar_2 = Fields.Field(FT, center_space), # ᶜKᵥʲ, ᶜK_h_scaled + ᶜtemp_C3 = Fields.Field(C3{FT}, center_space), # ᶜu₃ʲ + ᶠtemp_CT3 = Fields.Field(CT3{FT}, face_space), # ᶠuₕ³, ᶠu³_diff + ᶠtemp_UVWxUVW = Fields.Field(typeof(uvw_vec * uvw_vec'), face_space), # ᶠstrain_rate + ) +end function temporary_quantities(Y, atmos) center_space, face_space = axes(Y.c), axes(Y.f) diff --git a/src/prognostic_equations/implicit/auto_dense_jacobian.jl b/src/prognostic_equations/implicit/auto_dense_jacobian.jl index f7e11400d5..9a119f5c2f 100644 --- a/src/prognostic_equations/implicit/auto_dense_jacobian.jl +++ b/src/prognostic_equations/implicit/auto_dense_jacobian.jl @@ -34,12 +34,12 @@ very large memory requirements at higher vertical resolutions. When the number of values in each column is very large, computing the entire dense matrix in a single evaluation of `implicit_tendency!` can be too expensive -to compile and run. So, the dual number components are split into batches with a -maximum size of `max_simultaneous_derivatives`, and we call `implicit_tendency!` -once for each batch. That is, if the batch size is ``s``, then the first batch -evaluates the coefficients of ``ε₁`` through ``εₛ``, the second evaluates the -coefficients of ``εₛ₊₁`` through ``ε₂ₛ``, and so on until ``εₙ``. The default -batch size is 32. +to compile and run. 
So, the dual number components are split into partitions +with a maximum size of `max_simultaneous_derivatives`, and we call +`implicit_tendency!` once for each partition. That is, if the partition size is +``s``, then the first partition evaluates the coefficients of ``ε₁`` through +``εₛ``, the second evaluates the coefficients of ``εₛ₊₁`` through ``ε₂ₛ``, and so +on until ``εₙ``. The default partition size is 32. """ struct AutoDenseJacobian{S} <: JacobianAlgorithm end AutoDenseJacobian(max_simultaneous_derivatives = 32) = @@ -52,12 +52,14 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos) FT = eltype(Y) DA = ClimaComms.array_type(Y) + precomputed = implicit_precomputed_quantities(Y, atmos) + scratch = implicit_temporary_quantities(Y, atmos) + FT_dual = ForwardDiff.Dual{Jacobian, FT, max_simultaneous_derivatives(alg)} - Y_dual = replace_parent_type(Y, FT_dual) + precomputed_dual = replace_parent_eltype(precomputed, FT_dual) + scratch_dual = replace_parent_eltype(scratch, FT_dual) + Y_dual = replace_parent_eltype(Y, FT_dual) Yₜ_dual = similar(Y_dual) - precomputed_dual = - replace_parent_type(implicit_precomputed_quantities(Y, atmos), FT_dual) - scratch_dual = replace_parent_type(temporary_quantities(Y, atmos), FT_dual) N = length(Fields.column(Y, 1, 1, 1)) n_columns = Fields.ncolumns(Y.c) @@ -72,10 +74,10 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos) I_matrix = reshape(I_column_matrix, N, N, 1) return (; - Y_dual, - Yₜ_dual, precomputed_dual, scratch_dual, + Y_dual, + Yₜ_dual, column_matrices, column_lu_factors, column_lu_vectors, @@ -84,89 +86,82 @@ function jacobian_cache(alg::AutoDenseJacobian, Y, atmos) ) end -function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, dtγ, t) - (; Y_dual, Yₜ_dual, precomputed_dual, scratch_dual, column_matrices) = cache +function update_column_matrices!(alg::AutoDenseJacobian, cache, Y, p, t) + (; precomputed_dual, scratch_dual, Y_dual, Yₜ_dual, column_matrices) = cache device = ClimaComms.device(Y.c) column_indices = column_index_iterator(Y) scalar_names = scalar_field_names(Y) - scalar_level_indices = scalar_level_index_pairs(Y) - batch_size = max_simultaneous_derivatives(alg) - batch_size_val = Val(batch_size) - - p_dual_args = ntuple(Val(fieldcount(typeof(p)))) do cache_field_index - cache_field_name = fieldname(typeof(p), cache_field_index) - if cache_field_name == :precomputed - (; p.precomputed..., precomputed_dual...) - elseif cache_field_name == :scratch - scratch_dual - else - getfield(p, cache_field_index) - end - end - p_dual = AtmosCache(p_dual_args...) + field_vector_indices = field_vector_index_iterator(Y) + p_dual = append_to_atmos_cache(p, precomputed_dual, scratch_dual) - batches = Iterators.partition(scalar_level_indices, batch_size) - for batch_scalar_level_indices in ClimaComms.threadable(device, batches) - Y_dual .= Y + jacobian_index_to_Y_index_map_partitions = Iterators.partition( + enumerate(field_vector_indices), + max_simultaneous_derivatives(alg), + ) + for jacobian_index_to_Y_index_map_partition in + ClimaComms.threadable(device, jacobian_index_to_Y_index_map_partitions) - # Add a unique ε to Y for each scalar level index in this batch. With + # Add a unique ε to each value in Y that is part of this partition. 
With # Y_col and Yᴰ_col denoting the columns of Y and Y_dual at column_index, - # set Yᴰ_col to Y_col + I[:, batch_scalar_level_indices] * εs, where I - # is the identity matrix for Y_col (i.e., the value of ∂Y_col/∂Y_col), - # εs is a vector of batch_size dual number components, and - # batch_scalar_level_indices are the batch's indices into Y_col. + # set Yᴰ_col to Y_col + I[:, jacobian_column_indices] * εs, where I is + # the identity matrix for Y_col (i.e., the value of ∂Y_col/∂Y_col), εs + # is a vector of max_simultaneous_derivatives(alg) dual number + # components, and jacobian_column_indices is equal to + # first.(jacobian_index_to_Y_index_map_partition). + Y_dual .= Y ClimaComms.@threaded device begin - # On multithreaded devices, assign one thread to each combination of - # spatial column index and scalar level index in this batch. + # On multithreaded devices, use one thread for each dual number. for column_index in column_indices, - (ε_index, (_, (scalar_index, level_index))) in - enumerate(batch_scalar_level_indices) + (diagonal_entry_ε_index, (_, (scalar_index, level_index))) in + enumerate(jacobian_index_to_Y_index_map_partition) - Y_partials = ntuple(i -> i == ε_index ? 1 : 0, batch_size_val) - Y_dual_increment = ForwardDiff.Dual{Jacobian}(0, Y_partials...) + n_εs_val = Val(max_simultaneous_derivatives(alg)) + ε_coefficients = ntuple(==(diagonal_entry_ε_index), n_εs_val) unrolled_applyat(scalar_index, scalar_names) do name field = MatrixFields.get_field(Y_dual, name) @inbounds point(field, level_index, column_index...)[] += - Y_dual_increment + ForwardDiff.Dual{Jacobian}(0, ε_coefficients) end end end - # Compute this batch's portions of ∂p/∂Y and ∂Yₜ/∂Y. + # Compute this partition of ∂p/∂Y and ∂Yₜ/∂Y. set_implicit_precomputed_quantities!(Y_dual, p_dual, t) implicit_tendency!(Yₜ_dual, Y_dual, p_dual, t) - # Copy this batch's portion of ∂Yₜ/∂Y into column_matrices. With Yₜ_col - # and Yₜᴰ_col denoting the columns of Yₜ and Yₜ_dual at column_index, and + # Copy this partition of ∂Yₜ/∂Y into column_matrices. With Yₜ_col and + # Yₜᴰ_col denoting the columns of Yₜ and Yₜ_dual at column_index, and # with col_matrix denoting the matrix at the corresponding matrix_index # in column_matrices, copy the coefficients of the εs in Yₜᴰ_col into # col_matrix, where the previous steps have set Yₜᴰ_col to - # Yₜ_col + (∂Yₜ_col/∂Y_col)[:, batch_scalar_level_indices] * εs. - # Specifically, set col_matrix[scalar_level_index1, scalar_level_index2] - # to ∂Yₜ_col[scalar_level_index1]/∂Y_col[scalar_level_index2], obtaining - # this derivative from the coefficient of εs[ε_index] in - # Yₜᴰ_col[scalar_level_index1], where ε_index is the index of - # scalar_level_index2 in batch_scalar_level_indices. After all batches - # have been processed, col_matrix is the full Jacobian ∂Yₜ_col/∂Y_col. + # Yₜ_col + (∂Yₜ_col/∂Y_col)[:, jacobian_column_indices] * εs. In + # other words, set col_matrix[jacobian_row_index, jacobian_column_index] + # to ∂Yₜ_col[jacobian_row_index]/∂Y_col[jacobian_column_index], + # obtaining this derivative from the coefficient of + # εs[jacobian_column_ε_index] in Yₜᴰ_col[jacobian_row_index], where + # jacobian_column_ε_index is the index of jacobian_column_index in + # jacobian_column_indices. ClimaComms.@threaded device begin - # On multithreaded devices, assign one thread to each combination of - # spatial column index and scalar level index. + # On multithreaded devices, use one thread for each dual number. 
for (matrix_index, column_index) in enumerate(column_indices), - (scalar_level_index1, (scalar_index1, level_index1)) in - scalar_level_indices + (jacobian_row_index, (scalar_index, level_index)) in + enumerate(field_vector_indices) - Yₜ_dual_value = - unrolled_applyat(scalar_index1, scalar_names) do name + dual_number = + unrolled_applyat(scalar_index, scalar_names) do name field = MatrixFields.get_field(Yₜ_dual, name) - @inbounds point(field, level_index1, column_index...)[] + @inbounds point(field, level_index, column_index...)[] end - Yₜ_partials = ForwardDiff.partials(Yₜ_dual_value) - for (ε_index, (scalar_level_index2, _)) in - enumerate(batch_scalar_level_indices) - cartesian_index = - (scalar_level_index1, scalar_level_index2, matrix_index) + ε_coefficients = ForwardDiff.partials(dual_number) + for (jacobian_column_ε_index, (jacobian_column_index, _)) in + enumerate(jacobian_index_to_Y_index_map_partition) + cartesian_index = ( + jacobian_row_index, + jacobian_column_index, + matrix_index, + ) @inbounds column_matrices[cartesian_index...] = - Yₜ_partials[ε_index] + ε_coefficients[jacobian_column_ε_index] end end end @@ -178,7 +173,7 @@ function update_jacobian!(alg::AutoDenseJacobian, cache, Y, p, dtγ, t) device = ClimaComms.device(Y.c) # Set column_matrices to ∂Yₜ/∂Y. - update_column_matrices!(alg, cache, Y, p, dtγ, t) + update_column_matrices!(alg, cache, Y, p, t) # Set column_lu_factors to ∂R/∂Y = dtγ * ∂Yₜ/∂Y - I, where R is the residual # of the implicit equation. @@ -193,21 +188,22 @@ function invert_jacobian!(::AutoDenseJacobian, cache, ΔY, R) device = ClimaComms.device(ΔY.c) column_indices = column_index_iterator(ΔY) scalar_names = scalar_field_names(ΔY) - scalar_level_indices = scalar_level_index_pairs(ΔY) + vector_index_to_field_vector_index_map = + enumerate(field_vector_index_iterator(ΔY)) # Copy all scalar values from R into column_lu_vectors. ClimaComms.@threaded device begin - # On multithreaded devices, assign one thread to each index into R. + # On multithreaded devices, use one thread for each number. for (vector_index, column_index) in enumerate(column_indices), (scalar_level_index, (scalar_index, level_index)) in - scalar_level_indices + vector_index_to_field_vector_index_map - value = unrolled_applyat(scalar_index, scalar_names) do name + number = unrolled_applyat(scalar_index, scalar_names) do name field = MatrixFields.get_field(R, name) @inbounds point(field, level_index, column_index...)[] end @inbounds column_lu_vectors[scalar_level_index, vector_index] = - value + number end end @@ -216,21 +212,53 @@ function invert_jacobian!(::AutoDenseJacobian, cache, ΔY, R) # Copy all scalar values from column_lu_vectors into ΔY. ClimaComms.@threaded device begin - # On multithreaded devices, assign one thread to each index into ΔY. + # On multithreaded devices, use one thread for each number. 
for (vector_index, column_index) in enumerate(column_indices), (scalar_level_index, (scalar_index, level_index)) in - scalar_level_indices + vector_index_to_field_vector_index_map - @inbounds value = + @inbounds number = column_lu_vectors[scalar_level_index, vector_index] unrolled_applyat(scalar_index, scalar_names) do name field = MatrixFields.get_field(ΔY, name) - @inbounds point(field, level_index, column_index...)[] = value + @inbounds point(field, level_index, column_index...)[] = number end end end end +function first_column_block_arrays(alg::AutoDenseJacobian, Y, p, dtγ, t) + scalar_names = scalar_field_names(Y) + field_vector_indices = field_vector_index_iterator(Y) + column_Y = first_column_view(Y) + column_p = first_column_view(p) + column_cache = jacobian_cache(alg, column_Y, p.atmos) + + update_column_matrices!(alg, column_cache, column_Y, column_p, t) + column_∂R_∂Y = dtγ .* column_cache.column_matrices .- column_cache.I_matrix + + block_arrays = Dict() + for block_key in Iterators.product(scalar_names, scalar_names) + block_jacobian_row_index_to_Yₜ_index_map = + Iterators.filter(enumerate(field_vector_indices)) do index_pair + (_, (scalar_index, _)) = index_pair + scalar_names[scalar_index] == block_key[1] + end + block_jacobian_column_index_to_Y_index_map = + Iterators.filter(enumerate(field_vector_indices)) do index_pair + (_, (scalar_index, _)) = index_pair + scalar_names[scalar_index] == block_key[2] + end + block_view_indices = ( + map(first, block_jacobian_row_index_to_Yₜ_index_map), + map(first, block_jacobian_column_index_to_Y_index_map), + ) + block_arrays[block_key] = + Array(view(column_∂R_∂Y, block_view_indices..., 1)) + end + return block_arrays +end + """ parallel_lu_factorize!(device, matrices, ::Val{N}) diff --git a/src/prognostic_equations/implicit/auto_sparse_jacobian.jl b/src/prognostic_equations/implicit/auto_sparse_jacobian.jl new file mode 100644 index 0000000000..08f24d9c07 --- /dev/null +++ b/src/prognostic_equations/implicit/auto_sparse_jacobian.jl @@ -0,0 +1,435 @@ +import SparseMatrixColorings + +""" + AutoSparseJacobian(sparse_jacobian_alg, [max_padding_bands_per_block]) + +A [`JacobianAlgorithm`](@ref) that computes the Jacobian using forward-mode +automatic differentiation, assuming that the Jacobian's sparsity structure is +given by `sparse_jacobian_alg`. Only entries that should be nonzero according to +the sparsity structure are updated, but any other entries that are nonzero can +introduce errors to the updated entries. + +TODO: Add a short explanation of how this algorithm works. +""" +struct AutoSparseJacobian{A <: SparseJacobian, M} <: SparseJacobian + sparse_jacobian_alg::A + max_padding_bands_per_block::M +end +AutoSparseJacobian(sparse_jacobian_alg) = + AutoSparseJacobian(sparse_jacobian_alg, nothing) + +function jacobian_cache(alg::AutoSparseJacobian, Y, atmos; verbose = true) + (; sparse_jacobian_alg, max_padding_bands_per_block) = alg + + FT = eltype(Y) + DA = ClimaComms.array_type(Y) + device = ClimaComms.device(Y.c) + + column_indices = column_index_iterator(Y) # iterator of (i, j, h) + field_vector_indices = field_vector_index_iterator(Y) # iterator of (f, v) + scalar_names = scalar_field_names(Y) # iterator of names corresponding to f + + precomputed = implicit_precomputed_quantities(Y, atmos) + scratch = implicit_temporary_quantities(Y, atmos) + + # Allocate ∂R/∂Y and its corresponding linear solver. + # TODO: Add FieldNameTree(Y) to the matrix in FieldMatrixWithSolver. 
The + # tree is needed to evaluate scalar_tendency_matrix[autodiff_matrix_keys]. + # (; matrix) = jacobian_cache(sparse_jacobian_alg, Y, atmos) + matrix_without_tree = jacobian_cache(sparse_jacobian_alg, Y, atmos).matrix + tree = MatrixFields.FieldNameTree(Y) + matrix = MatrixFields.FieldMatrixWithSolver( + MatrixFields.replace_name_tree(matrix_without_tree.matrix, tree), + matrix_without_tree.solver, + ) + + # Allocate ∂Yₜ/∂Y and create a view of the scalar components of its blocks. + tendency_matrix = matrix .+ one(matrix) + scalar_tendency_matrix = MatrixFields.scalar_field_matrix(tendency_matrix) + + # Find scalar keys that correspond to the scalar components of the blocks. + # When we approximate a tensor derivative as a scalar multiple of the + # identity tensor, we compute the scalar quantity using the top-left + # component of the tensor. For example, the derivative of Yₜ.c.uₕ with + # respect to Y.c.uₕ is computed as the derivative of + # Yₜ.c.uₕ.components.data.:1 with respect to Y.c.uₕ.components.data.:1. + scalar_block_keys = + map(keys(scalar_tendency_matrix)) do (block_row_name, block_column_name) + scalar_block_row_name = + block_row_name in scalar_names ? block_row_name : + unrolled_argfirst(scalar_names) do scalar_name + MatrixFields.is_child_name(scalar_name, block_row_name) + end + scalar_block_column_name = + block_column_name in scalar_names ? block_column_name : + unrolled_argfirst(scalar_names) do scalar_name + MatrixFields.is_child_name(scalar_name, block_column_name) + end + (scalar_block_row_name, scalar_block_column_name) + end + + # Find keys of non-constant blocks, which are represented by matrix fields. + non_constant_scalar_block_keys = + unrolled_filter(scalar_block_keys) do scalar_block_key + scalar_tendency_matrix[scalar_block_key] isa Fields.Field + end + + # Create a new view of ∂Yₜ/∂Y that has scalar keys and non-constant blocks. + autodiff_matrix_keys = + MatrixFields.FieldMatrixKeys(non_constant_scalar_block_keys) + autodiff_matrix = scalar_tendency_matrix[autodiff_matrix_keys] + + # Construct a mask for nonzero entries in autodiff_matrix as a dense array, + # and a similar mask with additional padding bands in each block. + # TODO: Improve performance by only adding bands where they are necessary, + # or by rescaling blocks instead of adding new bands. + N = length(Fields.column(Y, 1, 1, 1)) + sparsity_mask = Array{Bool}(undef, N, N) + sparsity_mask .= false + padded_sparsity_mask = copy(sparsity_mask) + for block_key in Iterators.product(scalar_names, scalar_names) + (block_row_name, block_column_name) = block_key + + # Get a view of this block's sparsity masks with its row/column indices. + block_jacobian_row_index_to_Yₜ_index_map = + Iterators.filter(enumerate(field_vector_indices)) do index_pair + (_, (scalar_index, _)) = index_pair + scalar_names[scalar_index] == block_row_name + end + block_jacobian_column_index_to_Y_index_map = + Iterators.filter(enumerate(field_vector_indices)) do index_pair + (_, (scalar_index, _)) = index_pair + scalar_names[scalar_index] == block_column_name + end + block_view_indices = ( + map(first, block_jacobian_row_index_to_Yₜ_index_map), + map(first, block_jacobian_column_index_to_Y_index_map), + ) + block_sparsity_mask = view(sparsity_mask, block_view_indices...) + padded_block_sparsity_mask = + view(padded_sparsity_mask, block_view_indices...) 
+ + # Compute the lower and upper band indices of this block, with empty + # blocks corresponding to index ranges whose length is -1 (centered + # around 0 for square blocks and around ±1/2 for non-square blocks). + (n_rows_in_block, n_columns_in_block) = size(block_sparsity_mask) + if block_key in keys(autodiff_matrix) + (_, _, lower_band, upper_band) = + MatrixFields.band_matrix_info(autodiff_matrix[block_key]) + else + (lower_band, upper_band) = + n_rows_in_block == n_columns_in_block ? (1 / 2, -1 / 2) : + (n_rows_in_block < n_columns_in_block ? (1, 0) : (0, -1)) + end + + # Symmetrically expand the range of band indices, with the number of + # new bands either limited by max_padding_bands_per_block, or hardcoded + # for each block when max_padding_bands_per_block is not specified. + max_padding_bands = if !isnothing(max_padding_bands_per_block) + max_padding_bands_per_block + elseif ( + !(block_key in keys(autodiff_matrix)) && + (block_row_name, block_row_name) in keys(autodiff_matrix) + ) + if block_key in ( + (@name(c.uₕ.components.data.:(1)), @name(c.ρ)), + (@name(c.uₕ.components.data.:(2)), @name(c.ρ)), + ) + # ∂ᶜuₕₜ/∂ᶜρ and ∂ᶜuₕₜ/∂ᶜuₕ can have similar unnormalized entries + 3 + elseif block_key in ( + (@name(c.ρe_tot), @name(c.ρ)), + (@name(c.ρe_tot), @name(c.ρq_liq)), + (@name(c.ρe_tot), @name(c.ρq_ice)), + (@name(c.ρe_tot), @name(c.ρq_rai)), + (@name(c.ρe_tot), @name(c.ρq_sno)), + (@name(c.ρe_tot), @name(c.sgsʲs.:(1).q_tot)), + ) + # ∂ᶜρe_totₜ/∂ᶜχ and ∂ᶜρe_totₜ/∂ᶜρe_tot can have similar + # unnormalized entries for several different variables ᶜχ + 3 + elseif ( + block_row_name == @name(f.sgsʲs.:(1).u₃.components.data.:(1)) && + block_column_name in ( + @name(c.uₕ.components.data.:(1)), + @name(c.uₕ.components.data.:(2)), + ) + ) + # ∂ᶠu₃ʲₜ/∂ᶜuₕ and ∂ᶠu₃ʲₜ/∂ᶠu₃ʲ can have similar unnormalized + # entries (and ∂ᶠu₃ʲₜ/∂ᶜmseʲ can also have similar entries) + 2 + else + 0 + end + else + 0 + end + padded_lower_band = ceil(Int, lower_band - max_padding_bands / 2) + padded_upper_band = floor(Int, upper_band + max_padding_bands / 2) + + if verbose + n_padding_bands = + length(padded_lower_band:padded_upper_band) - + length(lower_band:upper_band) + n_padding_bands > 0 && + @info "Adding $n_padding_bands padding bands for $block_key" + end + + # Update the sparsity mask entries corresponding to bands in this block. + for band in padded_lower_band:padded_upper_band + is_not_padding_band = band in lower_band:upper_band + level_index_min = band < 0 ? 1 - band : 1 + level_index_max = + band < n_columns_in_block - n_rows_in_block ? n_rows_in_block : + n_columns_in_block - band + for level_index in level_index_min:level_index_max + block_mask_index = (level_index, level_index + band) + block_sparsity_mask[block_mask_index...] = is_not_padding_band + padded_block_sparsity_mask[block_mask_index...] = true + end + end + end + + # Find a coloring that minimizes the number of required colors. 
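+    # Two Jacobian columns may share a color (and therefore a single ε
+    # component) only if they have no nonzero entries in a common row, so each
+    # entry of the compressed Jacobian can be attributed to exactly one column.
+    # Every column ordering provided by SparseMatrixColorings is tried below,
+    # and the greedy coloring that uses the fewest colors is kept.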
+ jacobian_column_colorings = map(SparseMatrixColorings.all_orders()) do order + sparse_matrix = SparseMatrixColorings.sparse(padded_sparsity_mask) + problem = SparseMatrixColorings.ColoringProblem() + greedy_algorithm = + SparseMatrixColorings.GreedyColoringAlgorithm(order) + SparseMatrixColorings.coloring(sparse_matrix, problem, greedy_algorithm) + end + best_jacobian_column_coloring = + argmin(SparseMatrixColorings.ncolors, jacobian_column_colorings) + n_colors = SparseMatrixColorings.ncolors(best_jacobian_column_coloring) + + # When running on GPU devices, divide n_colors into partitions that are each + # guaranteed to fit in 90% of the memory that is currently free. + n_partitions = if device isa ClimaComms.AbstractCPUDevice + 1 + else + free_memory = ClimaComms.free_memory(device) + max_memory = free_memory * 9 ÷ 10 + memory_for_I_matrix = n_colors * parent_memory(Y) + memory_per_ε = + (parent_memory(precomputed) + parent_memory(scratch)) + + 2 * parent_memory(Y) + # Find the smallest possible integer n_partitions and some other integer + # n_εs such that n_partitions * n_εs >= n_colors and + # (n_εs + 1) * memory_per_ε + memory_for_I_matrix <= max_memory, where + # (n_εs + 1) * memory_per_ε is the memory required to store + # precomputed_dual, scratch_dual, Y_dual, and Yₜ_dual, and where + # memory_for_I_matrix is an approximation of the memory required to + # store I_matrix_partitions. The actual memory_for_I_matrix is given by + # (n_colors + n_partitions) * parent_memory(Y), but we can ignore the + # value of n_partitions if it is negligible compared to n_colors. When + # max_memory is too small to fit any εs, try using one ε per partition. + # TODO: Replace the fields in I_matrix_partitions with column fields, + # making memory_for_I_matrix negligible compared to max_memory. + n_εs_max = (max_memory - memory_for_I_matrix) ÷ memory_per_ε - 1 + cld(n_colors, max(n_εs_max, 1)) + end + n_εs = cld(n_colors, n_partitions) + + if verbose + if n_partitions == 1 + @info "Updating Jacobian using $n_εs ε components per dual number" + else + @info "Updating Jacobian using $n_partitions partitions of \ + $n_colors colors, with $n_εs ε components per dual number" + end + end + + # FieldVectors and cached fields with dual numbers instead of real numbers, + # with dual numbers using the tag "Jacobian" for specialized dispatch + # TODO: Refactor FieldVector broadcasting so that performance does not + # deteriorate if we only store one column of each partition_εs. 
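+    # Illustration with hypothetical values: if n_colors = 38 and there is
+    # room for n_εs_max = 16 ε components, the GPU branch above gives
+    # n_partitions = cld(38, 16) = 3 and n_εs = cld(38, 3) = 13, so the dual
+    # numbers allocated below carry 13 ε components each, and three implicit
+    # tendency evaluations cover all 38 colors.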
+ FT_dual = ForwardDiff.Dual{Jacobian, FT, n_εs} + precomputed_dual = replace_parent_eltype(precomputed, FT_dual) + scratch_dual = replace_parent_eltype(scratch, FT_dual) + Y_dual = replace_parent_eltype(Y, FT_dual) + Yₜ_dual = similar(Y_dual) + I_matrix_partitions = ntuple(_ -> similar(Y_dual), n_partitions) + + # iterator of colors for each of the values in Y that require autodiff, and + # 0 for values that do not require it (by not initializing dual number + # components when they are unneeded, we avoid some of the errors introduced + # by our sparsity approximation) + jacobian_column_colors = + SparseMatrixColorings.column_colors(best_jacobian_column_coloring) + for Y_index_and_sparsity_mask in enumerate(eachcol(sparsity_mask)) + (Y_index, jacobian_column_sparsity_mask) = Y_index_and_sparsity_mask + if !any(jacobian_column_sparsity_mask) + jacobian_column_colors[Y_index] = 0 + end + end + + # iterator of pairs ((f, v), c), where the color c identifies a component + # of the dual number in row (f, v) of Y_dual that corresponds to the + # diagonal entry in the same row of the matrix ∂Y/∂Y (or 0 if the + # corresponding value in Y does not require autodiff) + Y_index_to_diagonal_color_map = + zip(field_vector_indices, jacobian_column_colors) + + # Set the dual numbers in each FieldVector partition_εs so that the ε + # components correspond to partitions of the N × N identity matrix ∂Y/∂Y. + # Specifically, every column of partition_εs is a vector of N dual numbers, + # each of which is stored as a combination of a value and n_εs partial + # derivatives. The ε components can be interpreted as representing N × n_εs + # slices of a sparse N × n_colors representation of ∂Y/∂Y. Convert n_εs to + # a Val and Y_index_to_diagonal_color_map to a DA for GPU compatibility, and + # drop spatial information from every Field to ensure that this kernel stays + # below the GPU parameter memory limit. + n_εs_val = Val(n_εs) + I_matrix_partitions_data = unrolled_map(I_matrix_partitions) do partition_εs + unrolled_map(Fields.field_values, Fields._values(partition_εs)) + end + ClimaComms.@threaded device begin + # On multithreaded devices, use one thread for each dual number. + for (partition_index, partition_εs_data) in + enumerate(I_matrix_partitions_data), + column_index in column_indices, + index_pair in DA(collect(Y_index_to_diagonal_color_map)) + + ((scalar_index, level_index), diagonal_entry_color) = index_pair + ε_offset = (partition_index - 1) * n_εs + diagonal_entry_ε_index = + ε_offset < diagonal_entry_color <= ε_offset + n_εs ? 
+ diagonal_entry_color - ε_offset : 0 + ε_coefficients = ntuple(==(diagonal_entry_ε_index), n_εs_val) + unrolled_applyat(scalar_index, scalar_names) do name + field = MatrixFields.get_field(partition_εs_data, name) + @inbounds point(field, level_index, column_index...)[] = + ForwardDiff.Dual{Jacobian}(0, ε_coefficients) + end + end + end + + # number of colors needed to represent a band matrix row in any block + colors_per_band_matrix_row = + maximum(values(autodiff_matrix)) do matrix_field + (_, _, lower_band, upper_band) = + MatrixFields.band_matrix_info(matrix_field) + upper_band - lower_band + 1 + end + + # iterator of pairs ((b, v), (f, cs)), where (b, v) is the index of a band + # matrix row in autodiff_matrix, (f, v) is the index of a dual number in + # Yₜ_dual, and cs is a tuple that contains the colors of the band matrix row + # entries, which is padded to have a constant size for GPU compatibility + # TODO: Use an iterator of pairs ((f, v), ((b1, cs1), (b2, cs2), ...)), so + # that each pair corresponds to a dual number instead of a band matrix row. + band_matrix_row_index_to_colors_map = Iterators.flatmap( + enumerate(pairs(autodiff_matrix)), + ) do (block_index, (block_key, matrix_field)) + (block_row_name, block_column_name) = block_key + (n_rows_in_block, n_columns_in_block, lower_band, upper_band) = + MatrixFields.band_matrix_info(matrix_field) + + block_Yₜ_indices = + Iterators.filter(field_vector_indices) do (scalar_index, _) + scalar_names[scalar_index] == block_row_name + end + block_Y_index_to_color_map = + Iterators.filter(Y_index_to_diagonal_color_map) do index_pair + ((scalar_index, _), _) = index_pair + scalar_names[scalar_index] == block_column_name + end + block_colors = last.(block_Y_index_to_color_map) + + map(block_Yₜ_indices) do (scalar_index, level_index) + entry_colors = ntuple(colors_per_band_matrix_row) do band_index + band = lower_band + band_index - 1 + level_index_min = band < 0 ? 1 - band : 1 + level_index_max = + band < n_columns_in_block - n_rows_in_block ? + n_rows_in_block : n_columns_in_block - band + is_color_at_index = + band <= upper_band && + level_index_min <= level_index <= level_index_max + is_color_at_index ? block_colors[level_index + band] : 0 + end + ((block_index, level_index), (scalar_index, entry_colors)) + end + end + + # Convert the lazy iterator to a DA for GPU compatibility. + band_matrix_row_index_to_colors_map = + DA(collect(band_matrix_row_index_to_colors_map)) + + return (; + matrix, + tendency_matrix, + autodiff_matrix, + precomputed_dual, + scratch_dual, + Y_dual, + Yₜ_dual, + I_matrix_partitions, + band_matrix_row_index_to_colors_map, + ) +end + +function update_jacobian!(::AutoSparseJacobian, cache, Y, p, dtγ, t) + (; matrix, tendency_matrix, autodiff_matrix) = cache + (; precomputed_dual, scratch_dual, Y_dual, Yₜ_dual) = cache + (; I_matrix_partitions, band_matrix_row_index_to_colors_map) = cache + + device = ClimaComms.device(Y.c) + column_indices = column_index_iterator(Y) + scalar_names = scalar_field_names(Y) + p_dual = append_to_atmos_cache(p, precomputed_dual, scratch_dual) + + for (partition_index, partition_εs) in enumerate(I_matrix_partitions) + # Set the εs in Y_dual to represent a partition of the identity matrix. + Y_dual .= Y .+ partition_εs + + # Compute ∂p/∂Y * partition_εs and ∂Yₜ/∂Y * partition_εs. + set_implicit_precomputed_quantities!(Y_dual, p_dual, t) + implicit_tendency!(Yₜ_dual, Y_dual, p_dual, t) + + # Move the entries of ∂Yₜ/∂Y * partition_εs from Yₜ_dual into the + # blocks of autodiff_matrix. 
Drop spatial information from every Field + # to ensure that this kernel stays below the GPU parameter memory limit. + Yₜ_dual_data = + unrolled_map(Fields.field_values, Fields._values(Yₜ_dual)) + matrix_fields_data = + unrolled_map(Fields.field_values, values(autodiff_matrix)) + ClimaComms.@threaded device begin + # On multithreaded devices, use one thread for each band matrix row. + # TODO: Modify the map and use one thread for each dual number. + for column_index in column_indices, + index_pair in band_matrix_row_index_to_colors_map + + ((block_index, level_index), (scalar_index, entry_colors)) = + index_pair + dual_number = + unrolled_applyat(scalar_index, scalar_names) do name + data = MatrixFields.get_field(Yₜ_dual_data, name) + @inbounds point(data, level_index, column_index...)[] + end + ε_coefficients = ForwardDiff.partials(dual_number) + n_εs = length(ε_coefficients) + ε_offset = (partition_index - 1) * n_εs + unrolled_applyat(block_index, matrix_fields_data) do block_data + @inbounds entries_data = + point(block_data, level_index, column_index...).entries + entries_data[] = + map(entry_colors, entries_data[]) do entry_color, entry + # If the entry has a color in the current partition, + # set the entry to the ε coefficient for that color. + # Otherwise, keep the value from the block's data. + ε_offset < entry_color <= ε_offset + n_εs ? + (@inbounds ε_coefficients[entry_color - ε_offset]) : + entry + end # TODO: Why does unrolled_map break GPU compilation? + end + end + end + end + + # Update the matrix for ∂R/∂Y using the new values of ∂Yₜ/∂Y. + matrix .= dtγ .* tendency_matrix .- one(matrix) +end + +invert_jacobian!(alg::AutoSparseJacobian, cache, ΔY, R) = + invert_jacobian!(alg.sparse_jacobian_alg, cache, ΔY, R) diff --git a/src/prognostic_equations/implicit/autodiff_utils.jl b/src/prognostic_equations/implicit/autodiff_utils.jl index 4c0c5da81d..17300215ea 100644 --- a/src/prognostic_equations/implicit/autodiff_utils.jl +++ b/src/prognostic_equations/implicit/autodiff_utils.jl @@ -1,17 +1,67 @@ -# Replaces all numbers in Fields and DataLayouts with values of type T. -replace_parent_type(x::Fields.FieldVector, ::Type{T}) where {T} = similar(x, T) -replace_parent_type(x::Fields.Field, ::Type{T}) where {T} = - Fields.Field(replace_parent_type(Fields.field_values(x), T), axes(x)) -replace_parent_type(x::DataLayouts.AbstractData, ::Type{T}) where {T} = +# Tuple that contains the components of a generic object. +components(x) = ntuple(i -> getfield(x, i), Val(fieldcount(typeof(x)))) + +# Analogue of map that applies a function to each component of a generic object. +# Aside from some special cases, this assumes that the object's type has a +# default constructor (one that does not need any type parameters). +map_components(f::F, x::Union{Tuple, NamedTuple}) where {F} = unrolled_map(f, x) +map_components(f::F, x) where {F} = + typeof(x).name.wrapper(unrolled_map(f, components(x))...) + +# Determines whether there is at least one DataLayout within an object. +contains_data_layout(::DataLayouts.AbstractData) = true +contains_data_layout(x) = unrolled_any(contains_data_layout, components(x)) + +# Adds up the memory required to allocate all DataLayouts within an object. +parent_memory(x::DataLayouts.AbstractData) = sizeof(parent(x)) +parent_memory(x) = + contains_data_layout(x) ? unrolled_sum(parent_memory, components(x)) : 0 + +# Extracts a view of the first column from every DataLayout within an object, +# except for DataLayouts in DSSBuffers, which are not needed in a single column. 
+first_column_view(x::DataLayouts.AbstractData) = Fields.column(x, 1, 1, 1) +first_column_view(x::Fields.Field) = Fields.column(x, 1, 1, 1) +first_column_view(x::Fields.FieldVector) = Fields.column(x, 1, 1, 1) +first_column_view(x::Topologies.DSSBuffer) = nothing +first_column_view(x) = + contains_data_layout(x) ? map_components(first_column_view, x) : x + +# Makes T the parent array element type of every DataLayout within an object. +replace_parent_eltype(x::DataLayouts.AbstractData, ::Type{T}) where {T} = DataLayouts.replace_basetype(x, T) -replace_parent_type(x::Union{Tuple, NamedTuple}, ::Type{T}) where {T} = - unrolled_map(Base.Fix2(replace_parent_type, T), x) +replace_parent_eltype(x::Fields.Field, ::Type{T}) where {T} = + Fields.Field(replace_parent_eltype(Fields.field_values(x), T), axes(x)) +replace_parent_eltype(x::Fields.FieldVector, ::Type{T}) where {T} = + similar(x, T) +replace_parent_eltype(x, ::Type{T}) where {T} = + contains_data_layout(x) ? + map_components(Base.Fix2(replace_parent_eltype, T), x) : x + +# Appends values to the precomputed and scratch components of an AtmosCache. +append_to_atmos_cache(atmos_cache, precomputed, scratch) = AtmosCache( + unrolled_map(fieldnames(typeof(atmos_cache))) do cache_component_name + if cache_component_name == :precomputed + (; atmos_cache.precomputed..., precomputed...) + elseif cache_component_name == :scratch + (; atmos_cache.scratch..., scratch...) + else + getfield(atmos_cache, cache_component_name) + end + end..., +) + +# The horizontal SpectralElementSpace of the fields in a FieldVector. +function horizontal_space(field_vector) + all_values = Fields._values(field_vector) + space = axes(unrolled_argfirst(value -> value isa Fields.Field, all_values)) + return space isa Spaces.FiniteDifferenceSpace ? nothing : + Spaces.horizontal_space(space) +end # An iterator over the column indices of all fields in a FieldVector. function column_index_iterator(field_vector) - all_values = Fields._values(field_vector) - first_field = unrolled_argfirst(value -> value isa Fields.Field, all_values) - horz_space = Spaces.horizontal_space(axes(first_field)) + horz_space = horizontal_space(field_vector) + isnothing(horz_space) && return ((1, 1, 1),) qs = 1:Quadratures.degrees_of_freedom(Spaces.quadrature_style(horz_space)) hs = Spaces.eachslabindex(horz_space) return horz_space isa Spaces.SpectralElementSpace1D ? @@ -24,31 +74,33 @@ scalar_field_names(field_vector) = x isa Fields.Field && eltype(x) == eltype(field_vector) end -# An iterator with pairs of the form -# (scalar_level_index, (scalar_index, level_index)), where scalar_index is an -# index into scalar_field_names(field_vector), level_index is a vertical index -# into the scalar field with this name, and scalar_level_index is a unique -# linear index for each combination of scalar field name and vertical index. +# An iterator with tuples of the form (scalar_index, level_index), where +# scalar_index is an index into scalar_field_names(field_vector) and level_index +# is a vertical index into the scalar field with this name.
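+# For example, if the first scalar field in field_vector has 10 vertical
+# levels, the iterator begins with (1, 1), (1, 2), …, (1, 10), (2, 1), …;
+# fields without a vertical dimension contribute a single pair with
+# level_index = 1.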
+function field_vector_index_iterator(field_vector) scalar_names = scalar_field_names(field_vector) scalar_index_and_level_pairs = unrolled_map(enumerate(scalar_names)) do (scalar_index, name) field = MatrixFields.get_field(field_vector, name) - is_one_level = field isa Fields.SpectralElementField + is_one_level = + field isa Union{Fields.PointField, Fields.SpectralElementField} Iterators.map( level_index -> (scalar_index, level_index), Base.OneTo(is_one_level ? 1 : Spaces.nlevels(axes(field))), ) end - return enumerate(Iterators.flatten(scalar_index_and_level_pairs)) + return Iterators.flatten(scalar_index_and_level_pairs) end -# A view of the data for one point in a field, which can be used like a Ref. -Base.@propagate_inbounds function point(field, level_index, column_index...) - column_field = Fields.column(field, column_index...) - column_field isa Fields.PointField && return column_field - level_index_offset = Operators.left_idx(axes(column_field)) - 1 - return Fields.level(column_field, level_index + level_index_offset) +# A view of one point in a Field or DataLayout, which can be used like a Ref. +Base.@propagate_inbounds function point(value, level_index, column_index...) + column_value = Fields.column(value, column_index...) + column_value isa Union{Fields.PointField, DataLayouts.DataF} && + return column_value + level_index_offset = + column_value isa Fields.Field ? + Operators.left_idx(axes(column_value)) - 1 : 0 + return Fields.level(column_value, level_index + level_index_offset) end # TODO: This needs to be moved into ClimaCore to avoid breaking precompilation. @@ -90,3 +142,33 @@ end deriv = iszero(x) ? zero(x) : inv(2 * val) return ForwardDiff.dual_definition_retval(tag, val, deriv, partials) end + +# Ignore all derivative information when comparing Duals to other Numbers. This +# ensures that conditional statements are always equivalent for Duals and Reals. +for func in (:iszero,) + @eval @inline Base.$func(arg::ForwardDiff.Dual{Jacobian}) = + $func(ForwardDiff.value(arg)) +end +for func in (:isequal, :isless, :<, :>, :(==), :!=, :<=, :>=) + # These methods handle combinations of Duals without ambiguities. + @eval @inline Base.$func( + arg1::ForwardDiff.Dual{Jacobian}, + arg2::ForwardDiff.Dual{Jacobian}, + ) = $func(ForwardDiff.value(arg1), ForwardDiff.value(arg2)) + @eval @inline Base.$func( + arg1::ForwardDiff.Dual{Tx}, + arg2::ForwardDiff.Dual{Jacobian}, + ) where {Tx} = $func(ForwardDiff.value(arg1), ForwardDiff.value(arg2)) + @eval @inline Base.$func( + arg1::ForwardDiff.Dual{Jacobian}, + arg2::ForwardDiff.Dual{Ty}, + ) where {Ty} = $func(ForwardDiff.value(arg1), ForwardDiff.value(arg2)) + + # These methods handle combinations of Reals and Duals without ambiguities. + for R in (ForwardDiff.AMBIGUOUS_TYPES..., AbstractIrrational) + @eval @inline Base.$func(arg1::ForwardDiff.Dual{Jacobian}, arg2::$R) = + $func(ForwardDiff.value(arg1), arg2) + @eval @inline Base.$func(arg1::$R, arg2::ForwardDiff.Dual{Jacobian}) = + $func(arg1, ForwardDiff.value(arg2)) + end +end diff --git a/src/prognostic_equations/implicit/jacobian.jl b/src/prognostic_equations/implicit/jacobian.jl index fa4680dcb7..41fe2c956e 100644 --- a/src/prognostic_equations/implicit/jacobian.jl +++ b/src/prognostic_equations/implicit/jacobian.jl @@ -4,14 +4,21 @@ A description of how to compute the matrix ``∂R/∂Y``, where ``R(Y)`` denotes the residual of an implicit step with the state ``Y``. 
Concrete implementations of this abstract type should define 3 methods: - - `jacobian_cache(alg::JacobianAlgorithm, Y, atmos)` + - `jacobian_cache(alg::JacobianAlgorithm, Y, atmos; [verbose])` - `update_jacobian!(alg::JacobianAlgorithm, cache, Y, p, dtγ, t)` - `invert_jacobian!(alg::JacobianAlgorithm, cache, ΔY, R)` +To facilitate debugging, concrete implementations should also define + - `first_column_block_arrays(alg::JacobianAlgorithm, Y, p, dtγ, t)` See [Implicit Solver](@ref) for additional background information. """ abstract type JacobianAlgorithm end +abstract type SparseJacobian <: JacobianAlgorithm end + +# Ignore the verbose flag in jacobian_cache when it is not needed. +jacobian_cache(alg, Y, atmos; verbose) = jacobian_cache(alg, Y, atmos) + """ Jacobian(alg, Y, atmos) @@ -59,3 +66,45 @@ function LinearAlgebra.ldiv!( LinearAlgebra.ldiv!(ΔY_krylov, jacobian, R_krylov) ΔY .= ΔY_krylov end + +# This defines a standardized format for comparing different types of Jacobians. +function first_column_block_arrays(alg::SparseJacobian, Y, p, dtγ, t) + scalar_names = scalar_field_names(Y) + column_Y = first_column_view(Y) + column_p = first_column_view(p) + column_cache = jacobian_cache(alg, column_Y, p.atmos; verbose = false) + + update_jacobian!(alg, column_cache, column_Y, column_p, dtγ, t) + column_∂R_∂Y = column_cache.matrix + + block_arrays = Dict() + for block_key in Iterators.product(scalar_names, scalar_names) + block_key in keys(column_∂R_∂Y) || continue + block_value = Base.materialize(column_∂R_∂Y[block_key]) + block_arrays[block_key] = + block_value isa Fields.Field ? + MatrixFields.column_field2array(block_value) : block_value + end + return block_arrays +end +function first_column_rescaling_arrays(Y, p, t) + scalar_names = scalar_field_names(Y) + column_Y = first_column_view(Y) + column_p = first_column_view(p) + column_Yₜ = similar(column_Y) + + implicit_tendency!(column_Yₜ, column_Y, column_p, t) + + rescaling_arrays = Dict() + for block_key in Iterators.product(scalar_names, scalar_names) + (block_row_name, block_column_name) = block_key + block_row_Yₜ_values = + parent(MatrixFields.get_field(column_Yₜ, block_row_name)) + block_column_Yₜ_values = + parent(MatrixFields.get_field(column_Yₜ, block_column_name)) + safe_inverse = x -> iszero(x) || issubnormal(x) ? zero(x) : inv(x) + rescaling_arrays[block_key] = + Array(safe_inverse.(block_row_Yₜ_values) .* block_column_Yₜ_values') + end + return rescaling_arrays +end diff --git a/src/prognostic_equations/implicit/manual_sparse_jacobian.jl b/src/prognostic_equations/implicit/manual_sparse_jacobian.jl index 872a7185ea..01fed10db4 100644 --- a/src/prognostic_equations/implicit/manual_sparse_jacobian.jl +++ b/src/prognostic_equations/implicit/manual_sparse_jacobian.jl @@ -48,7 +48,7 @@ solver. 
Certain groups of derivatives can be toggled on or off by setting their - `approximate_solve_iters::Int`: number of iterations to take for the approximate linear solve required when the `diffusion_flag` is `UseDerivative` """ -struct ManualSparseJacobian{F1, F2, F3, F4, F5, F6} <: JacobianAlgorithm +struct ManualSparseJacobian{F1, F2, F3, F4, F5, F6} <: SparseJacobian topography_flag::F1 diffusion_flag::F2 sgs_advection_flag::F3 diff --git a/src/prognostic_equations/vertical_diffusion_boundary_layer.jl b/src/prognostic_equations/vertical_diffusion_boundary_layer.jl index bca3cd65c2..4bb7f743e9 100644 --- a/src/prognostic_equations/vertical_diffusion_boundary_layer.jl +++ b/src/prognostic_equations/vertical_diffusion_boundary_layer.jl @@ -102,16 +102,16 @@ function vertical_diffusion_boundary_layer_tendency!( ᶜρχₜ_diffusion = p.scratch.ᶜtemp_scalar ᶜK_h_scaled = p.scratch.ᶜtemp_scalar_2 - ᶜdivᵥ_ρχ = Operators.DivergenceF2C( - top = Operators.SetValue(C3(0)), - bottom = Operators.SetValue(C3(0)), - ) foreach_gs_tracer(Yₜ, Y) do ᶜρχₜ, ᶜρχ, ρχ_name if ρχ_name in (@name(ρq_rai), @name(ρq_sno), @name(ρn_rai)) @. ᶜK_h_scaled = α_vert_diff_tracer * ᶜK_h else @. ᶜK_h_scaled = ᶜK_h end + ᶜdivᵥ_ρχ = Operators.DivergenceF2C( + top = Operators.SetValue(C3(0)), + bottom = Operators.SetValue(C3(0)), + ) @. ᶜρχₜ_diffusion = ᶜdivᵥ_ρχ( -( ᶠinterp(Y.c.ρ) * diff --git a/src/solver/type_getters.jl b/src/solver/type_getters.jl index c4347fdbfe..1cd60d5211 100644 --- a/src/solver/type_getters.jl +++ b/src/solver/type_getters.jl @@ -451,24 +451,28 @@ function get_surface_setup(parsed_args) return getproperty(SurfaceConditions, Symbol(parsed_args["surface_setup"]))() end -get_jacobian(ode_algo, Y, atmos, parsed_args) = - if ode_algo isa Union{CTS.IMEXAlgorithm, CTS.RosenbrockAlgorithm} - jacobian_algorithm = - parsed_args["use_dense_jacobian"] ? AutoDenseJacobian() : - ManualSparseJacobian( - DerivativeFlag(has_topography(axes(Y.c))), - DerivativeFlag(atmos.diff_mode), - DerivativeFlag(atmos.sgs_adv_mode), - DerivativeFlag(atmos.sgs_entr_detr_mode), - DerivativeFlag(atmos.sgs_mf_mode), - DerivativeFlag(atmos.sgs_nh_pressure_mode), - parsed_args["approximate_linear_solve_iters"], - ) - @info "Jacobian algorithm: $(summary_string(jacobian_algorithm))" - Jacobian(jacobian_algorithm, Y, atmos) +function get_jacobian(ode_algo, Y, atmos, parsed_args) + ode_algo isa Union{CTS.IMEXAlgorithm, CTS.RosenbrockAlgorithm} || + return nothing + jacobian_algorithm = if parsed_args["use_dense_jacobian"] + AutoDenseJacobian() else - nothing + manual_jacobian_algorithm = ManualSparseJacobian( + DerivativeFlag(has_topography(axes(Y.c))), + DerivativeFlag(atmos.diff_mode), + DerivativeFlag(atmos.sgs_adv_mode), + DerivativeFlag(atmos.sgs_entr_detr_mode), + DerivativeFlag(atmos.sgs_mf_mode), + DerivativeFlag(atmos.sgs_nh_pressure_mode), + parsed_args["approximate_linear_solve_iters"], + ) + parsed_args["use_auto_jacobian"] ? 
+ AutoSparseJacobian(manual_jacobian_algorithm) : + manual_jacobian_algorithm end + @info "Jacobian algorithm: $(summary_string(jacobian_algorithm))" + return Jacobian(jacobian_algorithm, Y, atmos) +end #= ode_configuration(Y, parsed_args) diff --git a/test/dependencies.jl b/test/dependencies.jl index 45330c1ca0..3eb361b841 100644 --- a/test/dependencies.jl +++ b/test/dependencies.jl @@ -60,6 +60,8 @@ known_dependencies = Set([ # Random is used to reset seed for random number generator used for cloudy RRTMGP runs to enable bit-wise reproducibility for tests "Random", "SciMLBase", + # SparseMatrixColorings is used to generate the column coloring of the AutoSparseJacobian + "SparseMatrixColorings", "StaticArrays", # Statistics is used to call 'mean' on ClimaCore Fields "Statistics",