
Commit 8e7f132

Add adjoint tests

1 parent c93bf03 commit 8e7f132

9 files changed: +77 additions, -50 deletions

CITATION.bib

Lines changed: 5 additions & 7 deletions

```diff
@@ -1,8 +1,6 @@
-@misc{pal2022mixing,
-    title={Mixing Implicit and Explicit Deep Learning with Skip DEQs and Infinite Time Neural ODEs (Continuous DEQs)},
-    author={Avik Pal and Alan Edelman and Christopher Rackauckas},
-    year={2022},
-    eprint={2201.12240},
-    archivePrefix={arXiv},
-    primaryClass={cs.LG}
+@article{pal2022continuous,
+    title={Continuous Deep Equilibrium Models: Training Neural ODEs Faster by Integrating Them to Infinity},
+    author={Pal, Avik and Edelman, Alan and Rackauckas, Christopher},
+    booktitle={2023 IEEE High Performance Extreme Computing Conference (HPEC)},
+    year={2023}
 }
```

README.md

Lines changed: 5 additions & 9 deletions

````diff
@@ -24,27 +24,23 @@ Pkg.add("DeepEquilibriumNetworks")
 ## Quickstart
 
 ```julia
-using DeepEquilibriumNetworks, Lux, Random, Zygote
+using DeepEquilibriumNetworks, Lux, Random, NonlinearSolve, Zygote
 # using LuxCUDA, LuxAMDGPU ## Install and Load for GPU Support
 
 seed = 0
 rng = Random.default_rng()
 Random.seed!(rng, seed)
 
 model = Chain(Dense(2 => 2),
-    DeepEquilibriumNetwork(Parallel(+,
-            Dense(2 => 2; use_bias=false),
-            Dense(2 => 2; use_bias=false)),
-        ContinuousDEQSolver(; abstol=0.1f0, reltol=0.1f0, abstol_termination=0.1f0,
-            reltol_termination=0.1f0);
-        save_everystep=true))
+    DeepEquilibriumNetwork(Parallel(+, Dense(2 => 2; use_bias=false),
+            Dense(2 => 2; use_bias=false)), NewtonRaphson()))
 
 gdev = gpu_device()
 cdev = cpu_device()
 
 ps, st = Lux.setup(rng, model) |> gdev
-x = rand(rng, Float32, 2, 1) |> gdev
-y = rand(rng, Float32, 2, 1) |> gdev
+x = rand(rng, Float32, 2, 3) |> gdev
+y = rand(rng, Float32, 2, 3) |> gdev
 
 model(x, ps, st)
````
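Editor's note on the new quickstart: the custom `ContinuousDEQSolver` is replaced by a stock `NonlinearSolve.NewtonRaphson()` root finder, and the batch size grows from 1 to 3. As a rough illustration of how `y` would be used downstream, here is a minimal single-step training sketch; it assumes the quickstart definitions above are in scope, and the `mse_loss` closure plus the use of Optimisers.jl are illustrative choices, not APIs exercised by this commit.

```julia
# Illustrative only: one gradient-descent step on the quickstart model.
using Optimisers

function mse_loss(ps)
    ŷ, _ = model(x, ps, st)                # forward pass solves the fixed point
    return sum(abs2, ŷ .- y) / size(y, 2)  # mean squared error over the batch
end

gs = only(Zygote.gradient(mse_loss, ps))   # adjoint through the equilibrium
opt = Optimisers.setup(Optimisers.Descent(0.01f0), ps)
opt, ps = Optimisers.update(opt, ps, gs)
```

The docs/src/index.md hunk below applies the same quickstart change.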
docs/src/index.md

Lines changed: 6 additions & 9 deletions

````diff
@@ -17,26 +17,23 @@ Pkg.add("DeepEquilibriumNetworks")
 ## Quick-start
 
 ```julia
-using DeepEquilibriumNetworks, Lux, Random, Zygote
+using DeepEquilibriumNetworks, Lux, Random, NonlinearSolve, Zygote
 # using LuxCUDA, LuxAMDGPU ## Install and Load for GPU Support
 
 seed = 0
 rng = Random.default_rng()
 Random.seed!(rng, seed)
+
 model = Chain(Dense(2 => 2),
-    DeepEquilibriumNetwork(Parallel(+,
-            Dense(2 => 2; use_bias=false),
-            Dense(2 => 2; use_bias=false)),
-        ContinuousDEQSolver(; abstol=0.1f0, reltol=0.1f0, abstol_termination=0.1f0,
-            reltol_termination=0.1f0);
-        save_everystep=true))
+    DeepEquilibriumNetwork(Parallel(+, Dense(2 => 2; use_bias=false),
+            Dense(2 => 2; use_bias=false)), NewtonRaphson()))
 
 gdev = gpu_device()
 cdev = cpu_device()
 
 ps, st = Lux.setup(rng, model) |> gdev
-x = rand(rng, Float32, 2, 1) |> gdev
-y = rand(rng, Float32, 2, 1) |> gdev
+x = rand(rng, Float32, 2, 3) |> gdev
+y = rand(rng, Float32, 2, 3) |> gdev
 
 model(x, ps, st)
````
ext/DeepEquilibriumNetworksSciMLSensitivityExt.jl

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,7 +4,7 @@ using SciMLBase, SciMLSensitivity
 import DeepEquilibriumNetworks: __default_sensealg
 
 @inline __default_sensealg(::SteadyStateProblem) = SteadyStateAdjoint(;
-    autojacvec=ZygoteVJP())
+    autojacvec=ZygoteVJP(), linsolve_kwargs=(; maxiters=10, abstol=1e-3, reltol=1e-3))
 @inline __default_sensealg(::ODEProblem) = GaussAdjoint(; autojacvec=ZygoteVJP())
 
 end
```
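For context (editor's note, not part of the diff): `SteadyStateAdjoint` differentiates through the equilibrium via the implicit function theorem rather than by unrolling solver iterations. Writing the fixed point as $z^* = f(z^*, x, \theta)$, the backward pass requires one linear solve of roughly the form

$$\left(I - \frac{\partial f}{\partial z}\Big|_{z^*}\right)^{\top} \lambda = \left(\frac{\partial L}{\partial z^*}\right)^{\top},$$

so the new `linsolve_kwargs = (; maxiters=10, abstol=1e-3, reltol=1e-3)` default caps the cost of that iterative solve, trading a small amount of adjoint accuracy for speed, in the spirit of inexact (Jacobian-free) DEQ backpropagation.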

src/layers.jl

Lines changed: 23 additions & 8 deletions

````diff
@@ -22,6 +22,22 @@ Stores the solution of a DeepEquilibriumNetwork and its variants.
     original
 end
 
+function CRC.rrule(::Type{<:DeepEquilibriumSolution}, z_star, u0, residual, jacobian_loss,
+        nfe, original)
+    sol = DeepEquilibriumSolution(z_star, u0, residual, jacobian_loss, nfe, original)
+    ∇DeepEquilibriumSolution(::CRC.NoTangent) = ntuple(_ -> CRC.NoTangent(), 7)
+    function ∇DeepEquilibriumSolution(∂sol)
+        ∂z_star = ∂sol.z_star
+        ∂u0 = ∂sol.u0
+        ∂residual = ∂sol.residual
+        ∂jacobian_loss = ∂sol.jacobian_loss
+        ∂nfe = ∂sol.nfe
+        ∂original = CRC.NoTangent()
+        return (CRC.NoTangent(), ∂z_star, ∂u0, ∂residual, ∂jacobian_loss, ∂nfe, ∂original)
+    end
+    return sol, ∇DeepEquilibriumSolution
+end
+
 function DeepEquilibriumSolution()
     return DeepEquilibriumSolution(ntuple(Returns(nothing), 4)..., 0, nothing)
 end
@@ -66,8 +82,8 @@ function (deq::DEQ)(x, ps, st::NamedTuple, ::Val{true})
     repeated_model = RepeatedLayer(deq.model; repeats=st.fixed_depth)
 
     zˢᵗᵃʳ, st_ = repeated_model((z, x), ps.model, st.model)
-    model = Lux.Experimental.StatefulLuxLayer(deq.model, ps.model, st_)
-    resid = CRC.ignore_derivatives(zˢᵗᵃʳ .- model((zˢᵗᵃʳ, x)))
+    model = Lux.Experimental.StatefulLuxLayer(deq.model, nothing, st_)
+    resid = CRC.ignore_derivatives(zˢᵗᵃʳ .- model((zˢᵗᵃʳ, x), ps.model))
 
     rng = Lux.replicate(st.rng)
     jac_loss = __estimate_jacobian_trace(__getproperty(deq, Val(:jacobian_regularization)),
@@ -156,7 +172,7 @@ function DeepEquilibriumNetwork(model, solver; init=missing,
     if init === missing # Regular DEQ
         init = WrappedFunction(Base.Fix1(__zeros_init, __getproperty(model, Val(:scales))))
     elseif init === nothing # SkipRegDEQ
-        init = nothing
+        init = NoOpLayer()
     elseif !(init isa AbstractExplicitLayer)
         init = Lux.transform(init)
     end
@@ -225,8 +241,7 @@ model(x, ps, st)
 ```
 """
 function MultiScaleDeepEquilibriumNetwork(main_layers::Tuple, mapping_layers::Matrix,
-        post_fuse_layer::Union{Nothing, Tuple}, solver,
-        scales::NTuple{N, NTuple{L, Int64}}; kwargs...) where {N, L}
+        post_fuse_layer::Union{Nothing, Tuple}, solver, scales; kwargs...)
     l1 = Parallel(nothing, main_layers...)
     l2 = BranchLayer(Parallel.(+, map(x -> tuple(x...), eachrow(mapping_layers))...)...)
 
@@ -254,8 +269,7 @@ creates a [`MultiScaleDeepEquilibriumNetwork`](@ref) with `init` kwarg set to pa
 If `init` is not passed, it creates a MultiScale Regularized Deep Equilibrium Network.
 """
 function MultiScaleSkipDeepEquilibriumNetwork(main_layers::Tuple, mapping_layers::Matrix,
-        post_fuse_layer::Union{Nothing, Tuple}, init::Tuple, solver,
-        scales::NTuple{N, NTuple{L, Int64}}; kwargs...) where {N, L}
+        post_fuse_layer::Union{Nothing, Tuple}, init::Tuple, solver, scales; kwargs...)
     init = Chain(Parallel(nothing, init...), x -> mapreduce(__flatten, vcat, x))
     return MultiScaleDeepEquilibriumNetwork(main_layers, mapping_layers, post_fuse_layer,
         solver, scales; init, kwargs...)
@@ -279,7 +293,8 @@ function MultiScaleNeuralODE(args...; kwargs...)
 end
 
 ## Generate Initial Condition
-@inline function __get_initial_condition(deq::DEQ{pType, Nothing}, x, ps, st) where {pType}
+@inline function __get_initial_condition(deq::DEQ{pType, NoOpLayer}, x, ps,
+        st) where {pType}
     zₓ = __zeros_init(__getproperty(deq.model, Val(:scales)), x)
     z, st_ = deq.model((zₓ, x), ps.model, st.model)
     return z, (; st..., model=st_)
````
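Editor's note on the new `CRC.rrule`: without it, Zygote would try to construct a pullback through the `DeepEquilibriumSolution` constructor field by field, including the non-differentiable `original` solver object. The rule instead builds the solution and maps the structural tangent of the output back onto the positional constructor arguments, marking `original` (and the constructor itself) as `NoTangent`. A self-contained toy version of the same constructor-rrule pattern, using a hypothetical `Point` type, looks like this:

```julia
# Toy sketch of the constructor-rrule pattern above; `Point` is hypothetical.
using ChainRulesCore, Zygote

struct Point{T}
    x::T
    y::T
end

function ChainRulesCore.rrule(::Type{<:Point}, x, y)
    p = Point(x, y)
    # Map the structural tangent of the output back to the positional args.
    ∇Point(∂p) = (NoTangent(), ∂p.x, ∂p.y)
    ∇Point(::NoTangent) = (NoTangent(), NoTangent(), NoTangent())
    return p, ∇Point
end

Zygote.gradient((a, b) -> 3.0 * Point(a, b).x + 2.0 * Point(a, b).y, 1.0, 2.0)
# == (3.0, 2.0)
```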

src/utils.jl

Lines changed: 2 additions & 0 deletions

```diff
@@ -93,6 +93,8 @@ function __estimate_jacobian_trace(::AutoFiniteDiff, model, ps, z, x, rng)
             z[idx] = _z
         end
     end
+
+    return res
 end
 
 __estimate_jacobian_trace(::Nothing, model, ps, z, x, rng) = zero(eltype(x))
```
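Editor's note: the bug fixed here is that the `AutoFiniteDiff` branch of `__estimate_jacobian_trace` filled `res` inside the loop but fell off the end of the function without returning it. The quantity being estimated is a Hutchinson-style stochastic estimate of a Jacobian trace; a standalone sketch of that general technique (not the package's exact implementation) follows.

```julia
# Standalone Hutchinson estimator: tr(J) ≈ E[vᵀ J v] for random probe vectors
# v with E[v vᵀ] = I. Generic sketch, not DeepEquilibriumNetworks' code.
using LinearAlgebra, Random, Zygote

function hutchinson_trace(f, z::AbstractVector; rng=Random.default_rng(), nsamples=10)
    y, back = Zygote.pullback(f, z)
    @assert length(y) == length(z) "f must map Rⁿ to Rⁿ for a square Jacobian"
    est = 0.0
    for _ in 1:nsamples
        v = rand(rng, (-1.0, 1.0), length(z))  # Rademacher probe vector
        vJ = only(back(v))                     # vᵀ J via one reverse-mode pass
        est += dot(vJ, v)                      # accumulate vᵀ J v
    end
    return est / nsamples
end

hutchinson_trace(z -> 2.0 .* z, randn(4))  # 8.0 exactly, since J = 2I₄
```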

test/Project.toml

Lines changed: 2 additions & 2 deletions

```diff
@@ -8,6 +8,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531"
 NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
+NonlinearSolve = "8913a72c-1f9b-4ce2-8d82-65094dcecaec"
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
@@ -20,5 +21,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
-Aqua = "0.8"
-julia = "1.6"
+Aqua = "0.8"
```

test/layers.jl

Lines changed: 23 additions & 2 deletions

```diff
@@ -3,6 +3,14 @@ using ADTypes, DeepEquilibriumNetworks, DiffEqBase, NonlinearSolve, OrdinaryDiff
 
 include("test_utils.jl")
 
+function loss_function(model, x, ps, st)
+    y, st = model(x, ps, st)
+    l1 = y isa Tuple ? sum(Base.Fix1(sum, abs2), y) : sum(abs2, y)
+    l2 = st.solution.jacobian_loss
+    l3 = sum(abs2, st.solution.z_star .- st.solution.u0)
+    return l1 + l2 + l3
+end
+
 @testset "DeepEquilibriumNetwork" begin
     rng = __get_prng(0)
@@ -22,6 +30,8 @@
     @testset "x_size: $(x_size)" for (base_model, init_model, x_size) in zip(base_models,
             init_models, x_sizes)
+        @info solver, mtype, jacobian_regularization, base_model, init_model, x_size
+
         model = if mtype === :deq
             DeepEquilibriumNetwork(base_model, solver; jacobian_regularization)
         elseif mtype === :skipdeq
@@ -47,6 +57,11 @@
         @test st.solution isa DeepEquilibriumSolution
         @test maximum(abs, st.solution.residual) ≤ 1e-3
 
+        _, gs_x, gs_ps, _ = Zygote.gradient(loss_function, model, x, ps, st)
+
+        @test __is_finite_gradient(gs_x)
+        @test __is_finite_gradient(gs_ps)
+
         ps, st = Lux.setup(rng, model)
         st = Lux.update_state(st, :fixed_depth, Val(10))
         @test st.solution == DeepEquilibriumSolution()
@@ -58,6 +73,11 @@
         @test size(z) == size(x)
         @test st.solution isa DeepEquilibriumSolution
         @test st.solution.nfe == 10
+
+        _, gs_x, gs_ps, _ = Zygote.gradient(loss_function, model, x, ps, st)
+
+        @test __is_finite_gradient(gs_x)
+        @test __is_finite_gradient(gs_ps)
         end
     end
 end
@@ -91,11 +111,12 @@
     jacobian_regularizations = (nothing, AutoFiniteDiff(), AutoZygote())
 
     for mtype in model_type, jacobian_regularization in jacobian_regularizations
-
        @testset "Solver: $(__nameof(solver))" for solver in solvers
-
            @testset "x_size: $(x_size)" for (main_layer, mapping_layer, init_layer, x_size, scale) in zip(main_layers,
                    mapping_layers, init_layers, x_sizes, scales)
+                @info solver, mtype, jacobian_regularization, main_layer, mapping_layer,
+                    init_layer, x_size, scale
+
                model = if mtype === :deq
                    MultiScaleDeepEquilibriumNetwork(main_layer, mapping_layer, nothing,
                        solver,
```

test/test_utils.jl

Lines changed: 10 additions & 12 deletions

```diff
@@ -5,19 +5,17 @@ __nameof(::X) where {X} = nameof(X)
 
 __get_prng(seed::Int) = StableRNG(seed)
 
-# is_finite_gradient(x::AbstractArray) = all(isfinite, x)
+__is_finite_gradient(x::AbstractArray) = all(isfinite, x)
 
-# function is_finite_gradient(gs::NamedTuple)
-#     gradient_is_finite = [true]
-#     function _is_gradient_finite(x)
-#         if !isnothing(x) && !all(isfinite, x)
-#             gradient_is_finite[1] = false
-#         end
-#         return x
-#     end
-#     Functors.fmap(_is_gradient_finite, gs)
-#     return gradient_is_finite[1]
-# end
+function __is_finite_gradient(gs::NamedTuple)
+    gradient_is_finite = Ref(true)
+    function __is_gradient_finite(x)
+        !isnothing(x) && !all(isfinite, x) && (gradient_is_finite[] = false)
+        return x
+    end
+    fmap(__is_gradient_finite, gs)
+    return gradient_is_finite[]
+end
 
 function __get_dense_layer(args...; kwargs...)
     init_weight(rng::AbstractRNG, dims...) = randn(rng, Float32, dims) .* 0.001f0
```
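A quick usage sketch of the revived helper (hypothetical gradient values; `fmap` is `Functors.fmap`, which the test utilities are assumed to bring into scope):

```julia
# Hypothetical inputs illustrating __is_finite_gradient.
using Functors

gs_ok  = (; weight = Float32[1 2; 3 4], bias = nothing)
gs_bad = (; weight = Float32[1 NaN; 3 4], bias = nothing)

__is_finite_gradient(gs_ok)   # true: `nothing` leaves are skipped
__is_finite_gradient(gs_bad)  # false: the NaN flips the Ref
```

Using a `Ref` rather than the old one-element vector is the idiomatic way to carry a mutable flag through `fmap`.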
