
Commit 4155bcd

explicitly preserve eltype (#56)
1 parent adc0e85 commit 4155bcd
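A quick editorial illustration of the behaviour named in the commit title (not part of the diff; it mirrors the "eltype preservation" testset added in test/runtests.jl below):

```julia
using Optimisers

# Float32 parameters stay Float32 after an update, even though both the
# learning rate (0.1) and the gradient here are Float64 values.
x = Float32[1, 2, 3]
state = Optimisers.setup(Descent(0.1), x)
_, x2 = Optimisers.update(state, x, [0.1, 0.2, 0.3])
eltype(x2)  # Float32
```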

File tree: 5 files changed (+26 / -16 lines)

src/Optimisers.jl

Lines changed: 4 additions & 4 deletions
@@ -24,11 +24,11 @@ The initial state is `init(rule::RuleType, parameters)`.
 
 # Example
 ```jldoctest
-julia> Optimisers.init(Descent(0.1), [1,2,3]) === nothing
+julia> Optimisers.init(Descent(0.1), Float32[1,2,3]) === nothing
 true
 
-julia> Optimisers.apply!(Descent(0.1), nothing, [1,2,3], [4,5,6])
-(nothing, Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}}(*, ([4, 5, 6], 0.1)))
+julia> Optimisers.apply!(Descent(0.1), nothing, Float32[1,2,3], [4,5,6])
+(nothing, Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}}(*, ([4, 5, 6], 0.1f0)))
 ```
 """
 apply!
@@ -41,7 +41,7 @@ This and [`apply!`](@ref) are the two functions which any new optimisation rule
 
 # Examples
 ```jldoctest
-julia> Optimisers.init(Descent(), [1,2,3]) # is `nothing`
+julia> Optimisers.init(Descent(), Float32[1,2,3]) # is `nothing`
 
 julia> Optimisers.init(Momentum(), [1.0, 2.0])
 2-element Vector{Float64}:
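The docstring examples now use `Float32` parameters so that the displayed learning rate is `0.1f0`. A hedged check of what that output implies (a sketch, not taken from the diff): materialising the lazy `Broadcasted` returned by `apply!` keeps the parameter eltype.

```julia
using Optimisers

# Descent converts its 0.1 hyperparameter to the parameter eltype (that is
# what the 0.1f0 in the updated doctest shows), so materialising the lazy
# broadcast gives a Float32 result rather than Float64.
_, dx = Optimisers.apply!(Descent(0.1), nothing, Float32[1, 2, 3], [4, 5, 6])
eltype(Base.materialize(dx))  # Float32
```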

src/interface.jl

Lines changed: 3 additions & 3 deletions
@@ -21,7 +21,7 @@ function setup(rule, x; seen = Base.IdSet())
   end
 end
 
-subtract!(x, x̄) = iswriteable(x) ? (x .= x .- x̄) : (x .- x̄)
+subtract!(x, x̄) = iswriteable(x) ? (x .= x .- x̄) : eltype(x).(x .- x̄)
 
 update!(::Nothing, x, ::Zero, ::Zero...) = nothing, x
 update!(::Nothing, x, x̄s...) = nothing, x
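The `subtract!` change is central to the commit: when the parameters cannot be updated in place, the out-of-place result `x .- x̄` is converted back to `eltype(x)`. A standalone sketch of that branch (hypothetical helper name, not the package's code):

```julia
# Out-of-place branch only; the real subtract! also keeps the in-place
# `x .= x .- x̄` path for writeable dense arrays.
subtract_oop(x, x̄) = eltype(x).(x .- x̄)

x = Float32[1, 2, 3]
x̄ = [0.1, 0.2, 0.3]          # a Float64 gradient
eltype(x .- x̄)               # Float64: the plain broadcast promotes
eltype(subtract_oop(x, x̄))   # Float32: the eltype is restored
```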
@@ -50,10 +50,10 @@ end
 apply!(o, state, x, dx, dxs...) = apply!(o, state, x, dx)
 
 isnumeric(x::AbstractArray{<:Number}) = isleaf(x)  # isleaf to allow for e.g. transposed shared weights
-isnumeric(x::AbstractArray{<:Bool}) = false  # convention of ChainRules is that Bool is non-differentiable
+isnumeric(x::AbstractArray{<:Integer}) = false
 isnumeric(x) = false
 
-iswriteable(::DenseArray{<:AbstractFloat}) = true  # more elaborate versions are possible, wait until needed?
+iswriteable(::DenseArray) = true  # more elaborate versions are possible, wait until needed?
 iswriteable(_) = false
 
 """

src/rules.jl

Lines changed: 1 addition & 1 deletion
@@ -525,7 +525,7 @@ This is equivalent to `Descent(1)`.
 ```jldoctest
 julia> o = OptimiserChain(ClipGrad(1), Descent(0.1));
 
-julia> m = ([0,0,0],);
+julia> m = (zeros(3),);
 
 julia> s = Optimisers.setup(o, m)
 (Leaf(OptimiserChain(ClipGrad{Int64}(1), Descent{Float64}(0.1)), [nothing, nothing]),)
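The doctest model changes from `([0,0,0],)` to `(zeros(3),)`, presumably because integer arrays no longer count as numeric leaves after the `isnumeric` change above, so an `Int` vector would no longer receive a `Leaf` in `setup`. A small sketch of the new predicate's effect (hypothetical name, simplified signature):

```julia
# Simplified restatement of the dispatch added in src/interface.jl; the real
# method for Number arrays also checks isleaf to handle shared weights.
isnumeric_sketch(x::AbstractArray{<:Number})  = true
isnumeric_sketch(x::AbstractArray{<:Integer}) = false   # covers Bool too
isnumeric_sketch(x) = false

isnumeric_sketch([0, 0, 0])  # false: an Int array gets no optimiser state
isnumeric_sketch(zeros(3))   # true: a Float64 array stays trainable
```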

test/rules.jl

Lines changed: 1 addition & 5 deletions
@@ -118,11 +118,7 @@ end
 # Static version is truly out-of-place:
 mstatic = (SA{Float32}[1,2], SA{Float64}[3,4])  # , SA{Float16}[5,6]) with Float16, all fail
 upstatic = Optimisers.update(Optimisers.setup(o, mstatic), mstatic, mstatic)[2]
-if o isa OptimiserChain && o.opts[2] isa ADAM  # These promote to Float64
-  @test_broken map(eltype, upstatic) == types[1:2]
-else
-  @test map(eltype, upstatic) == types[1:2]
-end
+@test map(eltype, upstatic) == types[1:2]
 @test upstatic[1] isa SVector
 
 # With ordinary Array gradient, what happens? Not so important!
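The deleted branch singled out chains whose second rule is `ADAM`, which previously promoted StaticArray parameters to `Float64`; the test now asserts eltype preservation unconditionally. A hedged reconstruction of that case (the surrounding loop, the first rule in the chain, and the `types` tuple are not shown in this hunk, so the exact combination is an assumption):

```julia
using StaticArrays, Optimisers

# An OptimiserChain ending in ADAM, applied to static arrays of mixed precision.
o = OptimiserChain(ClipGrad(1), ADAM())
mstatic = (SA{Float32}[1, 2], SA{Float64}[3, 4])
upstatic = Optimisers.update(Optimisers.setup(o, mstatic), mstatic, mstatic)[2]
map(eltype, upstatic)  # (Float32, Float64): the Float32 entry is no longer promoted
```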

test/runtests.jl

Lines changed: 17 additions & 3 deletions
@@ -38,7 +38,7 @@ Optimisers.trainable(x::TwoThirds) = (a = x.a,)
   end
 
   @testset "gradient clipping" begin
-    m = (α = ([0], sin), γ = rand(3))
+    m = (α = ([0.0], sin), γ = rand(3))
     s1 = Optimisers.setup(ClipGrad(13), m)
     _, m1 = Optimisers.update(s1, m, (α = nothing, γ = [1,10,100],))
     @test m.γ .- m1.γ ≈ [1, 10, 13]
@@ -58,7 +58,7 @@ Optimisers.trainable(x::TwoThirds) = (a = x.a,)
   end
 
   @testset "OptimiserChain" begin
-    x = [1,10,100]; dx = [1,2,3];
+    x = [1, 10, 100.0]; dx = [1, 2, 3.0];
     @test Optimisers.update(Optimisers.setup(WeightDecay(0.1), x), x, dx)[2] ≈ [1-0.1-1, 10-1-2, 100-10-3]
     @test Optimisers.update(Optimisers.setup(ClipGrad(2), x), x, dx)[2] ≈ [1-1, 10-2, 100-2]

@@ -81,7 +81,7 @@ Optimisers.trainable(x::TwoThirds) = (a = x.a,)
 
   @testset "trainable subset" begin
     # Foo has an old-style tuple trainable, both elements
-    mf = Foo([1,2], (a = sin, b = [3,4], c = 5))
+    mf = Foo([1.0, 2.0], (a = sin, b = [3.0, 4.0], c = 5))
     sf = Optimisers.setup(Descent(0.1), mf)
     gf = (x = nothing, y = (a = nothing, b = [1,1], c = 1))
     _, mf2 = Optimisers.update(sf, mf, gf)
@@ -116,6 +116,20 @@ Optimisers.trainable(x::TwoThirds) = (a = x.a,)
     @test Optimisers.update!(s, m, g...)[2] isa Foo
   end
 
+  @testset "eltype preservation" begin
+    m = (Float16[1,2], Float32[3,4])
+    s1 = Optimisers.setup(Descent(0.1), m)
+    s2, m2 = Optimisers.update(s1, m, m)
+    @test eltype(m2[1]) == Float16  # because update copies & calls update!
+    @test eltype(m2[2]) == Float32
+
+    staticm = (SA{Float16}[1,2], SA{Float32}[3,4])
+    s3 = Optimisers.setup(Descent(0.1), staticm)
+    s4, m4 = Optimisers.update(s3, staticm, staticm)
+    @test eltype(m4[1]) == Float16  # because of explicit broadcast in subtract!
+    @test eltype(m4[2]) == Float32
+  end
+
   @testset "forgotten gradient" begin
     x = [1.0, 2.0]
     sx = Optimisers.setup(Descent(), x)
