Skip to content

Commit ee4c130

Browse files
bors[bot] and mcabbott
authored
Merge #1661
1661: Deprecate `Flux.zeros` r=mcabbott a=mcabbott Seems like a footgun to have a function with the same name & almost the same function as Base's. It seems to have been used inconsistently, with some functions defining their own closure to be less confusing. This gives it a new name, `zeros32`, and uses it everywhere. Ditto `ones32`. Co-authored-by: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
2 parents 1022f1c + 0263e30 commit ee4c130

File tree

7 files changed

+42
-37
lines changed

7 files changed

+42
-37
lines changed

src/deprecations.jl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,16 @@ function Base.getproperty(a::Dense, s::Symbol)
1818
end
1919
return getfield(a, s)
2020
end
21+
22+
# Deprecated: `Flux.ones` shadowed `Base.ones` with a Float32 default, which
# proved to be a footgun. Warn, then behave exactly as before so old callers
# still receive an Array{Float32}.
function ones(dims...)
  Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", :ones)
  # `Base.depwarn` returns `nothing`; without this explicit call the shim
  # would break every deprecated caller expecting an array.
  return Base.ones(Float32, dims...)
end
25+
# With an explicit element type there is no ambiguity with Base, so forward
# silently — no deprecation message needed.
function ones(T::Type, dims...)
  return Base.ones(T, dims...)
end
26+
27+
# Deprecated: `Flux.zeros` shadowed `Base.zeros` with a Float32 default.
# Warn, then behave exactly as before so old callers still receive an
# Array{Float32}.
function zeros(dims...)
  # funcsym must be `:zeros` — it was copy-pasted as `:ones`, which merges
  # this warning's per-function bookkeeping with the `ones` deprecation.
  Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", :zeros)
  # `Base.depwarn` returns `nothing`; return the array the old API produced.
  return Base.zeros(Float32, dims...)
end
30+
# Explicit element type is unambiguous — forward to Base with no warning.
function zeros(T::Type, dims...)
  return Base.zeros(T, dims...)
end
31+
32+
# Guard method: reject `ones32(T, dims...)` calls, pointing users at the
# correct API for choosing an element type.
function ones32(::Type, dims...)
  throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type"))
end
33+
# Guard method: reject `zeros32(T, dims...)` calls, pointing users at the
# correct API for choosing an element type.
function zeros32(::Type, dims...)
  throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type"))
end

src/layers/basic.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,13 +180,13 @@ function Diagonal(sz::Integer...; initα = nothing, initβ = nothing)
180180
Base.depwarn("keyword initα is deprecated, please simply supply the desired vectors", :Diagonal)
181181
initα(sz...)
182182
else
183-
ones(sz...)
183+
ones32(sz...)
184184
end
185185
β = if initβ !== nothing
186186
Base.depwarn("keyword initβ is deprecated, please simply supply the desired vectors", :Diagonal)
187187
initβ(sz...)
188188
else
189-
zeros(sz...)
189+
zeros32(sz...)
190190
end
191191
Diagonal(α, β)
192192
end

src/layers/normalise.jl

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ end
198198

199199
"""
200200
BatchNorm(channels::Integer, λ=identity;
201-
initβ=zeros, initγ=ones,
201+
initβ=zeros32, initγ=ones32,
202202
ϵ=1f-5, momentum= 0.1f0)
203203
204204
[Batch Normalization](https://arxiv.org/abs/1502.03167) layer.
@@ -246,15 +246,14 @@ mutable struct BatchNorm{F,V,N,W}
246246
end
247247

248248
function BatchNorm(chs::Int, λ=identity;
249-
initβ = i -> zeros(Float32, i),
250-
initγ = i -> ones(Float32, i),
249+
initβ=zeros32, initγ=ones32,
251250
affine=true, track_stats=true,
252251
ϵ=1f-5, momentum=0.1f0)
253252

254253
β = affine ? initβ(chs) : nothing
255254
γ = affine ? initγ(chs) : nothing
256-
μ = track_stats ? zeros(Float32, chs) : nothing
257-
σ² = track_stats ? ones(Float32, chs) : nothing
255+
μ = track_stats ? zeros32(chs) : nothing
256+
σ² = track_stats ? ones32(chs) : nothing
258257

259258
return BatchNorm(λ, β, γ,
260259
μ, σ², ϵ, momentum,
@@ -286,7 +285,7 @@ end
286285

287286
"""
288287
InstanceNorm(channels::Integer, λ=identity;
289-
initβ=zeros, initγ=ones,
288+
initβ=zeros32, initγ=ones32,
290289
affine=false, track_stats=false,
291290
ϵ=1f-5, momentum=0.1f0)
292291
@@ -323,15 +322,14 @@ mutable struct InstanceNorm{F,V,N,W}
323322
end
324323

325324
function InstanceNorm(chs::Int, λ=identity;
326-
initβ = i -> zeros(Float32, i),
327-
initγ = i -> ones(Float32, i),
325+
initβ=zeros32, initγ=ones32,
328326
affine=false, track_stats=false,
329327
ϵ=1f-5, momentum=0.1f0)
330328

331329
β = affine ? initβ(chs) : nothing
332330
γ = affine ? initγ(chs) : nothing
333-
μ = track_stats ? zeros(Float32, chs) : nothing
334-
σ² = track_stats ? ones(Float32, chs) : nothing
331+
μ = track_stats ? zeros32(chs) : nothing
332+
σ² = track_stats ? ones32(chs) : nothing
335333

336334
return InstanceNorm(λ, β, γ,
337335
μ, σ², ϵ, momentum,
@@ -363,8 +361,7 @@ end
363361

364362
"""
365363
GroupNorm(channels::Integer, G::Integer, λ=identity;
366-
initβ = (i) -> zeros(Float32, i),
367-
initγ = (i) -> ones(Float32, i),
364+
initβ=zeros32, initγ=ones32,
368365
affine=true, track_stats=false,
369366
ϵ=1f-5, momentum=0.1f0)
370367
@@ -406,17 +403,16 @@ end
406403
trainable(gn::GroupNorm) = hasaffine(gn) ? (gn.β, gn.γ) : ()
407404

408405
function GroupNorm(chs::Int, G::Int, λ=identity;
409-
initβ = (i) -> zeros(Float32, i),
410-
initγ = (i) -> ones(Float32, i),
406+
initβ=zeros32, initγ=ones32,
411407
affine=true, track_stats=false,
412-
ϵ=1f-5, momentum=0.1f0)
408+
ϵ=1f-5, momentum=0.1f0)
413409

414410
chs % G == 0 || error("The number of groups ($(G)) must divide the number of channels ($chs)")
415411

416412
β = affine ? initβ(chs) : nothing
417413
γ = affine ? initγ(chs) : nothing
418-
μ = track_stats ? zeros(Float32, G) : nothing
419-
σ² = track_stats ? ones(Float32, G) : nothing
414+
μ = track_stats ? zeros32(G) : nothing
415+
σ² = track_stats ? ones32(G) : nothing
420416

421417
return GroupNorm(G, λ,
422418
β, γ,

src/layers/recurrent.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ struct RNNCell{F,A,V,S}
7777
state0::S
7878
end
7979

80-
RNNCell(in::Integer, out::Integer, σ=tanh; init=Flux.glorot_uniform, initb=zeros, init_state=zeros) =
80+
RNNCell(in::Integer, out::Integer, σ=tanh; init=Flux.glorot_uniform, initb=zeros32, init_state=zeros32) =
8181
RNNCell(σ, init(out, in), init(out, out), initb(out), init_state(out,1))
8282

8383
function (m::RNNCell{F,A,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {F,A,V,T}
@@ -127,8 +127,8 @@ end
127127

128128
function LSTMCell(in::Integer, out::Integer;
129129
init = glorot_uniform,
130-
initb = zeros,
131-
init_state = zeros)
130+
initb = zeros32,
131+
init_state = zeros32)
132132
cell = LSTMCell(init(out * 4, in), init(out * 4, out), initb(out * 4), (init_state(out,1), init_state(out,1)))
133133
cell.b[gate(out, 2)] .= 1
134134
return cell
@@ -190,7 +190,7 @@ struct GRUCell{A,V,S}
190190
state0::S
191191
end
192192

193-
GRUCell(in, out; init = glorot_uniform, initb = zeros, init_state = zeros) =
193+
GRUCell(in, out; init = glorot_uniform, initb = zeros32, init_state = zeros32) =
194194
GRUCell(init(out * 3, in), init(out * 3, out), initb(out * 3), init_state(out,1))
195195

196196
function (m::GRUCell{A,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {A,V,T}

src/utils.jl

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ julia> Flux.identity_init(3,3,2,2)
346346
```
347347
"""
348348
# Assume bias
349-
identity_init(cols; gain=1, shift=0) = zeros(Float32, cols)
349+
identity_init(cols; gain=1, shift=0) = zeros32(cols)
350350

351351
# Assume matrix multiplication
352352
identity_init(rows, cols; gain=1, shift=0) = circshift(Matrix{Float32}(I * gain, rows,cols), shift)
@@ -355,7 +355,7 @@ identity_init(rows, cols; gain=1, shift=0) = circshift(Matrix{Float32}(I * gain,
355355
function identity_init(dims...; gain=1, shift=0)
356356
nin, nout = dims[end-1], dims[end]
357357
centers = map(d -> cld(d, 2), dims[1:end-2])
358-
weights = zeros(Float32, dims)
358+
weights = zeros32(dims)
359359
for i in 1:min(nin,nout)
360360
weights[centers..., i, i] = gain
361361
end
@@ -366,12 +366,8 @@ identity_init(::AbstractRNG, dims...; kwargs...) = identity_init(dims...; kwargs
366366
identity_init(; init_kwargs...) = identity_init(Random.GLOBAL_RNG; init_kwargs...)
367367
identity_init(rng::AbstractRNG; init_kwargs...) = (args...;kwargs...) -> identity_init(rng, args...; init_kwargs..., kwargs...)
368368

369-
370-
ones(T::Type, dims...) = Base.ones(T, dims...)
371-
zeros(T::Type, dims...) = Base.zeros(T, dims...)
372-
373-
ones(dims...) = Base.ones(Float32, dims...)
374-
zeros(dims...) = Base.zeros(Float32, dims...)
369+
"""
    ones32(dims...)

Return an `Array{Float32}` of ones of the given size — shorthand for
`Base.ones(Float32, dims...)`.
"""
function ones32(dims...)
  return Base.ones(Float32, dims...)
end
370+
"""
    zeros32(dims...)

Return an `Array{Float32}` of zeros of the given size — shorthand for
`Base.zeros(Float32, dims...)`.
"""
function zeros32(dims...)
  return Base.zeros(Float32, dims...)
end
375371

376372
"""
377373
create_bias(weights, bias, length)

test/layers/basic.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ import Flux: activations
166166
@test b3.bias isa Vector{Float16}
167167
@test size(b3(rand(4), rand(5))) == (3,)
168168

169-
b4 = Flux.Bilinear(3,3,7; bias=1:7, init=Flux.zeros)
169+
b4 = Flux.Bilinear(3,3,7; bias=1:7, init=Flux.zeros32)
170170
@test_skip b4.bias isa Vector{Float32}
171171

172172
@test_throws ArgumentError Flux.Bilinear(rand(3)) # expects a 3-array

test/utils.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -349,20 +349,20 @@ end
349349
import Flux: loadparams!
350350
pars(w, b) = [w, b]
351351
import Flux: loadparams!, Zeros
352-
pars(w, b::Zeros) = [w, Flux.zeros(size(w,1))]
352+
pars(w, b::Zeros) = [w, Flux.zeros32(size(w,1))]
353353
pars(l) = pars(l.W, l.b)
354354
pararray(m) = mapreduce(pars, vcat, m)
355355
weights(m) = mapreduce(l -> [l.W], vcat, m)
356-
@testset "Bias type $bt" for bt in (Flux.zeros, nobias)
356+
@testset "Bias type $bt" for bt in (Flux.zeros32, nobias)
357357
m = dm(bt)
358358
loadparams!(m, params(m))
359359
testdense(m, bt)
360360
end
361361

362362
@testset "$b1 to $b2" for (b1, b2, be) in (
363-
(Flux.zeros, Flux.ones, Flux.ones), # Load ones as bias to a model with zeros as bias -> model gets ones as bias
364-
(Flux.ones, nobias, Flux.zeros), # Load Zeros as bias to a model with ones as bias-> model gets zeros as bias
365-
(nobias, Flux.ones, nobias), # Load ones as bias to a model with Zeros as bias-> model bias does not change
363+
(Flux.zeros32, Flux.ones32, Flux.ones32), # Load ones as bias to a model with zeros as bias -> model gets ones as bias
364+
(Flux.ones32, nobias, Flux.zeros32), # Load Zeros as bias to a model with ones as bias-> model gets zeros as bias
365+
(nobias, Flux.ones32, nobias), # Load ones as bias to a model with Zeros as bias-> model bias does not change
366366
)
367367
m1 = dm(b1)
368368
m2 = dm(b2)

0 commit comments

Comments
 (0)