
Commit b917a32

bors[bot] and atiyo authored
Merge #1454
1454: Add sparse initialization r=CarloLucibello a=atiyo

Add sparse initialization, documentation and tests. Trim whitespace in edited files.

This PR is intended to address one of the outstanding points in bringing Flux to parity with PyTorch's features, so it partially addresses #1431 and fully addresses #1450.

The implementation follows the method given in the [PyTorch implementation](https://pytorch.org/docs/stable/_modules/torch/nn/init.html#sparse_): a normally distributed array is created, then a fixed proportion of randomly chosen row indices is zeroed out in every column. Like the PyTorch version, it is restricted to 2-d Arrays.

### PR Checklist

- [x] Tests are added
- [x] Entry in NEWS.md
- [x] Documentation, if applicable
- [ ] Final review from `@dhairyagandhi96` (for API changes).

Co-authored-by: atiyo <atiyo@users.noreply.github.com>
Co-authored-by: Atiyo Ghosh <atiyo@users.noreply.github.com>
2 parents 8dfe4fa + a31ddf8 commit b917a32
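
As a quick usage sketch (not part of this commit; the layer sizes and `sparsity` value are arbitrary), the new initializer returns a plain weight matrix, so it can be passed straight to a layer constructor. Because `sparsity` has no default, init-style keywords would need a closure that fixes it:

```julia
using Flux

# Build a 64×784 weight matrix with roughly 10% zeros in every column,
# then wrap it in a Dense layer explicitly.
W = Flux.sparse_init(64, 784; sparsity = 0.1)
layer = Dense(W, zeros(Float32, 64), relu)

# If your Flux version exposes an init keyword (e.g. `init` or `initW`),
# a closure can fix the required `sparsity` argument:
# Dense(784, 64, relu; init = (dims...) -> Flux.sparse_init(dims...; sparsity = 0.1))

count(iszero, W) / length(W)   # ≈ 0.1 (exactly ceil(0.1 * 64) / 64 per column)
```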

File tree

4 files changed: +91 -17 lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@
 * Excise datasets in favour of other providers in the julia ecosystem.
 * Added option to set `bias` to [false](https://github.com/FluxML/Flux.jl/pull/1379) to eliminating `bias` from being trained.
 * Removed kwarg only constructors for [`convolutional layers`](https://github.com/FluxML/Flux.jl/pull/1379)).
+* Add [sparse initialization](https://github.com/FluxML/Flux.jl/pull/1454) as described in [Deep learning via Hessian-free optimization](https://dl.acm.org/doi/abs/10.5555/3104322.3104416).
 * Other new features and bug fixes (see GitHub releases page)

 ## v0.11.2

docs/src/utilities.md

Lines changed: 1 addition & 0 deletions

@@ -36,6 +36,7 @@ Flux.glorot_uniform
 Flux.glorot_normal
 Flux.kaiming_uniform
 Flux.kaiming_normal
+Flux.sparse_init
 ```

 ## Model Building

src/utils.jl

Lines changed: 50 additions & 2 deletions

@@ -56,6 +56,7 @@ julia> Flux.glorot_uniform(2, 3)
 * glorot initialization using normal distribution: [`glorot_normal`](@ref Flux.glorot_normal)
 * kaiming initialization using normal distribution: [`kaiming_normal`](@ref Flux.kaiming_normal)
 * kaiming initialization using uniform distribution: [`kaiming_uniform`](@ref Flux.kaiming_uniform)
+* sparse initialization: [`sparse_init`](@ref Flux.sparse_init)
 * calculation of `fan_in` and `fan_out`: [`nfan`](@ref Flux.nfan)

 # References
@@ -88,6 +89,7 @@ julia> Flux.glorot_normal(3, 2)
 * glorot initialization using uniform distribution: [`glorot_uniform`](@ref Flux.glorot_uniform)
 * kaiming initialization using normal distribution: [`kaiming_normal`](@ref Flux.kaiming_normal)
 * kaiming initialization using uniform distribution: [`kaiming_uniform`](@ref Flux.kaiming_uniform)
+* sparse initialization: [`sparse_init`](@ref Flux.sparse_init)
 * calculation of `fan_in` and `fan_out`: [`nfan`](@ref Flux.nfan)

 # References
@@ -120,6 +122,7 @@ julia> Flux.kaiming_uniform(3, 2)
 * kaiming initialization using normal distribution: [`kaiming_normal`](@ref Flux.kaiming_normal)
 * glorot initialization using normal distribution: [`glorot_normal`](@ref Flux.glorot_normal)
 * glorot initialization using uniform distribution: [`glorot_uniform`](@ref Flux.glorot_uniform)
+* sparse initialization: [`sparse_init`](@ref Flux.sparse_init)
 * calculation of `fan_in` and `fan_out`: [`nfan`](@ref Flux.nfan)

 # References
@@ -156,6 +159,7 @@ julia> Flux.kaiming_normal(3, 2)
 * kaiming initialization using uniform distribution: [`kaiming_uniform`](@ref Flux.kaiming_uniform)
 * glorot initialization using normal distribution: [`glorot_normal`](@ref Flux.glorot_normal)
 * glorot initialization using uniform distribution: [`glorot_uniform`](@ref Flux.glorot_uniform)
+* sparse initialization: [`sparse_init`](@ref Flux.sparse_init)
 * calculation of `fan_in` and `fan_out`: [`nfan`](@ref Flux.nfan)

 # References
@@ -170,14 +174,58 @@ end
 kaiming_normal(dims...; kwargs...) = kaiming_normal(Random.GLOBAL_RNG, dims...; kwargs...)
 kaiming_normal(rng::AbstractRNG; kwargs...) = (dims...; kwargs...) -> kaiming_normal(rng, dims...; kwargs...)

+"""
+    sparse_init([rng=GLOBAL_RNG], dims...; sparsity, std = 0.01)
+
+Return an `Array` of size `dims` where each column contains a fixed fraction of
+zero elements given by `sparsity`. Non-zero elements are normally distributed
+with a mean of zero and standard deviation `std`.
+
+This method is described in [1].
+
+# Examples
+```jldoctest; setup = :(using Random; Random.seed!(0))
+julia> Flux.sparse_init(3, 2, sparsity=0.1)
+3×2 Array{Float32,2}:
+  0.00828413   0.0
+ -0.00353007   0.00297336
+  0.0          0.00586617
+```
+
+# See also
+
+* kaiming initialization using normal distribution: [`kaiming_normal`](@ref Flux.kaiming_normal)
+* kaiming initialization using uniform distribution: [`kaiming_uniform`](@ref Flux.kaiming_uniform)
+* glorot initialization using normal distribution: [`glorot_normal`](@ref Flux.glorot_normal)
+* glorot initialization using uniform distribution: [`glorot_uniform`](@ref Flux.glorot_uniform)
+
+# References
+
+[1] Martens, J, "Deep learning via Hessian-free optimization" _Proceedings of the 27th International Conference on International Conference on Machine Learning_. 2010.
+"""
+function sparse_init(rng::AbstractRNG, dims...; sparsity, std = 0.01)
+  if length(dims) != 2
+    throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization."))
+  end
+  rows, cols = dims
+  prop_zero = min(1.0, sparsity)
+  num_zeros = ceil(Integer, prop_zero * rows)
+  sparse_array = randn(rng, Float32, dims...) .* Float32(std)
+  sparse_array[1:num_zeros, :] .= 0f0
+  return mapslices(shuffle, sparse_array, dims=1)
+end
+
+sparse_init(dims...; kwargs...) = sparse_init(Random.GLOBAL_RNG, dims...; kwargs...)
+sparse_init(rng::AbstractRNG; kwargs...) = (dims...; kwargs...) -> sparse_init(rng, dims...; kwargs...)
+
 ones(T::Type, dims...) = Base.ones(T, dims...)
 zeros(T::Type, dims...) = Base.zeros(T, dims...)

 ones(dims...) = Base.ones(Float32, dims...)
 zeros(dims...) = Base.zeros(Float32, dims...)

 """
-    create_bias(shallcreate::Bool, iftrue, dims...)
+    create_bias(shallcreate::Bool, iftrue, dims...)
     create_bias(x, ::Any...)

 Return a bias parameter for a layer.
@@ -188,7 +236,7 @@ Essentially handles the allowed input options for the `bias` keyword:
 If not a boolean, return self to handle the case of bias=somearray.
 """
 create_bias(shallcreate::Bool, iftrue, dims...) = shallcreate ? iftrue(dims...) : Zeros()
-create_bias(x, ::Any...) = x
+create_bias(x, ::Any...) = x

 """
     unsqueeze(xs, dim)
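
As an illustrative check (not part of the commit), the behaviour promised by the docstring, namely a fixed number of zeros in every column and non-zero entries whose spread matches `std`, can be observed directly; the sizes and keyword values here are arbitrary:

```julia
using Flux, Statistics

v = Flux.sparse_init(100, 8; sparsity = 0.25, std = 0.05)

# Each column contains exactly ceil(100 * 0.25) = 25 zeros, at randomly shuffled rows.
[count(iszero, col) for col in eachcol(v)]   # => [25, 25, 25, 25, 25, 25, 25, 25]

# The remaining entries are drawn from N(0, std), so their sample standard
# deviation lands close to 0.05 (the new tests allow a ±10% band).
std(v[v .!= 0])
```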

test/utils.jl

Lines changed: 39 additions & 15 deletions

@@ -1,5 +1,5 @@
 using Flux
-using Flux: throttle, nfan, glorot_uniform, glorot_normal, kaiming_normal, kaiming_uniform, stack, unstack, Zeros
+using Flux: throttle, nfan, glorot_uniform, glorot_normal, kaiming_normal, kaiming_uniform, sparse_init, stack, unstack, Zeros
 using StatsBase: var, std
 using Random
 using Test
@@ -95,6 +95,30 @@
       @test eltype(v) == Float32
     end
   end
+
+  @testset "sparse_init" begin
+    # sparse_init should yield an error for non 2-d dimensions
+    # sparse_init should yield no zero elements if sparsity < 0
+    # sparse_init should yield all zero elements if sparsity > 1
+    # sparse_init should yield exactly ceil(n_in * sparsity) elements in each column for other sparsity values
+    # sparse_init should yield a kernel in its non-zero elements consistent with the std parameter
+
+    @test_throws ArgumentError sparse_init(100, 100, 100, sparsity=0.1)
+    v = sparse_init(100, 100, sparsity=-0.1)
+    @test sum(v .== 0) == 0
+    @test eltype(v) == Float32
+    v = sparse_init(100, 100, sparsity=1.1)
+    @test sum(v .== 0) == length(v)
+    @test eltype(v) == Float32
+
+    for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)]
+      expected_zeros = ceil(Integer, n_in * sparsity)
+      v = sparse_init(n_in, n_out, sparsity=sparsity, std=σ)
+      @test all([sum(v[:,col] .== 0) == expected_zeros for col in 1:n_out])
+      @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ
+      @test eltype(v) == Float32
+    end
+  end
 end

 @testset "Params" begin
@@ -141,22 +165,22 @@

   @testset "Explicit" begin
     gfun(args...) = gradient((x, y) -> sum(op.(x,y)), args...)
-    g = gfun(o, z)
+    g = gfun(o, z)
     @test gfun(o, Z) == (g[1], nothing)

-    g = gfun(z, o)
+    g = gfun(z, o)
     @test gfun(Z, o) == (nothing, g[2])
   end

   @testset "Implicit" begin
     gfun(args...) = gradient(() -> sum(op.(args...)), params(collect(args)))
-    g = gfun(o, z)
+    g = gfun(o, z)

     gres = gfun(o, Z)
     @test gres[o] == g[o]
     @test Z ∉ gres.params

-    g = gfun(z, o)
+    g = gfun(z, o)
     gres = gfun(Z, o)
     @test gres[o] == g[o]
     @test Z ∉ gres.params
@@ -170,14 +194,14 @@

   @testset "Explicit" begin
     gfun(args...) = gradient((x, y) -> sum(x ./ y), args...)
-    g = gfun(z, o)
+    g = gfun(z, o)
     @test gfun(Z, o) == (nothing, g[2])
   end

   @testset "Implicit" begin
     gfun(x,y) = gradient(() -> sum(x ./ y), params([x,y]))

-    g = gfun(z, o)
+    g = gfun(z, o)
     gres = gfun(Z, o)
     @test gres[o] == g[o]
     @test Z ∉ gres.params
@@ -193,21 +217,21 @@
   @testset "Explicit" begin
     gfun(args...) = gradient((x, y) -> sum(op(x,y)), args...)

-    g = gfun(o, z)
+    g = gfun(o, z)
     @test gfun(o, Z) == (g[1], nothing)

-    g = gfun(z, o)
+    g = gfun(z, o)
     @test gfun(Z, o) == (nothing, g[2])
   end

   @testset "Implicit" begin
     gfun(args...) = gradient(() -> sum(op(args...)), params(collect(args)))
-    g = gfun(o, z)
+    g = gfun(o, z)
     gres = gfun(o, Z)
     @test gres[o] == g[o]
     @test Z ∉ gres.params

-    g = gfun(z, o)
+    g = gfun(z, o)
     gres = gfun(Z, o)
     @test gres[o] == g[o]
     @test Z ∉ gres.params
@@ -225,7 +249,7 @@

 @testset "Param remapping" begin
   ls(dims...) = reshape(collect(Float32, 1:prod(dims)), dims...)
-  dl(nin, nout, bias) = Dense(ls(nin, nout), bias(nout))
+  dl(nin, nout, bias) = Dense(ls(nin, nout), bias(nout))
   dm(bias) = Chain(
     dl(3, 5, bias),
     dl(5, 4, bias),
@@ -239,10 +263,10 @@
     @test typeof(l1.b) === typeof(l2.b)
   end

-  @testset "loadparams!" begin
+  @testset "loadparams!" begin
     import Flux: loadparams!
     pars(w, b::Zeros) = [w, zeros(size(w,2))]
-    pars(w, b) = [w, b]
+    pars(w, b) = [w, b]
     pars(l) = pars(l.W, l.b)
     pararray(m) = mapreduce(pars, vcat, m)
     weights(m) = mapreduce(l -> [l.W], vcat, m)
@@ -285,4 +309,4 @@
   @test c[1].testing
   trainmode!(c)
   @test !c[1].testing
-end
+end
