
Commit f66be89

bors[bot], jw3126, and DhairyaLGandhi authored

Merge #1744

1744: allow groups in ConvTranspose r=DhairyaLGandhi a=jw3126

fix #1743

Co-authored-by: Jan Weidner <jw3126@gmail.com>
Co-authored-by: Dhairya Gandhi <dhairya@juliacomputing.com>
2 parents f8ebead + ac3cdfa commit f66be89
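
For context, this change gives ConvTranspose the same groups keyword that Conv already accepts. A minimal usage sketch, with shapes mirroring the m2 case in the new tests (assumes Flux at or above the version bumped in this commit):

using Flux

x = randn(Float32, 10, 10, 2, 3)                        # width × height × channels × batch
m = ConvTranspose((3, 3), 2 => 4, groups = 2, pad = SamePad())

size(m.weight)   # (3, 3, 2, 2): each of the 2 groups maps 1 input channel to 2 outputs
size(m(x))       # (10, 10, 4, 3): SamePad keeps the spatial size; 4 output channels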

File tree

4 files changed, +129 -31 lines

Project.toml
src/layers/conv.jl
test/cuda/layers.jl
test/layers/conv.jl

Project.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.12.7"
+version = "0.12.8"

 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"

src/layers/conv.jl

Lines changed: 41 additions & 23 deletions
@@ -136,7 +136,7 @@ end

 function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
               init = glorot_uniform, stride = 1, pad = 0, dilation = 1, groups = 1,
-              weight = convfilter(k, (ch[1] ÷ groups => ch[2]); init), bias = true) where N
+              weight = convfilter(k, ch; init, groups), bias = true) where N

   Conv(weight, bias, σ; stride, pad, dilation, groups)
 end
@@ -152,8 +152,11 @@ distribution.

 See also: [`depthwiseconvfilter`](@ref)
 """
-convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
-           init = glorot_uniform) where N = init(filter..., ch...)
+function convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
+                    init = glorot_uniform, groups = 1) where N
+  cin, cout = ch
+  init(filter..., cin÷groups, cout)
+end

 @functor Conv
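
As a point of reference, the reworked convfilter divides only the input channels by groups, so a grouped Conv's kernel keeps the full output-channel dimension. A quick sketch using the shapes asserted in the grouped Conv test further below:

using Flux

w = Flux.convfilter((3, 3), 100 => 25; groups = 5)
size(w)   # (3, 3, 20, 25): 100 ÷ 5 = 20 input channels per group, 25 output channels in total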

@@ -163,9 +166,12 @@ function (c::Conv)(x::AbstractArray)
   σ.(conv(x, c.weight, cdims) .+ b)
 end

+_channels_in(l::Conv) = size(l.weight, ndims(l.weight)-1) * l.groups
+_channels_out(l::Conv) = size(l.weight, ndims(l.weight))
+
 function Base.show(io::IO, l::Conv)
   print(io, "Conv(", size(l.weight)[1:ndims(l.weight)-2])
-  print(io, ", ", size(l.weight, ndims(l.weight)-1), " => ", size(l.weight, ndims(l.weight)))
+  print(io, ", ", _channels_in(l), " => ", _channels_out(l))
   _print_conv_opt(io, l)
   print(io, ")")
 end
@@ -175,7 +181,10 @@ function _print_conv_opt(io::IO, l)
   all(==(0), l.pad) || print(io, ", pad=", _maybetuple_string(l.pad))
   all(==(1), l.stride) || print(io, ", stride=", _maybetuple_string(l.stride))
   all(==(1), l.dilation) || print(io, ", dilation=", _maybetuple_string(l.dilation))
-  l.bias == Zeros() && print(io, ", bias=false")
+  if hasproperty(l, :groups)
+    (l.groups == 1) || print(io, ", groups=", l.groups)
+  end
+  (l.bias isa Zeros) && print(io, ", bias=false")
 end

 """
@@ -216,44 +225,53 @@ struct ConvTranspose{N,M,F,A,V}
   stride::NTuple{N,Int}
   pad::NTuple{M,Int}
   dilation::NTuple{N,Int}
+  groups::Int
 end

+_channels_in(l::ConvTranspose) = size(l.weight)[end]
+_channels_out(l::ConvTranspose) = size(l.weight)[end-1]*l.groups
+
 """
-    ConvTranspose(weight::AbstractArray, [bias, activation; stride, pad, dilation])
+    ConvTranspose(weight::AbstractArray, [bias, activation; stride, pad, dilation, groups])

 Constructs a layer with the given weight and bias arrays.
 Accepts the same keywords as the `ConvTranspose((4,4), 3 => 7, relu)` method.
 """
 function ConvTranspose(w::AbstractArray{T,N}, bias = true, σ = identity;
-                      stride = 1, pad = 0, dilation = 1) where {T,N}
+                      stride = 1, pad = 0, dilation = 1, groups=1) where {T,N}
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(ConvTranspose, pad, size(w)[1:N-2], dilation, stride)
-  b = create_bias(w, bias, size(w, N-1))
-  return ConvTranspose(σ, w, b, stride, pad, dilation)
+  b = create_bias(w, bias, size(w, N-1) * groups)
+  return ConvTranspose(σ, w, b, stride, pad, dilation, groups)
 end

 function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
                       init = glorot_uniform, stride = 1, pad = 0, dilation = 1,
-                      weight = convfilter(k, reverse(ch), init = init), bias = true) where N
+                      groups = 1,
+                      weight = convfilter(k, reverse(ch); init, groups),
+                      bias = true,
+                      ) where N

-  ConvTranspose(weight, bias, σ; stride, pad, dilation)
+  ConvTranspose(weight, bias, σ; stride, pad, dilation, groups)
 end

 @functor ConvTranspose

 function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
-  # Calculate size of "input", from ∇conv_data()'s perspective...
-  combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end])
-  I = (size(x)[1:end-2] .- 1).*c.stride .+ 1 .+ (size(c.weight)[1:end-2] .- 1).*c.dilation .- combined_pad
-  C_in = size(c.weight)[end-1]
-  batch_size = size(x)[end]
-  # Create DenseConvDims() that looks like the corresponding conv()
-  return DenseConvDims((I..., C_in, batch_size), size(c.weight);
-                      stride=c.stride,
-                      padding=c.pad,
-                      dilation=c.dilation,
-  )
+    # Calculate size of "input", from ∇conv_data()'s perspective...
+    combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end])
+    I = (size(x)[1:end-2] .- 1).*c.stride .+ 1 .+ (size(c.weight)[1:end-2] .- 1).*c.dilation .- combined_pad
+    C_in = size(c.weight)[end-1] * c.groups
+    batch_size = size(x)[end]
+    # Create DenseConvDims() that looks like the corresponding conv()
+    w_size = size(c.weight)
+    return DenseConvDims((I..., C_in, batch_size), w_size;
+                        stride=c.stride,
+                        padding=c.pad,
+                        dilation=c.dilation,
+                        groups=c.groups,
+    )
 end

 # TODO: Find proper fix for https://github.com/FluxML/Flux.jl/issues/900
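
For orientation, conv_transpose_dims builds the DenseConvDims of the forward convolution whose data gradient implements the transposed convolution, so the spatial size I it computes is the layer's output size. A worked instance of the formula above, plugging in the 10×10 input, 3×3 kernel, SamePad case exercised in the tests (stride = dilation = 1, so pad = (1, 1)):

# One spatial dimension of I = (size(x) .- 1) .* stride .+ 1 .+ (k .- 1) .* dilation .- combined_pad
x_spatial, k, stride, dilation, pad = 10, 3, 1, 1, (1, 1)
I = (x_spatial - 1) * stride + 1 + (k - 1) * dilation - sum(pad)   # 9 + 1 + 2 - 2 = 10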
@@ -267,7 +285,7 @@ end

 function Base.show(io::IO, l::ConvTranspose)
   print(io, "ConvTranspose(", size(l.weight)[1:ndims(l.weight)-2])
-  print(io, ", ", size(l.weight, ndims(l.weight)), " => ", size(l.weight, ndims(l.weight)-1))
+  print(io, ", ", _channels_in(l), " => ", _channels_out(l))
   _print_conv_opt(io, l)
   print(io, ")")
 end
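
Taken together, the helpers above encode the two weight layouts: Conv stores its kernel as (k..., cin ÷ groups, cout) while ConvTranspose stores (k..., cout ÷ groups, cin), which is why the show methods now go through _channels_in and _channels_out. A small sketch using layer sizes that appear in this PR's tests:

using Flux

c  = Conv((3, 3), 100 => 25, groups = 5)
ct = ConvTranspose((3, 3), 2 => 4, groups = 2)

size(c.weight)    # (3, 3, 20, 25): channels in = 20 * 5 = 100, channels out = 25
size(ct.weight)   # (3, 3, 2, 2):   channels in = 2 (last dim), channels out = 2 * 2 = 4
Flux._channels_in(ct), Flux._channels_out(ct)   # (2, 4), matching the 2 => 4 pair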

test/cuda/layers.jl

Lines changed: 11 additions & 6 deletions
@@ -48,13 +48,17 @@ function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; te
   xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1]
   xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1]

-   # test
+  # test
   if test_cpu
     @test y_gpu ≈ y_cpu rtol=1f-3 atol=1f-3
     if isnothing(xg_cpu)
       @test isnothing(xg_gpu)
     else
-      @test Array(xg_gpu) ≈ xg_cpu rtol=1f-3 atol=1f-3
+      if layer === GroupedConvTranspose
+        @test Array(xg_gpu) ≈ xg_cpu rtol=2f-2 atol=1f-3
+      else
+        @test Array(xg_gpu) ≈ xg_cpu rtol=1f-3 atol=1f-3
+      end
     end
   end
   @test gs_gpu isa Flux.Zygote.Grads
@@ -80,6 +84,7 @@ ConvTransposeNoBias(args...) = ConvTranspose(args...; bias = false)
 CrossCorNoBias(args...) = CrossCor(args...; bias = false)
 DepthwiseConvNoBias(args...) = DepthwiseConv(args...; bias = false)
 GroupedConv(args...) = Conv(args..., groups = 5)
+GroupedConvTranspose(args...) = ConvTranspose(args..., groups = 5)

 for act in ACTIVATIONS
   r = rand(Float32, 28, 28, 1, 1)
@@ -89,16 +94,16 @@ for act in ACTIVATIONS
                  DepthwiseConv, DepthwiseConvNoBias]
   gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act, test_cpu = false)

-  groupedconv = [GroupedConv]
+  groupedconv = [GroupedConv, GroupedConvTranspose]
   gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), (3,3), 100 => 25, act, test_cpu = true)

   batch_norm = [BatchNorm]
   gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, test_cpu = false) #TODO fix errors
   gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = false)
-
+
   instancenorm = [InstanceNorm]
   gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false)
-
+
   groupnorm = [GroupNorm]
   gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act, test_cpu = false)
 end
@@ -151,7 +156,7 @@ end
     else
       @test sum(l(ip)) ≈ 0.f0
       gs = gradient(() -> sum(l(ip)), Flux.params(l))
-      @test l.bias ∈ gs.params
+      @test l.bias ∈ gs.params
     end
   end
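
The harness change above only loosens rtol for the new grouped transposed layer; a minimal sketch of the CPU-vs-GPU gradient comparison it performs, assuming a working CUDA device and using the same layer size as the grouped test case:

using Flux, CUDA

m_cpu = ConvTranspose((3, 3), 100 => 25, groups = 5)
m_gpu = m_cpu |> gpu
x_cpu = rand(Float32, 28, 28, 100, 2)
x_gpu = x_cpu |> gpu

gx_cpu = gradient(x -> sum(m_cpu(x)), x_cpu)[1]
gx_gpu = gradient(x -> sum(m_gpu(x)), x_gpu)[1]
isapprox(Array(gx_gpu), gx_cpu; rtol = 2f-2, atol = 1f-3)   # looser rtol, as in the test above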

test/layers/conv.jl

Lines changed: 76 additions & 1 deletion
@@ -67,13 +67,60 @@ end
   @test Flux.Losses.mse(bias(ip), op) ≈ 4.f0

   @testset "Grouped Conv" begin
+    ip = rand(Float32, 28, 100, 2)
+    c = Conv((3,), 100 => 25, groups = 5)
+    @test size(c.weight) == (3, 20, 25)
+    @test size(c(ip)) == (26, 25, 2)
+
     ip = rand(Float32, 28, 28, 100, 2)
     c = Conv((3,3), 100 => 25, groups = 5)
     @test size(c.weight) == (3, 3, 20, 25)
     @test size(c(ip)) == (26, 26, 25, 2)
+
+    ip = rand(Float32, 10, 11, 12, 100, 2)
+    c = Conv((3,4,5), 100 => 25, groups = 5)
+    @test size(c.weight) == (3,4,5, 20, 25)
+    @test size(c(ip)) == (8,8,8, 25, 2)
   end
 end

+@testset "_channels_in, _channels_out" begin
+  _channels_in = Flux._channels_in
+  _channels_out = Flux._channels_out
+  @test _channels_in(Conv((3,) , 2=>4)) == 2
+  @test _channels_in(Conv((5,6,) , 2=>4)) == 2
+  @test _channels_in(Conv((1,2,3), 2=>4)) == 2
+  @test _channels_out(Conv((3,) , 2=>4)) == 4
+  @test _channels_out(Conv((5,6,) , 2=>4)) == 4
+  @test _channels_out(Conv((1,2,3), 2=>4)) == 4
+
+  @test _channels_in( ConvTranspose((3,) , 1=>4)) == 1
+  @test _channels_in( ConvTranspose((5,6,) , 2=>4)) == 2
+  @test _channels_in( ConvTranspose((1,2,3), 3=>4)) == 3
+  @test _channels_out(ConvTranspose((3,) , 2=>1)) == 1
+  @test _channels_out(ConvTranspose((5,6,) , 2=>2)) == 2
+  @test _channels_out(ConvTranspose((1,2,3), 2=>3)) == 3
+
+  @test _channels_in( ConvTranspose((6,) , 8=>4, groups=4)) == 8
+  @test _channels_in( ConvTranspose((5,6,) , 2=>4, groups=2)) == 2
+  @test _channels_in( ConvTranspose((1,2,3), 3=>6, groups=3)) == 3
+
+  @test _channels_out(ConvTranspose((1,) , 10=>15, groups=5)) == 15
+  @test _channels_out(ConvTranspose((3,2) , 10=>15, groups=5)) == 15
+  @test _channels_out(ConvTranspose((5,6,) , 2=>2, groups=2)) == 2
+
+  for Layer in [Conv, ConvTranspose]
+    for _ in 1:10
+      groups = rand(1:10)
+      kernel_size = Tuple(rand(1:5) for _ in rand(1:3))
+      cin = rand(1:5) * groups
+      cout = rand(1:5) * groups
+      @test _channels_in(Layer(kernel_size, cin=>cout; groups)) == cin
+      @test _channels_out(Layer(kernel_size, cin=>cout; groups)) == cout
+    end
+  end
+end
+
 @testset "asymmetric padding" begin
   r = ones(Float32, 28, 28, 1, 1)
   m = Conv((3, 3), 1=>1, relu; pad=(0,1,1,2))
@@ -118,14 +165,42 @@ end
   x = zeros(Float32, 5, 5, 2, 4)
   m = ConvTranspose((3,3), 2=>3)
   @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads
+
+  # test ConvTranspose supports groups argument
+  x = randn(Float32, 10, 10, 2, 3)
+  m1 = ConvTranspose((3,3), 2=>4, pad=SamePad())
+  @test size(m1.weight) == (3,3,4,2)
+  @test size(m1(x)) == (10,10,4,3)
+  m2 = ConvTranspose((3,3), 2=>4, groups=2, pad=SamePad())
+  @test size(m2.weight) == (3,3,2,2)
+  @test size(m1(x)) == size(m2(x))
+  @test gradient(()->sum(m2(x)), params(m2)) isa Flux.Zygote.Grads
+
+  x = randn(Float32, 10, 2,1)
+  m = ConvTranspose((3,), 2=>4, pad=SamePad(), groups=2)
+  @test size(m(x)) === (10,4,1)
+  @test length(m.weight) == (3)*(2*4) / 2
+
+  x = randn(Float32, 10, 11, 4,2)
+  m = ConvTranspose((3,5), 4=>4, pad=SamePad(), groups=4)
+  @test size(m(x)) === (10,11, 4,2)
+  @test length(m.weight) == (3*5)*(4*4)/4
+
+  x = randn(Float32, 10, 11, 12, 3,2)
+  m = ConvTranspose((3,5,3), 3=>6, pad=SamePad(), groups=3)
+  @test size(m(x)) === (10,11, 12, 6,2)
+  @test length(m.weight) == (3*5*3) * (3*6) / 3
+
+  @test occursin("groups=2", sprint(show, ConvTranspose((3,3), 2=>4, groups=2)))
+  @test occursin("2 => 4" , sprint(show, ConvTranspose((3,3), 2=>4, groups=2)))
 end

 @testset "CrossCor" begin
   x = rand(Float32, 28, 28, 1, 1)
   w = rand(Float32, 2,2,1,1)
   y = CrossCor(w, [0.0])

-  @test sum(w .* x[1:2, 1:2, :, :]) ≈ y(x)[1, 1, 1, 1] rtol=1e-7
+  @test sum(w .* x[1:2, 1:2, :, :]) ≈ y(x)[1, 1, 1, 1] rtol=2e-7

   r = zeros(Float32, 28, 28, 1, 5)
   m = Chain(