
Commit fe803a1

bors[bot] and mcabbott authored
Merge #1794
1794: Tidy up `Maxout` r=mcabbott a=mcabbott

Maxout is from #698. This:

* adds pretty printing
* changes the explicit signature to `Maxout(layer, layer, layer)`, rather than providing a tuple, to be more like other layers (with deprecation)
* adds more examples to the docstring, and combines the two
* changes not to use `mapreduce`. I see now this was a performance choice at the time, discussed in #647 (comment), but with Zygote this is much slower.

Before:

```
julia> using Flux

julia> m3 = Maxout(() -> Dense(5, 7, tanh), 3)
Maxout{Tuple{Dense{typeof(tanh), Matrix{Float32}, Vector{Float32}}, Dense{typeof(tanh), Matrix{Float32}, Vector{Float32}}, Dense{typeof(tanh), Matrix{Float32}, Vector{Float32}}}}((Dense(5, 7, tanh), Dense(5, 7, tanh), Dense(5, 7, tanh)))

julia> x = rand(Float32, 5, 11);

julia> @btime gradient(sum∘m3, $x);
  min 112.792 μs, mean 123.774 μs (930 allocations, 49.09 KiB. GC mean 3.71%)
```

After:

```
julia> m3 = Maxout(() -> Dense(5, 7, tanh), 3)
Maxout(
  Dense(5, 7, tanh),                    # 42 parameters
  Dense(5, 7, tanh),                    # 42 parameters
  Dense(5, 7, tanh),                    # 42 parameters
)                   # Total: 6 arrays, 126 parameters, 888 bytes.

julia> x = rand(Float32, 5, 11);

julia> @btime gradient(sum∘m3, $x);
  min 34.541 μs, mean 38.448 μs (493 allocations, 32.48 KiB. GC mean 6.63%)
```

Co-authored-by: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
2 parents 48596ef + 74eb83b commit fe803a1
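
The `@btime` timings quoted in the commit message rely on BenchmarkTools, whose import the message elides. A minimal sketch to reproduce them, assuming BenchmarkTools is installed and a Flux version of this era (v0.12/0.13), where `Dense(5, 7, tanh)` is the positional constructor; absolute numbers are machine-dependent:

```julia
# Sketch of the benchmark from the commit message above.
# Assumes: BenchmarkTools installed; Flux v0.12/0.13-era API.
using Flux, BenchmarkTools

m3 = Maxout(() -> Dense(5, 7, tanh), 3)
x = rand(Float32, 5, 11);

@btime gradient(sum∘m3, $x);  # timings will differ from those quoted
```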

4 files changed, +55 −27 lines changed

src/deprecations.jl (4 additions, 0 deletions)

```diff
@@ -33,3 +33,7 @@ zeros(T::Type, dims...) = Base.zeros(T, dims...)
 
 ones32(::Type, dims...) = throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type"))
 zeros32(::Type, dims...) = throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type"))
+
+
+# v0.13 deprecations
+@deprecate Maxout(layers::Tuple) Maxout(layers...)
```
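
As a quick illustration (not part of the diff): the deprecated tuple form still works but warns and forwards to the new splatted constructor. Layer sizes here are arbitrary:

```julia
using Flux

layers = (Dense(2, 3), Dense(2, 3))

m_old = Maxout(layers)      # hits @deprecate: warns, then calls Maxout(layers...)
m_new = Maxout(layers...)   # the replacement spelling

x = rand(Float32, 2, 5)
@assert m_old(x) == m_new(x)  # both wrap the same Dense objects
```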

src/layers/basic.jl (48 additions, 24 deletions)

````diff
@@ -211,48 +211,67 @@ function Base.show(io::IO, l::Diagonal)
 end
 
 """
-    Maxout(over)
+    Maxout(layers...)
+    Maxout(f, n_alts)
 
-The [Maxout](https://arxiv.org/abs/1302.4389) layer has a number of
-internal layers which all receive the same input. It returns the elementwise
-maximum of the internal layers' outputs.
+This contains a number of internal layers, each of which receives the same input.
+Its output is the elementwise maximum of the internal layers' outputs.
 
-Maxout over linear dense layers satisfies the univeral approximation theorem.
-"""
-struct Maxout{FS<:Tuple}
-  over::FS
-end
+Instead of defining layers individually, you can provide a zero-argument function
+which constructs them, and the number to construct.
 
-"""
-    Maxout(f, n_alts)
+Maxout over linear dense layers satisfies the universal approximation theorem.
+See Goodfellow, Warde-Farley, Mirza, Courville & Bengio "Maxout Networks"
+[1302.4389](https://arxiv.org/abs/1302.4389).
 
-Construct a Maxout layer over `n_alts` instances of the layer given by `f`.
-The function takes no arguments and should return some callable layer.
-Conventionally, this is a linear dense layer.
+See also [`Parallel`](@ref) to reduce with other operators.
 
 # Examples
+```
+julia> m = Maxout(x -> abs2.(x), x -> x .* 3);
 
-This constructs a `Maxout` layer over 4 internal dense linear layers, each
-identical in structure (784 inputs, 128 outputs):
-```jldoctest
-julia> insize = 784;
+julia> m([-2 -1 0 1 2])
+1×5 Matrix{Int64}:
+ 4  1  0  3  6
 
-julia> outsize = 128;
+julia> m3 = Maxout(() -> Dense(5, 7, tanh), 3)
+Maxout(
+  Dense(5, 7, tanh),                    # 42 parameters
+  Dense(5, 7, tanh),                    # 42 parameters
+  Dense(5, 7, tanh),                    # 42 parameters
+)                   # Total: 6 arrays, 126 parameters, 888 bytes.
 
-julia> Maxout(()->Dense(insize, outsize), 4);
+julia> Flux.outputsize(m3, (5, 11))
+(7, 11)
 ```
 """
-function Maxout(f, n_alts)
+struct Maxout{FS<:Tuple}
+  over::FS
+  Maxout(layers...) = new{typeof(layers)}(layers)
+end
+
+function Maxout(f::Function, n_alts::Integer)
   over = Tuple(f() for _ in 1:n_alts)
-  return Maxout(over)
+  return Maxout(over...)
 end
 
 @functor Maxout
 
 function (mo::Maxout)(input::AbstractArray)
-  mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over)
+  # Perhaps surprisingly, pairwise max broadcast is often faster,
+  # even with Zygote. See #698 and #1794
+  mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.over)
+end
+
+trainable(mo::Maxout) = mo.over
+
+function Base.show(io::IO, mo::Maxout)
+  print(io, "Maxout(")
+  _show_layers(io, mo.over)
+  print(io, ")")
 end
 
+
 """
     SkipConnection(layer, connection)
 
@@ -277,6 +296,8 @@ julia> sm = SkipConnection(m, (mx, x) -> cat(mx, x, dims=3));
 julia> size(sm(x)) == (5, 5, 11, 10)
 true
 ```
+
+See also [`Parallel`](@ref), [`Maxout`](@ref).
 """
 struct SkipConnection{T,F}
   layers::T
@@ -390,7 +411,7 @@ end
     Parallel(connection, layers...)
     Parallel(connection; name = layer, ...)
 
-Create a 'Parallel' layer that passes an input array to each path in
+Create a `Parallel` layer that passes an input array to each path in
 `layers`, before reducing the output with `connection`.
 
 Called with one input `x`, this is equivalent to `reduce(connection, [l(x) for l in layers])`.
@@ -399,6 +420,9 @@ If called with multiple inputs, they are `zip`ped with the layers, thus `Parallel(+, f, g)(x, y) = f(x) + g(y)`.
 Like [`Chain`](@ref), its sub-layers may be given names using the keyword constructor.
 These can be accessed by indexing: `m[1] == m[:name]` is the first layer.
 
+See also [`SkipConnection`](@ref) which is `Parallel` with one `identity`,
+and [`Maxout`](@ref) which reduces by broadcasting `max`.
+
 # Examples
 
 ```jldoctest
````
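
A short sketch of the two constructor forms introduced above; the `Dense(2, 3)` sizes are arbitrary. It also checks that the pairwise `max.` fold in the forward pass agrees with a single three-argument broadcast, since `max` is associative:

```julia
using Flux

m1 = Maxout(Dense(2, 3), Dense(2, 3), Dense(2, 3))  # explicit layers
m2 = Maxout(() -> Dense(2, 3), 3)                   # three layers built from a thunk

x = rand(Float32, 2, 5)
@assert m1(x) == max.(m1.over[1](x), m1.over[2](x), m1.over[3](x))
@assert size(m2(x)) == (3, 5)                       # 3 outputs, batch of 5
```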

src/layers/show.jl (1 addition, 1 deletion)

```diff
@@ -1,6 +1,6 @@
 
 for T in [
-  :Chain, :Parallel, :SkipConnection, :Recur  # container types
+  :Chain, :Parallel, :SkipConnection, :Recur, :Maxout  # container types
 ]
   @eval function Base.show(io::IO, m::MIME"text/plain", x::$T)
     if get(io, :typeinfo, nothing) === nothing  # e.g. top level in REPL
```
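
To see the effect of adding `:Maxout` to that container list (a sketch, not part of the diff): the three-argument `show` used at the REPL's top level now prints a `Maxout`'s children line by line with parameter counts:

```julia
using Flux

m = Maxout(() -> Dense(5, 7, tanh), 3)

# Equivalent to what the REPL does at top level; prints the multi-line
# form shown in the commit message, one child layer per line.
show(stdout, MIME"text/plain"(), m)
```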

test/layers/basic.jl (2 additions, 2 deletions)

```diff
@@ -109,13 +109,13 @@ import Flux: activations
   end
 
   @testset "simple alternatives" begin
-    mo = Maxout((x -> x, x -> 2x, x -> 0.5x))
+    mo = Maxout(x -> x, x -> 2x, x -> 0.5x)
     input = rand(40)
     @test mo(input) == 2*input
   end
 
   @testset "complex alternatives" begin
-    mo = Maxout((x -> [0.5; 0.1]*x, x -> [0.2; 0.7]*x))
+    mo = Maxout(x -> [0.5; 0.1]*x, x -> [0.2; 0.7]*x)
     input = [3.0 2.0]
     target = [0.5, 0.7].*input
     @test mo(input) == target
```
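
The first test's expectation rests on a one-line argument: `rand(40)` produces non-negative entries, so the elementwise maximum of `x`, `2x` and `0.5x` is always `2x`. A standalone sketch of the same check:

```julia
using Flux, Test

mo = Maxout(x -> x, x -> 2x, x -> 0.5x)
x = rand(40)           # entries in [0, 1), all non-negative
@test mo(x) == 2*x     # max(x, 2x, x/2) == 2x whenever x ≥ 0
```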
