
Commit add28e3

Updated recurrent benchmarks based on suggestions. Modified bench_utils
so that its behaviour is easier to overload.
1 parent 1525b30 commit add28e3

File tree: 2 files changed, +53 -47 lines

perf/bench_utils.jl

Lines changed: 3 additions & 18 deletions
@@ -6,20 +6,9 @@ using Zygote: pullback, ignore
 
 fw(m, x) = m(x)
 bw(back) = back(1f0)
-fwbw(m, ps, x) = gradient(() -> sum(m(x)), ps)
+fwbw(m, ps, x) = gradient(() -> sum(fw(m, x)), ps)
+pb(m, ps, x) = pullback(()->sum(fw(m, x)), ps)
 
-# Need to specialize for flux.recur.
-fw(m::Flux.Recur, X::Vector{<:AbstractArray}) = begin
-  ignore() do
-    Flux.reset!(m)
-  end
-  [m(x) for x in X]
-end
-fwbw(m::Flux.Recur, ps, X::Vector{<:AbstractArray}) = gradient(ps) do
-  y = fw(m, X)
-  sum(sum(y))
-end
-
 function run_benchmark(model, x; cuda=true)
 
   if cuda
@@ -28,11 +17,7 @@ function run_benchmark(model, x; cuda=true)
   end
 
   ps = Flux.params(model)
-  y, back = if model isa Flux.Recur && eltype(x) <: AbstractArray
-    pullback(() -> sum(sum([model(x_t) for x_t in x])), ps)
-  else
-    pullback(() -> sum(model(x)), ps)
-  end
+  y, back = pb(model, ps, x)
 
 
   if cuda
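The generic fw / bw / fwbw / pb layer above is what makes the benchmark behaviour easy to overload: a new model type only needs extra methods, not edits to run_benchmark. Below is a minimal sketch of that pattern, assuming bench_utils.jl has been included; the Masked wrapper and its mask field are hypothetical and not part of this commit.

using Flux

# Hypothetical wrapper used only to illustrate the overload hook.
struct Masked{T,M}
  model::T
  mask::M
end
Flux.@functor Masked

# One fw method is enough here: the generic fwbw and pb in bench_utils.jl
# both go through fw(m, x), so the forward and gradient benchmarks pick
# this method up automatically.
fw(m::Masked, x) = m.model(x .* m.mask)

For anything more involved (such as the recurrent models below), fwbw and pb can be specialized in the same way.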

perf/recurrent.jl

Lines changed: 50 additions & 29 deletions
@@ -1,41 +1,62 @@
 
-println("RNN")
-for n in [2, 20, 200, 1000], T in [1, 8, 16, 64]
-  x = [randn(Float32, n, n) for t in 1:T]
-  model = RNN(n, n)
-  println("CPU n=$n, t=$T")
-  run_benchmark(model, x, cuda=false)
-  println("CUDA n=$n, t=$T")
-  try
-    run_benchmark(model, x, cuda=true)
-  catch ex
-    @show typeof(ex)
-    if ex isa OutOfGPUMemoryError
-      @warn "Not enough GPU memory to run test"
-    else
-      rethrow(ex)
-    end
-  end
+
+struct RNNWrapper{T}
+  rnn::T
+end
+Flux.@functor RNNWrapper
+
+# Need to specialize for RNNWrapper.
+fw(r::RNNWrapper, X::Vector{<:AbstractArray}) = begin
+  Flux.reset!(r.rnn)
+  [r.rnn(x) for x in X]
+end
+
+fw(r::RNNWrapper, X) = begin
+  Flux.reset!(r.rnn)
+  r.rnn(X)
+end
+
+fwbw(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = gradient(ps) do
+  y = fw(r, X)
+  sum(sum(y))
+end
+
+pb(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = pullback(ps) do
+  y = fw(r, X)
+  sum(sum(y))
 end
 
-println("RNN-3d")
-for n in [2, 20, 200, 1000], T in [1, 8, 16, 64]
-  x = randn(Float32, n, n, T)
-  model = RNN(n, n)
-  println("CPU n=$n, t=$T")
-  run_benchmark(model, x, cuda=false)
-  println("CUDA n=$n, t=$T")
-  try
+function rnn_benchmark_sweep(data_creator::Function, rnn_type)
+  for n in [2, 20, 200, 1000], ts in [1, 4, 16, 64]
+    x, x_n = data_creator(n, ts)
+    model = RNNWrapper(rnn_type(n, n))
+
+    println("$rnn_type $x_n CPU n=$n, ts=$ts")
+    run_benchmark(model, x, cuda=false)
+
+    println("$rnn_type $x_n CUDA n=$n, ts=$ts")
+    try
       run_benchmark(model, x, cuda=true)
-  catch ex
+    catch ex
       @show typeof(ex)
       if ex isa OutOfGPUMemoryError
-      @warn "Not enough GPU memory to run test"
+        @warn "Not enough GPU memory to run test"
      else
-      rethrow(ex)
+        rethrow(ex)
      end
-  end
+    end
+  end
 end
 
+for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM]
+  rnn_benchmark_sweep(rnn_type) do n, ts
+    [randn(Float32, n, n) for _ in 1:ts], "Vec"
+  end
+end
 
+for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM]
+  rnn_benchmark_sweep(rnn_type) do n, ts
+    randn(Float32, n, n, ts), "Block"
+  end
+end
 
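Because rnn_benchmark_sweep receives the data creator as a do-block returning an (input, name) tuple, adding another input shape to the sweep only takes a few lines. A hedged sketch, assuming the file above has been included; the batch-size-1 "Col" case is an illustration only, not part of the commit.

# Hypothetical extra sweep: one column per time step (batch size 1).
for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM]
  rnn_benchmark_sweep(rnn_type) do n, ts
    [randn(Float32, n, 1) for _ in 1:ts], "Col"
  end
end

This reuses the Vector{<:AbstractArray} specializations of fw, fwbw and pb defined for RNNWrapper above.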
