Move bilinear upsampling code from lib/cudnn to src/nnlib.jl

maxfreu · maxfreu · commit ce92c3ab8da6 · 2021-02-10T14:01:23.000+01:00
diff --git a/lib/cudnn/CUDNN.jl b/lib/cudnn/CUDNN.jl
@@ -38,9 +38,6 @@ include("rnn.jl")
 include("multiheadattn.jl")
 include("normalization.jl")
 
-# custom kernels
-include("upsampling.jl")
-
 # high-level integrations
 include("nnlib.jl")
 include("batchnorm.jl")
diff --git a/lib/cudnn/nnlib.jl b/lib/cudnn/nnlib.jl
@@ -4,8 +4,7 @@ import NNlib: stride, padding, dilation, flipkernel, spatial_dims, kernel_size,
               conv!, ∇conv_filter!, ∇conv_data!,
               maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, PoolDims,
               softmax, softmax!, ∇softmax, ∇softmax!,
-              logsoftmax, logsoftmax!, ∇logsoftmax, ∇logsoftmax!,
-              upsample_bilinear_whcn!, ∇upsample_bilinear_whcn!
+              logsoftmax, logsoftmax!, ∇logsoftmax, ∇logsoftmax!
 
 import DataStructures: DefaultDict
 
@@ -300,40 +299,6 @@ end
 Base.broadcasted(::typeof(identity), x::DenseCuArray{T}) where {T<:CUDNNFloat} = x
 
 
-# Upsampling
-
-function upsample_bilinear_whcn!(y::CuArray{T,4}, x::CuArray{T,4}) where T
-    w,h,c,n = size(x)
-    out_w, out_h = (size(y,1), size(y,2))
-
-    out_size = out_h*out_w
-    rheight = T((h-1)/(out_h-1))
-    rwidth  = T((w-1)/(out_w-1))
-
-    kernel = @cuda name="upsample_bilinear_whcn!" launch=false upsample_bilinear_whcn_kernel!(out_size, rheight, rwidth, x, y)
-    config = launch_configuration(kernel.fun; max_threads=256)
-    threads = Base.min(out_size, config.threads)
-    blocks = cld(out_size, threads)
-    kernel(out_size, rheight, rwidth, x, y; threads=threads, blocks=blocks)
-    return y
-end
-
-function ∇upsample_bilinear_whcn!(dx::CuArray{T,4}, Δ::CuArray{T,4}) where T
-    w,h,c,n = Base.size(Δ)
-    out_w, out_h = (size(dx, 1), size(dx, 2))
-    in_size = h*w
-    rheight = T((out_h-1)/(h-1)) # reversed compared to forward pass
-    rwidth  = T((out_w-1)/(w-1))
-
-    kernel = @cuda name="∇upsample_bilinear_whcn!" launch=false ∇upsample_bilinear_whcn_kernel!(in_size, rheight, rwidth, Δ, dx)
-    config = launch_configuration(kernel.fun; max_threads=256)
-    threads = Base.min(in_size, config.threads)
-    blocks = cld(in_size, threads)
-    kernel(in_size, rheight, rwidth, Δ, dx; threads=threads, blocks=blocks)
-    return dx
-end
-
-
 # Compatibility shims until users upgrade to new NNlib format
 function conv!(y::DenseCuArray{T}, x::DenseCuArray{T}, w::DenseCuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat}
     cdims = DenseConvDims(x, w; padding=pad, stride=stride, flipkernel=(flipkernel!=0), dilation=dilation)
diff --git a/lib/cudnn/upsampling.jl b/lib/cudnn/upsampling.jl
diff --git a/src/nnlib.jl b/src/nnlib.jl
@@ -29,3 +29,129 @@ NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number,
 
 Base.unsafe_convert(::Type{CuPtr{T}}, A::NNlib.BatchedAdjOrTrans{T}) where {T} =
     Base.unsafe_convert(CuPtr{T}, parent(A))
+
+
+#
+# Upsampling
+#
+
+# An implementation for GPU based bilinear upsampling including its gradient
+# The code is a translation from the following files:
+# https://github.com/pytorch/pytorch/blob/master/caffe2/operators/upsample_op.cu
+# https://github.com/pytorch/pytorch/blob/master/caffe2/core/common_gpu.h
+
+# The forward and backward passes have been tested to produce the same output
+# as PyTorch with align_corners=True, up to floating-point rounding noise.
+
+function upsample_bilinear_whcn_kernel!(n_elem, rheight, rwidth, x, y)
+    index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x # 0-based linear thread index
+    # Threads with index < n_elem each fill one (w, h) position of y for every channel and batch entry.
+    if index < n_elem
+        in_w, in_h, channels, batchsize = size(x)
+        out_w, out_h, _, _ = size(y)
+        # Split the linear index into 0-based output coordinates along w (fastest) and h.
+        ow = index % out_w
+        oh = index ÷ out_w
+        # Along h: fractional source position rheight*oh, its two neighbouring indices and their weights.
+        real_index = rheight*oh
+        ih0 = floor(Int, real_index)
+        offset = (ih0 < in_h-1) ? 1 : 0 # 0 once ih0 is the last index, so ih1 then coincides with ih0
+        ih1 = ih0 + offset + 1 # 1-based index of the second neighbour
+        h1lambda = real_index - ih0 # weight of ih1 (fractional part of the position)
+        h0lambda = 1 - h1lambda # weight of ih0
+        ih0 += 1 # from 0-based to 1-based
+        # Along w: same computation with rwidth*ow.
+        real_index = rwidth*ow
+        iw0 = floor(Int, real_index)
+        offset = (iw0 < in_w-1) ? 1 : 0
+        iw1 = iw0 + offset + 1
+        w1lambda = real_index - iw0
+        w0lambda = 1 - w1lambda
+        iw0 += 1
+
+        @inbounds for n in 1:batchsize # no bounds checks: y is indexed with x's channel/batch extents
+            for c in 1:channels
+                val = h0lambda * (w0lambda * x[iw0, ih0, c, n]  + # h0 * w0 * i00
+                                  w1lambda * x[iw1, ih0, c, n]) + # h0 * w1 * i01
+                      h1lambda * (w0lambda * x[iw0, ih1, c, n]  + # h1 * w0 * i10
+                                  w1lambda * x[iw1, ih1, c, n])   # h1 * w1 * i11
+                y[ow+1, oh+1, c, n] = val
+            end
+        end
+    end
+    return nothing
+end
+
+# Backward kernel: Δ is the gradient backpropagated from downstream layers; it is added into dx.
+function ∇upsample_bilinear_whcn_kernel!(n_elem, rheight, rwidth, Δ, dx)
+    index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x # 0-based linear thread index
+    # Threads with index < n_elem each take one (w, h) position of Δ for every channel and batch entry.
+    if index < n_elem
+        in_width, in_height, channels, batchsize = size(Δ)
+        out_width, out_height, _, _ = size(dx)
+        # Here "in" refers to Δ and "out" to dx; split the index into 0-based coordinates of Δ.
+        iw = index % in_width
+        ih = index ÷ in_width
+
+        # Compute Y axis lambdas
+        real_index_h = rheight*ih
+        oh0 = floor(Int, real_index_h)
+        offset = (oh0 < out_height-1) ? 1 : 0 # 0 once oh0 is the last index, so oh1 then coincides with oh0
+        oh1 = oh0 + offset + 1 # 1-based index of the second neighbour
+        h1lambda = real_index_h - oh0
+        h0lambda = 1 - h1lambda
+        oh0 += 1 # from 0-based to 1-based
+
+        # Compute X axis lambdas
+        real_index_w = rwidth * iw
+        ow0 = floor(Int, real_index_w)
+        offset = (ow0 < out_width - 1) ? 1 : 0
+        ow1 = ow0 + offset + 1
+        w1lambda = real_index_w - ow0
+        w0lambda = 1 - w1lambda
+        ow0 += 1
+        # Atomic adds — presumably because several Δ positions can map onto the same dx entry.
+        @inbounds for n in 1:batchsize # no bounds checks: dx is indexed with Δ's channel/batch extents
+            for c in 1:channels
+                val = Δ[iw+1, ih+1, c, n]
+                @atomic dx[ow0, oh0, c, n] += h0lambda * w0lambda * val # accumulates; dx is not zeroed here — NOTE(review): confirm the caller does
+                @atomic dx[ow1, oh0, c, n] += h0lambda * w1lambda * val
+                @atomic dx[ow0, oh1, c, n] += h1lambda * w0lambda * val
+                @atomic dx[ow1, oh1, c, n] += h1lambda * w1lambda * val
+            end
+        end
+    end # if
+    return nothing
+end
+
+
+function NNlib.upsample_bilinear_whcn!(y::CuArray{T,4}, x::CuArray{T,4}) where T
+    w,h,c,n = size(x)
+    out_w, out_h = (size(y,1), size(y,2))
+    # Bilinear upsampling of x into the preallocated y (returned); one GPU thread per (w, h) position of y.
+    out_size = out_h*out_w
+    rheight = out_h > 1 ? T((h-1)/(out_h-1)) : zero(T) # guard: a 1-long output axis would divide by zero
+    rwidth  = out_w > 1 ? T((w-1)/(out_w-1)) : zero(T) # with ratio 0 the kernel always samples index 1
+    out_size == 0 && return y # nothing to compute; also avoids cld(0, 0) below
+    kernel = @cuda name="upsample_bilinear_whcn!" launch=false upsample_bilinear_whcn_kernel!(out_size, rheight, rwidth, x, y)
+    config = launch_configuration(kernel.fun; max_threads=256) # compiled without launching to query a block size (≤ 256)
+    threads = Base.min(out_size, config.threads)
+    blocks = cld(out_size, threads) # enough blocks to cover every output position
+    kernel(out_size, rheight, rwidth, x, y; threads=threads, blocks=blocks)
+    return y
+end
+
+function NNlib.∇upsample_bilinear_whcn!(dx::CuArray{T,4}, Δ::CuArray{T,4}) where T
+    w,h,c,n = Base.size(Δ) # Δ: upstream gradient; its contributions are added into dx, which is returned
+    out_w, out_h = (size(dx, 1), size(dx, 2))
+    in_size = h*w # one GPU thread per (w, h) position of Δ
+    rheight = h > 1 ? T((out_h-1)/(h-1)) : zero(T) # reversed compared to forward pass
+    rwidth  = w > 1 ? T((out_w-1)/(w-1)) : zero(T) # zero(T) guards the division when Δ is 1 long on that axis
+    in_size == 0 && return dx # nothing to scatter; also avoids cld(0, 0) below
+    kernel = @cuda name="∇upsample_bilinear_whcn!" launch=false ∇upsample_bilinear_whcn_kernel!(in_size, rheight, rwidth, Δ, dx)
+    config = launch_configuration(kernel.fun; max_threads=256) # compiled without launching to query a block size (≤ 256)
+    threads = Base.min(in_size, config.threads)
+    blocks = cld(in_size, threads) # enough blocks to cover every position of Δ
+    kernel(in_size, rheight, rwidth, Δ, dx; threads=threads, blocks=blocks)
+    return dx
+end
diff --git a/test/cudnn/nnlib.jl b/test/cudnn/nnlib.jl
@@ -136,30 +136,3 @@ end
         CUDNN.batchnorm(v, v, m, v, v, 1.0; training=training)
     end
 end
-
-@testset "Bilinear upsampling" begin
-  x = Float32[1 2; 3 4][:,:,:,:]
-  x = cat(x,x; dims=3)
-  x = cat(x,x; dims=4)
-  x = cu(x)
-
-  y_true = Float32[ 1//1  4//3   5//3   2//1;
-          7//5 26//15 31//15 12//5;
-          9//5 32//15 37//15 14//5;
-         11//5 38//15 43//15 16//5;
-         13//5 44//15 49//15 18//5;
-          3//1 10//3  11//3   4//1]
-  y_true = cat(y_true,y_true; dims=3)
-  y_true = cat(y_true,y_true; dims=4)
-  y_true = cu(y_true)
-
-  y = upsample_bilinear(x, (3,2))
-
-  @test size(y) == size(y_true)
-  @test eltype(y) == Float32
-  @test y ≈ y_true
-
-  o = CUDA.ones(Float32,6,4,2,1)
-  grad_true = 6*CUDA.ones(Float32,2,2,2,1)
-  @test ∇upsample_bilinear(o; size=(2,2)) ≈ grad_true
-end
diff --git a/test/nnlib.jl b/test/nnlib.jl
@@ -62,3 +62,30 @@ end
     @test testf(x -> logσ.(x), rand(5))
   end
 end
+
+@testset "Bilinear upsampling" begin
+  x = Float32[1 2; 3 4][:,:,:,:] # 2×2 matrix indexed into a 4-d array
+  x = cat(x,x; dims=3) # duplicate along the third dimension
+  x = cat(x,x; dims=4) # duplicate along the fourth dimension
+  x = cu(x)
+
+  y_true = Float32[ 1//1  4//3   5//3   2//1;
+          7//5 26//15 31//15 12//5;
+          9//5 32//15 37//15 14//5;
+         11//5 38//15 43//15 16//5;
+         13//5 44//15 49//15 18//5;
+          3//1 10//3  11//3   4//1]
+  y_true = cat(y_true,y_true; dims=3) # same duplication as for x
+  y_true = cat(y_true,y_true; dims=4)
+  y_true = cu(y_true)
+  # Forward pass: (3,2) applied to the 2×2 input is expected to give the 6×4 reference above.
+  y = upsample_bilinear(x, (3,2))
+
+  @test size(y) == size(y_true)
+  @test eltype(y) == Float32
+  @test y ≈ y_true
+  # Backward pass: an all-ones 6×4 upstream gradient should sum to 6 at each of the 2×2 positions.
+  o = CUDA.ones(Float32,6,4,2,1)
+  grad_true = 6*CUDA.ones(Float32,2,2,2,1)
+  @test ∇upsample_bilinear(o; size=(2,2)) ≈ grad_true
+end