# A GPU implementation of bilinear upsampling, including its gradient.
# The code is a translation of the following files:
# https://github.com/pytorch/pytorch/blob/master/caffe2/operators/upsample_op.cu
# https://github.com/pytorch/pytorch/blob/master/caffe2/core/common_gpu.h

# The forward and backward passes have been tested to produce the same output
# as PyTorch, up to floating-point rounding noise.

# Assumes the CUDA device intrinsics used below (blockIdx, ldg, atomic_add!,
# ...) are in scope, e.g. via CUDA.jl:
using CUDA

const CUDA_NUM_THREADS = 128
const MAXIMUM_NUM_BLOCKS = 4096

@inline function GET_BLOCKS(N::Integer)
    # Use at least 1 block, since CUDA does not allow launching zero blocks
    return max(min((N + CUDA_NUM_THREADS - 1) ÷ CUDA_NUM_THREADS, MAXIMUM_NUM_BLOCKS), 1)
end
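
# Example: GET_BLOCKS(0) == 1 and GET_BLOCKS(129) == 2; the block count is
# capped at MAXIMUM_NUM_BLOCKS, so for large inputs each thread processes
# several elements via the grid-stride loops in the kernels below.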

# pytorch: NCHW with row-major storage
# flux: WHCN with column-major storage -> the same data layout in memory, so
# this function carries over unchanged except for the +1 that converts the
# 0-based C offset into a 1-based Julia index
@inline function idx(
    n::Integer,
    num_channels::Integer,
    c::Integer,
    height::Integer,
    width::Integer,
    y::Integer,
    x::Integer)
    return ((n * num_channels + c) * height + y) * width + x + 1
end
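
# For example, idx(0, num_channels, 0, height, width, 0, 0) == 1 addresses the
# first element, and idx(n, C, c, H, W, H - 1, W - 1) == (n * C + c + 1) * H * W
# the last element of image (n, c).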

function upsample_bilinear_kernel!(
    num_batch,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,
    width_scale,
    X, # input (__restrict__ in the original)
    Y) # output (__restrict__ in the original)
    out_size = output_height * output_width

    # CUDA 1D grid-stride loop over the output's spatial positions; batch and
    # channel are handled by the explicit loops further down
    @inbounds for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x - 1) : (blockDim().x * gridDim().x) : out_size - 1
        # mind the order!
        indexTemp = index
        out_x = indexTemp % output_width
        indexTemp ÷= output_width
        out_y = indexTemp % output_height

        rheight = output_height > 1 ? (input_height - 1f0) / (output_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (input_width - 1f0) / (output_width - 1f0) : 0f0

        # Compute Y axis lambdas
        h1r = rheight * out_y
        h1 = floor(Int, h1r) # an (int) cast in the original C code
        h1p = (h1 < input_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda

        # Compute X axis lambdas
        w1r = rwidth * out_x
        w1 = floor(Int, w1r)
        w1p = (w1 < input_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda
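        # Example: with input_height = 2 and output_height = 4, rheight = 1/3;
        # output row out_y = 1 has h1r = 1/3, so it blends input rows h1 = 0 and
        # h1 + h1p = 1 with weights h0lambda = 2/3 and h1lambda = 1/3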

        for n in 0:num_batch-1 # shift to original C indexing
            for c in 0:num_channels-1
                X0 = X[idx(n, num_channels, c, input_height, input_width, h1, w1)]
                X1 = X[idx(n, num_channels, c, input_height, input_width, h1, w1 + w1p)]
                X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)]
                X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)]

                Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
                    h0lambda * (w0lambda * X0 + w1lambda * X1) +
                    h1lambda * (w0lambda * X2 + w1lambda * X3)
            end # channels
        end # batch
    end # 1D kernel loop
    return nothing
end
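
# A minimal host-side launch sketch for the forward kernel, assuming CUDA.jl's
# @cuda macro; the wrapper name `upsample_bilinear_whcn` and the uniform integer
# `scale` factor are illustrative assumptions, not part of the translated caffe2
# code. `x` is a WHCN CuArray, matching the layout note above idx().
function upsample_bilinear_whcn(x::CuArray{Float32,4}, scale::Integer)
    in_w, in_h, channels, batch = size(x)
    out_h, out_w = in_h * scale, in_w * scale
    y = similar(x, out_w, out_h, channels, batch)
    out_size = out_h * out_w
    @cuda blocks=GET_BLOCKS(out_size) threads=CUDA_NUM_THREADS upsample_bilinear_kernel!(
        batch, channels, in_h, in_w, out_h, out_w,
        Float32(scale), Float32(scale), x, y)
    return y
end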

# input is dY (the gradient w.r.t. the upsampled output), output is dX
function ∇upsample_bilinear_kernel!(
    input_size,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,
    width_scale,
    dY, # const in the original
    dX)
    # CUDA 1D grid-stride loop over all elements of dY
    @inbounds for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x - 1) : (blockDim().x * gridDim().x) : input_size - 1
        # mind the order!
        indexTemp = index
        in_x = indexTemp % input_width
        indexTemp ÷= input_width
        in_y = indexTemp % input_height
        indexTemp ÷= input_height
        c = indexTemp % num_channels
        indexTemp ÷= num_channels
        n = indexTemp

        # interpolation ratios, mapping dY coordinates back onto dX
        rheight = output_height > 1 ? (output_height - 1f0) / (input_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (output_width - 1f0) / (input_width - 1f0) : 0f0

        # Compute Y axis lambdas
        h1r = rheight * in_y
        h1 = floor(Int, h1r)
        h1p = (h1 < output_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda

        # Compute X axis lambdas
        w1r = rwidth * in_x
        w1 = floor(Int, w1r)
        w1p = (w1 < output_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda

        # Read dY through the read-only (__ldg) cache; the original C code guards
        # this with __CUDA_ARCH__ >= 350, which holds for every GPU from the
        # 9xx generation on
        dYi = ldg(dY, index + 1)

        # scatter the gradient onto the four source pixels; atomics are required
        # because several dY elements can map to the same dX pixel
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1)), h0lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1 + w1p)), h0lambda * w1lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)), h1lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1 + w1p)), h1lambda * w1lambda * dYi)
    end # 1D kernel loop
    return nothing
end
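
# A matching host-side sketch for the backward pass (same assumptions as the
# forward wrapper above); dX must be zero-initialized because the kernel
# accumulates into it with atomic adds.
function ∇upsample_bilinear_whcn(dy::CuArray{Float32,4}, scale::Integer)
    in_w, in_h, channels, batch = size(dy) # dy has the upsampled spatial size
    out_h, out_w = in_h ÷ scale, in_w ÷ scale
    dx = CUDA.zeros(Float32, out_w, out_h, channels, batch)
    in_size = length(dy)
    @cuda blocks=GET_BLOCKS(in_size) threads=CUDA_NUM_THREADS ∇upsample_bilinear_kernel!(
        in_size, channels, in_h, in_w, out_h, out_w,
        Float32(scale), Float32(scale), dy, dx)
    return dx
end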