# A GPU implementation of bilinear upsampling, including its gradient.
# The code is a translation of the following files:
# https://github.com/pytorch/pytorch/blob/master/caffe2/operators/upsample_op.cu
# https://github.com/pytorch/pytorch/blob/master/caffe2/core/common_gpu.h

# The forward and backward passes have been tested to produce the same output
# as PyTorch, up to floating-point rounding noise.

# Assumes the CUDA device intrinsics used below (blockIdx, ldg, atomic_add!,
# ...) are in scope, e.g. via CUDA.jl:
using CUDA

const CUDA_NUM_THREADS = 128
const MAXIMUM_NUM_BLOCKS = 4096

@inline function GET_BLOCKS(N::Integer)
    # Use at least 1 block, since CUDA does not allow launching zero blocks
    return max(min((N + CUDA_NUM_THREADS - 1) ÷ CUDA_NUM_THREADS, MAXIMUM_NUM_BLOCKS), 1)
end
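
# Example: GET_BLOCKS(0) == 1 and GET_BLOCKS(129) == 2; the block count is
# capped at MAXIMUM_NUM_BLOCKS, so for large inputs each thread processes
# several elements via the grid-stride loops in the kernels below.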

# pytorch: NCHW with row-major storage
# flux: WHCN with column-major storage -> the same data layout in memory, so
# this function carries over unchanged except for the +1 that converts the
# 0-based C offset into a 1-based Julia index
@inline function idx(
    n::Integer,
    num_channels::Integer,
    c::Integer,
    height::Integer,
    width::Integer,
    y::Integer,
    x::Integer)
    return ((n * num_channels + c) * height + y) * width + x + 1
end
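
# For example, idx(0, num_channels, 0, height, width, 0, 0) == 1 addresses the
# first element, and idx(n, C, c, H, W, H - 1, W - 1) == (n * C + c + 1) * H * W
# the last element of image (n, c).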

function upsample_bilinear_kernel!(
    num_batch,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,
    width_scale,
    X, # input (__restrict__ in the original)
    Y) # output (__restrict__ in the original)
    out_size = output_height * output_width

    # CUDA 1D grid-stride loop over the output's spatial positions; batch and
    # channel are handled by the explicit loops further down
    @inbounds for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x - 1) : (blockDim().x * gridDim().x) : out_size - 1
        # mind the order!
        indexTemp = index
        out_x = indexTemp % output_width
        indexTemp ÷= output_width
        out_y = indexTemp % output_height

        rheight = output_height > 1 ? (input_height - 1f0) / (output_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (input_width - 1f0) / (output_width - 1f0) : 0f0

        # Compute Y axis lambdas
        h1r = rheight * out_y
        h1 = floor(Int, h1r) # an (int) cast in the original C code
        h1p = (h1 < input_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda

        # Compute X axis lambdas
        w1r = rwidth * out_x
        w1 = floor(Int, w1r)
        w1p = (w1 < input_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda
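        # Example: with input_height = 2 and output_height = 4, rheight = 1/3;
        # output row out_y = 1 has h1r = 1/3, so it blends input rows h1 = 0 and
        # h1 + h1p = 1 with weights h0lambda = 2/3 and h1lambda = 1/3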

        for n in 0:num_batch-1 # shift to original C indexing
            for c in 0:num_channels-1
                X0 = X[idx(n, num_channels, c, input_height, input_width, h1, w1)]
                X1 = X[idx(n, num_channels, c, input_height, input_width, h1, w1 + w1p)]
                X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)]
                X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)]

                Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
                    h0lambda * (w0lambda * X0 + w1lambda * X1) +
                    h1lambda * (w0lambda * X2 + w1lambda * X3)
            end # channels
        end # batch
    end # 1D kernel loop
    return nothing
end
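
# A minimal host-side launch sketch for the forward kernel, assuming CUDA.jl's
# @cuda macro; the wrapper name `upsample_bilinear_whcn` and the uniform integer
# `scale` factor are illustrative assumptions, not part of the translated caffe2
# code. `x` is a WHCN CuArray, matching the layout note above idx().
function upsample_bilinear_whcn(x::CuArray{Float32,4}, scale::Integer)
    in_w, in_h, channels, batch = size(x)
    out_h, out_w = in_h * scale, in_w * scale
    y = similar(x, out_w, out_h, channels, batch)
    out_size = out_h * out_w
    @cuda blocks=GET_BLOCKS(out_size) threads=CUDA_NUM_THREADS upsample_bilinear_kernel!(
        batch, channels, in_h, in_w, out_h, out_w,
        Float32(scale), Float32(scale), x, y)
    return y
end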

# input is dY (the gradient w.r.t. the upsampled output), output is dX
function ∇upsample_bilinear_kernel!(
    input_size,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,
    width_scale,
    dY, # const in the original
    dX)
    # CUDA 1D grid-stride loop over all elements of dY
    @inbounds for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x - 1) : (blockDim().x * gridDim().x) : input_size - 1
        # mind the order!
        indexTemp = index
        in_x = indexTemp % input_width
        indexTemp ÷= input_width
        in_y = indexTemp % input_height
        indexTemp ÷= input_height
        c = indexTemp % num_channels
        indexTemp ÷= num_channels
        n = indexTemp

        # interpolation ratios, mapping dY coordinates back onto dX
        rheight = output_height > 1 ? (output_height - 1f0) / (input_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (output_width - 1f0) / (input_width - 1f0) : 0f0

        # Compute Y axis lambdas
        h1r = rheight * in_y
        h1 = floor(Int, h1r)
        h1p = (h1 < output_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda

        # Compute X axis lambdas
        w1r = rwidth * in_x
        w1 = floor(Int, w1r)
        w1p = (w1 < output_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda

        # Read dY through the read-only (__ldg) cache; the original C code guards
        # this with __CUDA_ARCH__ >= 350, which holds for every GPU from the
        # 9xx generation on
        dYi = ldg(dY, index + 1)

        # scatter the gradient onto the four source pixels; atomics are required
        # because several dY elements can map to the same dX pixel
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1)), h0lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1, w1 + w1p)), h0lambda * w1lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)), h1lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1 + w1p)), h1lambda * w1lambda * dYi)
    end # 1D kernel loop
    return nothing
end
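
# A matching host-side sketch for the backward pass (same assumptions as the
# forward wrapper above); dX must be zero-initialized because the kernel
# accumulates into it with atomic adds.
function ∇upsample_bilinear_whcn(dy::CuArray{Float32,4}, scale::Integer)
    in_w, in_h, channels, batch = size(dy) # dy has the upsampled spatial size
    out_h, out_w = in_h ÷ scale, in_w ÷ scale
    dx = CUDA.zeros(Float32, out_w, out_h, channels, batch)
    in_size = length(dy)
    @cuda blocks=GET_BLOCKS(in_size) threads=CUDA_NUM_THREADS ∇upsample_bilinear_kernel!(
        in_size, channels, in_h, in_w, out_h, out_w,
        Float32(scale), Float32(scale), dy, dx)
    return dx
end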