@@ -29,3 +29,129 @@ NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number,
29
29
30
30
# Converting a `NNlib.BatchedAdjOrTrans` to a `CuPtr{T}` is delegated to the
# array it wraps: the pointer returned is the one obtained from `parent(A)`.
function Base.unsafe_convert(::Type{CuPtr{T}}, A::NNlib.BatchedAdjOrTrans{T}) where {T}
    wrapped = parent(A)
    return Base.unsafe_convert(CuPtr{T}, wrapped)
end
32
+
33
+
34
+ #
35
+ # Upsampling
36
+ #
37
+
38
+ # An implementation for GPU based bilinear upsampling including its gradient
39
+ # The code is a translation from the following files:
40
+ # https://github.com/pytorch/pytorch/blob/master/caffe2/operators/upsample_op.cu
41
+ # https://github.com/pytorch/pytorch/blob/master/caffe2/core/common_gpu.h
42
+
43
+ # Forward and backward pass have been tested to produce the same output
44
+ # as PyTorch with align_corners=True; results agree up to floating-point rounding.
45
+
46
# Device kernel for bilinear upsampling.
#
# Arguments
#   n_elem   number of (column, row) positions of `y` to process; threads whose
#            zero-based linear id is >= n_elem do nothing
#   rheight  factor mapping a zero-based row of `y` to a fractional row of `x`
#   rwidth   factor mapping a zero-based column of `y` to a fractional column of `x`
#   x        4-d input, read only
#   y        4-d output, written in place
#
# Each active thread decodes its linear id into one (column, row) position of
# `y`, derives the two neighbouring rows/columns of `x` together with their
# interpolation weights, and then loops over the third and fourth dimensions of
# `x`, writing one interpolated value per slice.
function upsample_bilinear_whcn_kernel!(n_elem, rheight, rwidth, x, y)
    # Zero-based linear thread id across the whole 1-d grid.
    tid = (blockIdx().x - 1) * blockDim().x + (threadIdx().x - 1)
    tid < n_elem || return nothing

    in_w, in_h, channels, batchsize = size(x)
    out_w = size(y, 1)

    # Zero-based output column / row handled by this thread.
    ow = rem(tid, out_w)
    oh = div(tid, out_w)

    # Rows: fractional source position, its integer part and the two weights.
    src_h = rheight * oh
    h_floor = floor(Int, src_h)
    h1lambda = src_h - h_floor
    h0lambda = 1 - h1lambda
    ih0 = h_floor + 1                              # switch to one-based indexing
    ih1 = h_floor < in_h - 1 ? ih0 + 1 : ih0       # stay on the last row at the border

    # Columns: same scheme as for the rows.
    src_w = rwidth * ow
    w_floor = floor(Int, src_w)
    w1lambda = src_w - w_floor
    w0lambda = 1 - w1lambda
    iw0 = w_floor + 1
    iw1 = w_floor < in_w - 1 ? iw0 + 1 : iw0

    @inbounds for n in 1:batchsize, c in 1:channels
        # Interpolate along the first dimension on both rows, then blend the rows.
        upper = w0lambda * x[iw0, ih0, c, n] + w1lambda * x[iw1, ih0, c, n]
        lower = w0lambda * x[iw0, ih1, c, n] + w1lambda * x[iw1, ih1, c, n]
        y[ow + 1, oh + 1, c, n] = h0lambda * upper + h1lambda * lower
    end
    return nothing
end
84
+
85
# Device kernel for the gradient of bilinear upsampling.
#
# Arguments
#   n_elem   number of (column, row) positions of `Δ` to process; threads whose
#            zero-based linear id is >= n_elem do nothing
#   rheight  factor mapping a zero-based row of `Δ` to a fractional row of `dx`
#   rwidth   factor mapping a zero-based column of `Δ` to a fractional column of `dx`
#   Δ        gradient backpropagated from downstream layers, read only
#   dx       gradient with respect to the input, accumulated in place
#
# Each active thread takes one (column, row) position of `Δ` and, for every
# slice of the third and fourth dimensions, adds its value to the four
# neighbouring entries of `dx`, weighted by the bilinear coefficients. The
# additions use `@atomic` because several threads may target the same entry.
# NOTE(review): the kernel only ever adds to `dx`; presumably the caller passes
# it zero-initialised - confirm.
function ∇upsample_bilinear_whcn_kernel!(n_elem, rheight, rwidth, Δ, dx)
    # Zero-based linear thread id across the whole 1-d grid.
    tid = (blockIdx().x - 1) * blockDim().x + (threadIdx().x - 1)
    tid < n_elem || return nothing

    in_width, _, channels, batchsize = size(Δ)
    out_width, out_height, _, _ = size(dx)

    # Zero-based column / row of Δ handled by this thread.
    iw = rem(tid, in_width)
    ih = div(tid, in_width)

    # Rows of dx touched by this position and their weights.
    pos_h = rheight * ih
    h_floor = floor(Int, pos_h)
    h1lambda = pos_h - h_floor
    h0lambda = 1 - h1lambda
    oh0 = h_floor + 1                                   # one-based
    oh1 = h_floor < out_height - 1 ? oh0 + 1 : oh0      # stay on the last row at the border

    # Columns of dx touched by this position and their weights.
    pos_w = rwidth * iw
    w_floor = floor(Int, pos_w)
    w1lambda = pos_w - w_floor
    w0lambda = 1 - w1lambda
    ow0 = w_floor + 1
    ow1 = w_floor < out_width - 1 ? ow0 + 1 : ow0

    # The four bilinear coefficients do not depend on the slice.
    k00 = h0lambda * w0lambda
    k01 = h0lambda * w1lambda
    k10 = h1lambda * w0lambda
    k11 = h1lambda * w1lambda

    @inbounds for n in 1:batchsize, c in 1:channels
        g = Δ[iw + 1, ih + 1, c, n]
        @atomic dx[ow0, oh0, c, n] += k00 * g
        @atomic dx[ow1, oh0, c, n] += k01 * g
        @atomic dx[ow0, oh1, c, n] += k10 * g
        @atomic dx[ow1, oh1, c, n] += k11 * g
    end
    return nothing
end
126
+
127
+
128
# GPU method of `NNlib.upsample_bilinear_whcn!`: bilinearly resamples the first
# two dimensions of `x` into `y` and returns `y`.
#
# One thread is requested per (column, row) position of `y`
# (`size(y, 1) * size(y, 2)` in total); the kernel itself loops over the third
# and fourth dimensions. The block size is the occupancy suggestion from
# `launch_configuration`, capped at 256 and at the number of positions.
function NNlib.upsample_bilinear_whcn!(y::CuArray{T,4}, x::CuArray{T,4}) where T
    # Nothing to write; a thread count of zero would also make `cld` below throw.
    isempty(y) && return y

    w, h = size(x, 1), size(x, 2)
    out_w, out_h = size(y, 1), size(y, 2)
    out_size = out_h * out_w

    # Factors mapping zero-based output coordinates onto input coordinates so
    # that the first and last samples coincide. With a single output row or
    # column the quotient would be x/0 (Inf or NaN), which the kernel cannot
    # pass to `floor(Int, ...)`; the factor is pinned to zero instead, so that
    # lone row/column samples input index 0.
    rheight = out_h > 1 ? T((h - 1) / (out_h - 1)) : zero(T)
    rwidth = out_w > 1 ? T((w - 1) / (out_w - 1)) : zero(T)

    # Compile without launching so the occupancy API can inspect the kernel.
    kernel = @cuda name="upsample_bilinear_whcn!" launch=false upsample_bilinear_whcn_kernel!(out_size, rheight, rwidth, x, y)
    config = launch_configuration(kernel.fun; max_threads=256)
    threads = Base.min(out_size, config.threads)
    blocks = cld(out_size, threads)
    kernel(out_size, rheight, rwidth, x, y; threads=threads, blocks=blocks)
    return y
end
143
+
144
# GPU method of `NNlib.∇upsample_bilinear_whcn!`: scatters the upstream
# gradient `Δ` into `dx` using bilinear weights and returns `dx`.
#
# One thread is requested per (column, row) position of `Δ`
# (`size(Δ, 1) * size(Δ, 2)` in total); the kernel loops over the third and
# fourth dimensions and adds into `dx` atomically.
# NOTE(review): `dx` is not cleared here and the kernel only accumulates, so
# the caller is presumably expected to pass it zero-initialised - confirm.
function NNlib.∇upsample_bilinear_whcn!(dx::CuArray{T,4}, Δ::CuArray{T,4}) where T
    # No gradient to scatter; a thread count of zero would also make `cld` throw.
    isempty(Δ) && return dx

    w, h = size(Δ, 1), size(Δ, 2)
    out_w, out_h = size(dx, 1), size(dx, 2)
    in_size = h * w

    # Reversed compared to the forward pass: the factors map coordinates of Δ
    # onto coordinates of dx. If Δ has a single row or column the quotient
    # would be x/0 (Inf or NaN), which the kernel cannot pass to
    # `floor(Int, ...)`; the factor is pinned to zero so that row/column maps
    # to index 0 of dx.
    rheight = h > 1 ? T((out_h - 1) / (h - 1)) : zero(T)
    rwidth = w > 1 ? T((out_w - 1) / (w - 1)) : zero(T)

    # Compile without launching so the occupancy API can inspect the kernel.
    kernel = @cuda name="∇upsample_bilinear_whcn!" launch=false ∇upsample_bilinear_whcn_kernel!(in_size, rheight, rwidth, Δ, dx)
    config = launch_configuration(kernel.fun; max_threads=256)
    threads = Base.min(in_size, config.threads)
    blocks = cld(in_size, threads)
    kernel(in_size, rheight, rwidth, Δ, dx; threads=threads, blocks=blocks)
    return dx
end
0 commit comments