diff --git a/LICENSE.md b/LICENSE.md
index 4529a5eb5d..b6a6b92559 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,6 +1,6 @@
 The CUDA.jl package is licensed under the MIT "Expat" License:
 
-> Copyright (c) 2019-2020: Julia Computing and other contributors
+> Copyright (c) 2019-present: Julia Computing and other contributors
 > Copyright (c) 2014-2018: Tim Besard
 > Copyright (c) 2013: Dahua Lin
 >
diff --git a/src/nnlib.jl b/src/nnlib.jl
index 1cf65bdd26..8d5acbacdc 100644
--- a/src/nnlib.jl
+++ b/src/nnlib.jl
@@ -29,3 +29,161 @@ NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number,
 
 Base.unsafe_convert(::Type{CuPtr{T}}, A::NNlib.BatchedAdjOrTrans{T}) where {T} =
   Base.unsafe_convert(CuPtr{T}, parent(A))
+
+
+#
+# Upsampling
+#
+
+# GPU-based bilinear upsampling, including its gradient.
+#
+# The code is a translation of the Caffe2 implementation in the following files:
+# - https://github.com/pytorch/pytorch/blob/v1.8.0-rc1/caffe2/operators/upsample_op.cu
+# - https://github.com/pytorch/pytorch/blob/v1.8.0-rc1/caffe2/core/common_gpu.h
+#
+# Copyright (c) 2016-2021 Facebook Inc.
+# Copyright (c) 2015 Google Inc.
+# Copyright (c) 2015 Yangqing Jia
+# Copyright 2019-2020 Kakao Brain
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are
+# permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this list of
+#    conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice, this list of
+#    conditions and the following disclaimer in the documentation and/or other materials
+#    provided with the distribution.
+#
+# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America and
+#    IDIAP Research Institute nor the names of its contributors may be used to endorse or
+#    promote products derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# The forward and backward passes have been tested to produce the same output as
+# PyTorch with align_corners=True; results agree up to floating-point noise.
+
+function upsample_bilinear_whcn_kernel!(n_elem, rheight, rwidth, x, y)
+    index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
+
+    if index < n_elem
+        in_w, in_h, channels, batchsize = size(x)
+        out_w, out_h, _, _ = size(y)
+
+        # 0-based output coordinates of this thread's pixel
+        ow = index % out_w
+        oh = index ÷ out_w
+
+        # Compute Y axis lambdas (interpolation weights)
+        real_index = rheight*oh
+        ih0 = Base.floor(Int, real_index)
+        offset = (ih0 < in_h-1) ? 1 : 0
+        ih1 = ih0 + offset + 1
+        h1lambda = real_index - ih0
+        h0lambda = 1 - h1lambda
+        ih0 += 1  # back to 1-based indexing
+
+        # Compute X axis lambdas
+        real_index = rwidth*ow
+        iw0 = Base.floor(Int, real_index)
+        offset = (iw0 < in_w-1) ? 1 : 0
+        iw1 = iw0 + offset + 1
+        w1lambda = real_index - iw0
+        w0lambda = 1 - w1lambda
+        iw0 += 1
+
+        @inbounds for n in 1:batchsize
+            for c in 1:channels
+                val = h0lambda * (w0lambda * x[iw0, ih0, c, n]  + # h0 * w0 * i00
+                                  w1lambda * x[iw1, ih0, c, n]) + # h0 * w1 * i01
+                      h1lambda * (w0lambda * x[iw0, ih1, c, n]  + # h1 * w0 * i10
+                                  w1lambda * x[iw1, ih1, c, n])   # h1 * w1 * i11
+                y[ow+1, oh+1, c, n] = val
+            end
+        end
+    end
+    return nothing
+end
+
+# Δ is the gradient backpropagated from downstream layers
+function ∇upsample_bilinear_whcn_kernel!(n_elem, rheight, rwidth, Δ, dx)
+    index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
+
+    if index < n_elem
+        in_width, in_height, channels, batchsize = size(Δ)
+        out_width, out_height, _, _ = size(dx)
+
+        iw = index % in_width
+        ih = index ÷ in_width
+
+        # Compute Y axis lambdas
+        real_index_h = rheight*ih
+        oh0 = Base.floor(Int, real_index_h)
+        offset = (oh0 < out_height-1) ? 1 : 0
+        oh1 = oh0 + offset + 1
+        h1lambda = real_index_h - oh0
+        h0lambda = 1 - h1lambda
+        oh0 += 1
+
+        # Compute X axis lambdas
+        real_index_w = rwidth * iw
+        ow0 = Base.floor(Int, real_index_w)
+        offset = (ow0 < out_width - 1) ? 1 : 0
+        ow1 = ow0 + offset + 1
+        w1lambda = real_index_w - ow0
+        w0lambda = 1 - w1lambda
+        ow0 += 1
+
+        # Each input pixel scatters its gradient to the four output pixels it
+        # was interpolated from; @atomic avoids write races between threads.
+        @inbounds for n in 1:batchsize
+            for c in 1:channels
+                val = Δ[iw+1, ih+1, c, n]
+                @atomic dx[ow0, oh0, c, n] += h0lambda * w0lambda * val
+                @atomic dx[ow1, oh0, c, n] += h0lambda * w1lambda * val
+                @atomic dx[ow0, oh1, c, n] += h1lambda * w0lambda * val
+                @atomic dx[ow1, oh1, c, n] += h1lambda * w1lambda * val
+            end
+        end
+    end # if
+    return nothing
+end
+
+function NNlib.upsample_bilinear_whcn!(y::CuArray{T,4}, x::CuArray{T,4}) where T
+    w, h, c, n = size(x)
+    out_w, out_h = (size(y, 1), size(y, 2))
+
+    out_size = out_h*out_w
+    rheight = T((h-1)/(out_h-1))
+    rwidth = T((w-1)/(out_w-1))
+
+    kernel = @cuda launch=false upsample_bilinear_whcn_kernel!(out_size, rheight, rwidth, x, y)
+    config = launch_configuration(kernel.fun; max_threads=256)
+    threads = Base.min(out_size, config.threads)
+    blocks = cld(out_size, threads)
+    kernel(out_size, rheight, rwidth, x, y; threads=threads, blocks=blocks)
+    return y
+end
+
+function NNlib.∇upsample_bilinear_whcn!(dx::CuArray{T,4}, Δ::CuArray{T,4}) where T
+    w, h, c, n = Base.size(Δ)
+    out_w, out_h = (size(dx, 1), size(dx, 2))
+    in_size = h*w
+    rheight = T((out_h-1)/(h-1))  # reversed compared to the forward pass
+    rwidth = T((out_w-1)/(w-1))
+
+    kernel = @cuda launch=false ∇upsample_bilinear_whcn_kernel!(in_size, rheight, rwidth, Δ, dx)
+    config = launch_configuration(kernel.fun; max_threads=256)
+    threads = Base.min(in_size, config.threads)
+    blocks = cld(in_size, threads)
+    kernel(in_size, rheight, rwidth, Δ, dx; threads=threads, blocks=blocks)
+    return dx
+end
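# ---------------------------------------------------------------------------
# Editorial sketch (not part of the diff): both wrappers above follow the same
# occupancy-driven launch pattern -- compile with `@cuda launch=false`, query
# `launch_configuration`, then launch. A minimal, self-contained version of
# that pattern, assuming only CUDA.jl (`fill_kernel!` is a hypothetical
# example kernel, not from this PR):
#
#     using CUDA
#
#     function fill_kernel!(n, v, y)
#         i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
#         if i <= n
#             @inbounds y[i] = v  # one thread per element, guarded against overrun
#         end
#         return nothing
#     end
#
#     y = CUDA.zeros(Float32, 10_000)
#     n = length(y)
#     kernel = @cuda launch=false fill_kernel!(n, 1f0, y)  # compile without launching
#     config = launch_configuration(kernel.fun; max_threads=256)
#     threads = Base.min(n, config.threads)
#     blocks = cld(n, threads)
#     kernel(n, 1f0, y; threads=threads, blocks=blocks)    # launch
# ---------------------------------------------------------------------------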
diff --git a/test/nnlib.jl b/test/nnlib.jl
index 8f57e27f90..0e3020e2cc 100644
--- a/test/nnlib.jl
+++ b/test/nnlib.jl
@@ -62,3 +62,30 @@ end
     @test testf(x -> logσ.(x), rand(5))
   end
 end
+
+@testset "Bilinear upsampling" begin
+  x = Float32[1 2; 3 4][:,:,:,:]
+  x = cat(x, x; dims=3)
+  x = cat(x, x; dims=4)
+  x = cu(x)
+
+  y_true = Float32[ 1//1  4//3   5//3   2//1;
+                    7//5 26//15 31//15 12//5;
+                    9//5 32//15 37//15 14//5;
+                   11//5 38//15 43//15 16//5;
+                   13//5 44//15 49//15 18//5;
+                    3//1 10//3  11//3   4//1]
+  y_true = cat(y_true, y_true; dims=3)
+  y_true = cat(y_true, y_true; dims=4)
+  y_true = cu(y_true)
+
+  y = upsample_bilinear(x, (3,2))
+
+  @test size(y) == size(y_true)
+  @test eltype(y) == Float32
+  @test y ≈ y_true
+
+  o = CUDA.ones(Float32, 6, 4, 2, 1)
+  grad_true = 6*CUDA.ones(Float32, 2, 2, 2, 1)
+  @test ∇upsample_bilinear(o; size=(2,2)) ≈ grad_true
+end
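# ---------------------------------------------------------------------------
# Editorial sketch (not part of the diff): end to end, the new kernels are
# reached through NNlib's public API, just as the tests above do. A minimal
# usage sketch, assuming CUDA.jl and NNlib are loaded:
#
#     using CUDA, NNlib
#
#     x  = cu(rand(Float32, 2, 2, 1, 1))       # WHCN layout
#     y  = upsample_bilinear(x, (3, 2))        # forward: 2x2 -> 6x4
#     Δ  = CUDA.ones(Float32, size(y))
#     dx = ∇upsample_bilinear(Δ; size=(2, 2))  # backward: gradient w.r.t. x
# ---------------------------------------------------------------------------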