diff --git a/Project.toml b/Project.toml
index 7639e89b0..80bec9943 100644
--- a/Project.toml
+++ b/Project.toml
@@ -15,6 +15,7 @@ Requires = "0.5, 1.0"
 julia = "1.3"
 
 [extras]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -23,4 +24,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["ChainRulesTestUtils", "FiniteDifferences", "Random", "StableRNGs", "Test", "Zygote"]
+test = ["ChainRulesTestUtils", "CUDA", "FiniteDifferences", "Random", "StableRNGs", "Test", "Zygote"]
diff --git a/src/NNlib.jl b/src/NNlib.jl
index 5f4851fd7..d5c89dff1 100644
--- a/src/NNlib.jl
+++ b/src/NNlib.jl
@@ -24,14 +24,13 @@ is_nnpack_available() = false
 end
 
 include("activations.jl")
-
 include("softmax.jl")
-include("misc.jl")
 include("batched/batchedmul.jl")
 include("gemm.jl")
 include("conv.jl")
 include("conv_bias_act.jl")
 include("pooling.jl")
+include("upsample.jl")
 
 ## Include implementations
 include("impl/padding_edges.jl")
diff --git a/src/misc.jl b/src/misc.jl
deleted file mode 100644
index 02c9fa53d..000000000
--- a/src/misc.jl
+++ /dev/null
@@ -1,25 +0,0 @@
-export pixel_shuffle
-
-"""
-    pixel_shuffle(x, r)
-
-Pixel shuffling operation. `r` is the upscale factor for shuffling.
-The operation converts an input of size [W,H,r²C,N] to size [rW,rH,C,N]
-Used extensively in super-resolution networks to upsample
-towards high resolution features.
-
-Reference : https://arxiv.org/pdf/1609.05158.pdf
-"""
-function pixel_shuffle(x::AbstractArray, r::Integer)
-    @assert ndims(x) > 2
-    d = ndims(x) - 2
-    sizein = size(x)[1:d]
-    cin, n = size(x, d+1), size(x, d+2)
-    @assert cin % r^d == 0
-    cout = cin ÷ r^d
-    # x = reshape(x, sizein..., fill(r, d)..., cout, n) # bug https://github.com/FluxML/Zygote.jl/issues/866
-    x = reshape(x, sizein..., ntuple(i->r, d)..., cout, n)
-    perm = [d+1:2d 1:d]' |> vec # = [d+1, 1, d+2, 2, ..., 2d, d]
-    x = permutedims(x, (perm..., 2d+1, 2d+2))
-    return reshape(x, ((r .* sizein)..., cout, n))
-end
diff --git a/src/upsample.jl b/src/upsample.jl
new file mode 100644
index 000000000..860e8441b
--- /dev/null
+++ b/src/upsample.jl
@@ -0,0 +1,248 @@
+export upsample_bilinear, ∇upsample_bilinear, pixel_shuffle
+
+"""
+    upsample_bilinear(x::AbstractArray{<:Number,4}, k::NTuple{2,Int})
+
+Upsamples the first 2 dimensions of the array `x` by the upsample factors stored in `k`,
+using bilinear interpolation.
+
+The size of the output is equal to
+`(k[1]*S1, k[2]*S2, S3, S4)`, where `S1, S2, S3, S4 = size(x)`.
+
+The interpolation grid is identical to the one used by `imresize` from `Images.jl`.
+
+Currently only 2d upsampling is supported.
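+
+For example, upsampling a single-channel 2×2 image by the factors `(3, 2)` yields a 6×4 output:
+
+```julia
+x = reshape(Float32[1 2; 3 4], (2, 2, 1, 1))
+y = upsample_bilinear(x, (3, 2))  # size(y) == (6, 4, 1, 1)
+```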
+""" +function upsample_bilinear(x::AbstractArray{T,4}, k::NTuple{2,Int}) where T + # This function is gpu friendly + + imgsize = size(x) + newsize = get_newsize(imgsize, k) + + # Get linear interpolation lower- and upper index, and weights + ilow1, ihigh1, wdiff1 = get_inds_and_ws(x, imgsize[1], newsize[1], 1) + ilow2, ihigh2, wdiff2 = get_inds_and_ws(x, imgsize[2], newsize[2], 2) + + # Adjust the upper interpolation indices of the second dimension + ihigh2_r = adjoint_of_idx(ilow2)[ihigh2] + + @inbounds y = @view(x[ilow1,ilow2,:,:]) .* (1 .- wdiff1) .+ @view(x[ihigh1,ilow2,:,:]) .* wdiff1 + @inbounds y .= y .* (1 .- wdiff2) .+ y[:,ihigh2_r,:,:] .* wdiff2 + # @inbounds y = y .* (1 .- wdiff2) .+ @view(y[:,ihigh2_r,:,:]) .* wdiff2 # equivalent to line above + return y +end + +function get_inds_and_ws(x::T, n::Int, m::Int, dim::Int) where T <: AbstractArray + # Creates interpolation grid for resampling. + # Creates the same grid as used in Image.jl `imresize`. + step = n // m + offset = (n + 1)//2 - step//2 - step * (m//2 - 1) + xq = clamp.(range(offset, step=step, length=m), 1, n) + + # Creates interpolation lower and upper indices, and broadcastable weights + ilow = floor.(Int, xq) + ihigh = ceil.(Int, xq) + sizew = ntuple(i-> i == dim ? length(xq) : 1, ndims(x)) + wdiff = convert(T, reshape(xq .- ilow, sizew)) # wdiff possibly lives on gpu + return ilow, ihigh, wdiff +end + +""" + adjoint_of_idx(idx::Vector{<:Integer}) + +# Arguments +- `idx`: a vector of indices from which you want the adjoint. + +# Outputs +-`idx_adjoint`: index that inverses the operation `x[idx]`. + +# Explanation +Determines the adjoint of the vector of indices `idx`, based on the following assumptions: +* `idx[1] == 1` +* `all(d in [0,1] for d in diff(idx))` +The adjoint of `idx` can be seen as an inverse operation such that: + +```julia +x = [1, 2, 3, 4, 5] +idx = [1, 2, 2, 3, 4, 4, 5] +idx_adjoint = adjoint_of_idx(idx) +@assert x[idx][idx_adjoint] == x +``` +The above holds as long as `idx` contains every index in `x`. +""" +function adjoint_of_idx(idx::Vector{Int}) + d = trues(length(idx)) + d[2:end] .= diff(idx) + idx_adjoint = findall(d) + return idx_adjoint +end + +function get_newsize(sz, k) + return ntuple(i -> i <= length(k) ? sz[i]*k[i] : sz[i], length(sz)) +end + + +""" + ∇upsample_bilinear(Δ::AbstractArray{<:Number,4}, k::NTuple{2,Int}) + +# Arguments +- `Δ`: array that has been upsampled using the upsample factors in `k` + +# Outputs +- `dx`: downsampled version of `Δ` + +# Explanation + +Custom adjoint for [`upsample_bilinear`](@ref). +The adjoint of upsampling is a downsampling operation, which +in this implementation is performed using `NNlib.conv` in combination with a downsampling kernel based on the +upsampling factors. Because of the zero-padding during convolution, the values at the boundary are polluted by edge-effects, +which have been corrected for manually. 
+""" +function ∇upsample_bilinear(Δ::AbstractArray{<:Number, 4}, k::NTuple{2,Int}) + # This function is gpu friendly + + # Be more efficient on some corner cases + if size(Δ, 1) == k[1] + Δ = sum(Δ, dims=1) + k = (1, k[2]) + end + if size(Δ, 2) == k[2] + Δ = sum(Δ, dims=2) + k = (k[1], 1) + end + if (size(Δ, 1) == 1) && (size(Δ, 2) == 1) + dx = Δ + return dx + end + + n_chan, n_batch = size(Δ, 3), size(Δ, 4) + + kern1 = get_downsamplekernel(Δ, k[1]) + kern2 = get_downsamplekernel(Δ, k[2]) + kern = kern1 * kern2' + + pad = (floor(Int, k[1]//2), floor(Int, k[2]//2)) + stride = k + + weight = similar(Δ, eltype(Δ), (size(kern)..., n_chan, n_chan)) + weight .= 0 + for i in 1:n_chan + weight[:,:,i,i] .= kern + end + # weight = cat(fill(kern, n_chan)..., dims=(3,4)) # slow + dx = conv(Δ, weight, pad=pad, stride=stride) + + # Still have to fix edge effects due to zero-padding of convolution, + # TODO: Could be circumvented by having padding that just extrapolates the value at the first/last index + # nextras = tuple((Int.(floor(factor//2)) for factor in k)...) + nextras = (floor(Int, k[1]//2), floor(Int, k[2]//2)) + + # First dimension edge-effect correction + if nextras[1] > 0 + kern1 = kern[1:nextras[1],:] + pad1 = (0, pad[2]) + stride1 = (1, stride[2]) + weight1 = similar(Δ, eltype(Δ), (size(kern1)..., n_chan, n_chan)) + weight1 .= 0 + for i in 1:n_chan + weight1[:,:,i,i] .= kern1 + end + # weight1 = cat(fill(kern1, n_chan)..., dims=(3,4)) # slow + dx[[1],:,:,:] .+= conv(Δ[1:nextras[1],:,:,:], weight1, pad=pad1, stride=stride1) + weight1 .= weight1[end:-1:1,:,:,:] + dx[[end],:,:,:] .+= conv(Δ[end-nextras[1]+1:end,:,:,:], weight1, pad=pad1, stride=stride1) + + ## Conv with views is not dispatched to CUDA.conv + # dx[[1],:,:,:] .+= conv(@view(Δ[1:nextras[1],:,:,:]), weight1, pad=pad1, stride=stride1) + # weight1 .= @view(weight1[end:-1:1,:,:,:]) + # dx[[end],:,:,:] .+= conv(@view(Δ[end-nextras[1]+1:end,:,:,:]), weight1, pad=pad1, stride=stride1) + end + + # Second dimension edge-effect correction + if nextras[2] > 0 + kern2 = kern[:,1:nextras[2]] + pad2 = (pad[1], 0) + stride2 = (stride[1], 1) + weight2 = similar(Δ, eltype(Δ), (size(kern2)..., n_chan, n_chan)) + weight2 .= 0 + for i in 1:n_chan + weight2[:,:,i,i] .= kern2 + end + # weight2 = cat(fill(kern2, n_chan)..., dims=(3,4)) # slow + + yy = conv(Δ[:,1:nextras[2],:,:], weight2, pad=pad2, stride=stride2) + dx[:,[1],:,:] .+= conv(Δ[:,1:nextras[2],:,:], weight2, pad=pad2, stride=stride2) + weight2 .= weight2[:,end:-1:1,:,:] + dx[:,[end],:,:] .+= conv(Δ[:,end-nextras[2]+1:end,:,:], weight2, pad=pad2, stride=stride2) + + ## Conv with views is not dispatched to CUDA.conv + # yy = conv(@view(Δ[:,1:nextras[2],:,:]), weight2, pad=pad2, stride=stride2) + # dx[:,[1],:,:] .+= conv(@view(Δ[:,1:nextras[2],:,:]), weight2, pad=pad2, stride=stride2) + # weight2 .= @view(weight2[:,end:-1:1,:,:]) + # dx[:,[end],:,:] .+= conv(@view(Δ[:,end-nextras[2]+1:end,:,:]), weight2, pad=pad2, stride=stride2) + end + + ## Finally fix four corners if needed + n1, n2 = nextras + if (n1 > 0) & (n2 > 0) + dx[1,1,:,:] .+= sum(kern[1:n1,1:n2] .* @view(Δ[1:n1,1:n2,:,:]), dims=(1,2))[1,1,:,:] + dx[1,end,:,:] .+= sum(kern[1:n1,end-n2+1:end] .* @view(Δ[1:n1,end-n2+1:end,:,:]), dims=(1,2))[1,1,:,:] + dx[end,end,:,:] .+= sum(kern[end-n1+1:end,end-n2+1:end] .* @view(Δ[end-n1+1:end,end-n2+1:end,:,:]), dims=(1,2))[1,1,:,:] + dx[end,1,:,:] .+= sum(kern[end-n1+1:end,1:n2] .* @view(Δ[end-n1+1:end,1:n2,:,:]), dims=(1,2))[1,1,:,:] + end + + return dx +end + +# `n` upsample factor for which a 
+# `n` upsample factor for which a downsample kernel will be determined.
+# Δ is given in case of necessity of gpu conversion
+function get_downsamplekernel(Δ, n::Int)
+    step = 1//n
+    if n % 2 == 0
+        start = step//2
+        upward = collect(start:step:1//1)
+        kernel = [upward; reverse(upward)]
+    else
+        start = step
+        upward = collect(start:step:1//1)
+        kernel = [upward; reverse(upward[1:end-1])]
+    end
+    # TODO there must be a more convenient way to send to gpu
+    kernel = convert(typeof(Δ), reshape(kernel, length(kernel), 1, 1, 1))
+    kernel = dropdims(kernel, dims=(2,3,4))
+    return kernel
+end
+
+function ChainRulesCore.rrule(::typeof(upsample_bilinear), x, k)
+    Ω = upsample_bilinear(x, k)
+    function upsample_bilinear_pullback(Δ)
+        (NO_FIELDS, ∇upsample_bilinear(Δ, k), DoesNotExist())
+    end
+    return Ω, upsample_bilinear_pullback
+end
+
+
+"""
+    pixel_shuffle(x, r)
+
+Pixel shuffling operation. `r` is the upscale factor for shuffling.
+The operation converts an input of size [W,H,r²C,N] to size [rW,rH,C,N].
+It is used extensively in super-resolution networks to upsample
+towards high-resolution features.
+
+Reference: https://arxiv.org/pdf/1609.05158.pdf
+"""
+function pixel_shuffle(x::AbstractArray, r::Integer)
+    @assert ndims(x) > 2
+    d = ndims(x) - 2
+    sizein = size(x)[1:d]
+    cin, n = size(x, d+1), size(x, d+2)
+    @assert cin % r^d == 0
+    cout = cin ÷ r^d
+    # x = reshape(x, sizein..., fill(r, d)..., cout, n) # bug https://github.com/FluxML/Zygote.jl/issues/866
+    x = reshape(x, sizein..., ntuple(i->r, d)..., cout, n)
+    perm = [d+1:2d 1:d]' |> vec # = [d+1, 1, d+2, 2, ..., 2d, d]
+    x = permutedims(x, (perm..., 2d+1, 2d+2))
+    return reshape(x, ((r .* sizein)..., cout, n))
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 0ef0676fc..16cb3394e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,6 +6,8 @@ using FiniteDifferences: FiniteDifferenceMethod, central_fdm
 import Zygote
 using Zygote: gradient
 using StableRNGs
+using CUDA
+CUDA.allowscalar(false)
 
 const rng = StableRNG(123)
 
@@ -36,6 +38,6 @@ end
     include("softmax.jl")
 end
 
-@testset "Misc Stuff" begin
-    include("misc.jl")
+@testset "Upsampling" begin
+    include("upsample.jl")
 end
diff --git a/test/misc.jl b/test/upsample.jl
similarity index 60%
rename from test/misc.jl
rename to test/upsample.jl
index 2f2d724da..6d07a1344 100644
--- a/test/misc.jl
+++ b/test/upsample.jl
@@ -1,3 +1,32 @@
+@testset "upsample_bilinear 2d" begin
+    x = reshape(Float32[1. 2.; 3. 4.], (2,2,1,1))
+    y_true = [1//1  5//4   7//4   2//1;
+              1//1  5//4   7//4   2//1;
+              5//3  23//12 29//12 8//3;
+              7//3  31//12 37//12 10//3;
+              3//1  13//4  15//4  4//1;
+              3//1  13//4  15//4  4//1][:,:,:,:]
+
+    y = upsample_bilinear(x, (3, 2))
+    @test size(y) == size(y_true)
+    @test eltype(y) == Float32
+    @test y ≈ y_true
+
+    gradtest(x->upsample_bilinear(x, (3, 2)), x, atol=1e-4)
+
+    if CUDA.has_cuda()
+        y = upsample_bilinear(x |> cu, (3, 2))
+        @test y isa CuArray
+        @test Array(y) ≈ y_true
+        g_gpu = Zygote.gradient(x -> sum(sin.(upsample_bilinear(x, (3, 2))))
+                                , x |> cu)[1]
+        @test g_gpu isa CuArray
+        g_cpu = Zygote.gradient(x -> sum(sin.(upsample_bilinear(x, (3, 2))))
+                                , x)[1]
+        @test Array(g_gpu) ≈ g_cpu atol=1e-4
+    end
+end
+
 @testset "pixel_shuffle" begin
     x = reshape(1:16, (2, 2, 4, 1))
     # [:, :, 1, 1] =
@@ -61,3 +90,4 @@
         gradtest(x -> pixel_shuffle(x, r), x)
     end
 end
+
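As a quick, illustrative sketch of the exported `upsample_bilinear` and `pixel_shuffle` (output sizes follow the docstrings above; exact values depend on the interpolation grid and the input):

```julia
using NNlib

x = rand(Float32, 2, 2, 1, 1)
y = upsample_bilinear(x, (3, 2))   # size(y) == (6, 4, 1, 1)

p = rand(Float32, 3, 3, 4, 1)      # 4 = 2^2 * 1 channels
q = pixel_shuffle(p, 2)            # size(q) == (6, 6, 1, 1)
```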