Commit ee78ce3

CarloLucibello, DhairyaLGandhi, and ToucheSir authored
fix cpu(x) for immutable arrays (#2117)
* fix cpu(x) for immutable arrays

* Update test/cuda/cuda.jl

  Co-authored-by: Dhairya Gandhi <dhairya@juliacomputing.com>

* Update test/cuda/cuda.jl

  Co-authored-by: Brian Chen <ToucheSir@users.noreply.github.com>

* rrules for adapt

* do not unthunk if not needed

* add comment on adapt rrules

Co-authored-by: Dhairya Gandhi <dhairya@juliacomputing.com>
Co-authored-by: Brian Chen <ToucheSir@users.noreply.github.com>
1 parent 6f24d3a commit ee78ce3
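
A minimal sketch of what this fix enables (hypothetical REPL session; it assumes Flux with this patch, CUDA.jl, and a working GPU, and reuses the A2116 struct added in the new test below):

using Flux, CUDA

struct A2116            # plain isbits struct, as in the test for issue #2116
    x::Int
    y::Int
end

x = [A2116(1, 1), A2116(2, 2)]
xgpu = gpu(x)           # a CuVector{A2116}
xcpu = cpu(xgpu)        # a Vector{A2116}; per the commit title, cpu previously mishandled arrays of immutable structs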

File tree

3 files changed: +37 -8 lines

src/functor.jl

Lines changed: 20 additions & 3 deletions
@@ -121,14 +121,31 @@ adapt_storage(to::FluxCPUAdaptor, x::AbstractSparseArray) = x
 adapt_storage(to::FluxCPUAdaptor, x::CUDA.RNG) = Random.default_rng()
 adapt_storage(to::FluxCPUAdaptor, x::AbstractRNG) = x
 
+# PIRACY, should be defined in CUDA.jl
 function ChainRulesCore.rrule(::Type{Array}, x::CUDA.CuArray)
-  Array(x), dx -> (NoTangent(), CUDA.cu(unthunk(dx)),)
+  Array(x), dx -> (NoTangent(), CUDA.cu(unthunk(dx)))
 end
 
 function ChainRulesCore.rrule(::typeof(Adapt.adapt_storage), to::FluxCPUAdaptor, x::CUDA.AbstractGPUArray)
-  adapt_storage(to, x), dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx)),)
+  adapt_storage(to, x), dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx)))
 end
 
+# The following rrules for adapt are here to avoid double wrapping issues
+# as seen in https://github.com/FluxML/Flux.jl/pull/2117#discussion_r1027321801
+
+ChainRulesCore.rrule(::typeof(adapt), a::FluxCPUAdaptor, x::AnyCuArray) =
+  adapt(a, x), Δ -> (NoTangent(), NoTangent(), adapt(FluxCUDAAdaptor(), unthunk(Δ)))
+
+ChainRulesCore.rrule(::typeof(adapt), a::FluxCPUAdaptor, x::AbstractArray) =
+  adapt(a, x), Δ -> (NoTangent(), NoTangent(), Δ)
+
+ChainRulesCore.rrule(::typeof(adapt), a::FluxCUDAAdaptor, x::AnyCuArray) =
+  adapt(a, x), Δ -> (NoTangent(), NoTangent(), Δ)
+
+ChainRulesCore.rrule(::typeof(adapt), a::FluxCUDAAdaptor, x::AbstractArray) =
+  adapt(a, x), Δ -> (NoTangent(), NoTangent(), adapt(FluxCPUAdaptor(), unthunk(Δ)))
+
+
 # CPU/GPU movement conveniences
 
 """
@@ -154,7 +171,7 @@ julia> typeof(m_cpu.W)
 Matrix{Float32}
 ```
 """
-cpu(x) = fmap(x -> adapt(FluxCPUAdaptor(), x), x)
+cpu(x) = fmap(x -> adapt(FluxCPUAdaptor(), x), x, exclude = _isleaf)
 
 _isbitsarray(::AbstractArray{<:Number}) = true
 _isbitsarray(::AbstractArray{T}) where T = isbitstype(T)
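
For context on the new `exclude = _isleaf` keyword: a rough, hedged illustration of why arrays of immutable structs are treated as leaves, assuming `_isleaf` builds on the `_isbitsarray` helper shown above (its exact definition is not part of this diff) and reusing the A2116 struct from the tests below:

struct A2116            # the struct added in the tests below
    x::Int
    y::Int
end

isbitstype(A2116)                # true, so a Vector{A2116} satisfies _isbitsarray
isbitstype(CartesianIndex{1})    # also true, covering the CartesianIndex test case
# With exclude = _isleaf, fmap stops at such arrays and hands them to adapt whole,
# instead of recursing into their immutable elements.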

test/cuda/cuda.jl

Lines changed: 16 additions & 4 deletions
@@ -20,7 +20,7 @@ using SparseArrays: sparse, SparseMatrixCSC, AbstractSparseArray
 m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
 cm = gpu(m)
 
-@test all(p isa CuArray for p in params(cm))
+@test all(p isa CuArray for p in Flux.params(cm))
 @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}
 
 xs = rand(5, 5)
@@ -65,7 +65,7 @@ end
 end
 
 @testset "onehot forward map to broadcast" begin
-oa = OneHotArray(rand(1:10, 5, 5), 10) |> gpu
+oa = Flux.OneHotArray(rand(1:10, 5, 5), 10) |> gpu
 @test all(map(identity, oa) .== oa)
 @test all(map(x -> 2 * x, oa) .== 2 .* oa)
 end
@@ -110,14 +110,14 @@ end
 # This test should really not go through indirections and pull out Fills for efficiency
 # but we forcefully materialise. TODO: remove materialising CuArray here
 @test gradient(x -> sum(cpu(x)), ca)[1] isa CuArray # This involves FillArray, which should be GPU compatible
-@test gradient(x -> sum(cpu(x)), ca')[1] isa LinearAlgebra.Adjoint
+@test gradient(x -> sum(cpu(x)), ca')[1] isa CuArray
 
 # Even more trivial: no movement
 @test gradient(x -> sum(abs, cpu(x)), a)[1] isa Matrix
 @test gradient(x -> sum(abs, cpu(x)), a')[1] isa Matrix
 @test gradient(x -> sum(cpu(x)), a)[1] isa typeof(gradient(sum, a)[1]) # FillArray
 @test gradient(x -> sum(abs, gpu(x)), ca)[1] isa CuArray
-@test_skip gradient(x -> sum(abs, gpu(x)), ca')[1] isa CuArray # KernelError: passing and using non-bitstype argument
+@test gradient(x -> sum(abs, gpu(x)), ca')[1] isa CuArray
 
 # More complicated, Array * CuArray is an error
 g0 = gradient(x -> sum(abs, (a * (a * x))), a)[1]
@@ -165,4 +165,16 @@ end
 @test gpu(g2) isa CuArray
 @test gpu(g2) ≈ cu(Vector(g2))
 @test parent(gpu(g3)) isa CuArray
+
+
+#Issue #2116
+struct A2116
+  x::Int
+  y::Int
+end
+x = [A2116(1,1), A2116(2,2)]
+xgpu = gpu(x)
+@test xgpu isa CuVector{A2116}
+@test cpu(xgpu) isa Vector{A2116}
+@test cpu(gpu([CartesianIndex(1)])) isa Vector{CartesianIndex{1}}
 end
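
The gradient tests above exercise the adapt rrules added in src/functor.jl. A hedged sketch of the expected behaviour (assumes Flux with this patch, CUDA.jl, Zygote, and a working GPU; the expected types mirror the tests rather than a general guarantee):

using Flux, CUDA, Zygote

ca = cu(rand(Float32, 3, 3))

# Differentiating through device movement should hand back an array on the input's
# device, without the double wrapping mentioned in the source comment.
gradient(x -> sum(cpu(x)), ca)[1]         # expected: a CuArray
gradient(x -> sum(cpu(x)), ca')[1]        # expected: a CuArray (was a LinearAlgebra.Adjoint before this change)
gradient(x -> sum(abs, gpu(x)), ca')[1]   # expected: a CuArray (this case was previously @test_skip)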

test/cuda/runtests.jl

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 using Flux, Test, CUDA
 using Zygote
 using Zygote: pullback
-using Random
+using Random, LinearAlgebra, Statistics
 
 @info "Testing GPU Support"
 CUDA.allowscalar(false)
