
Commit 4e2d74a

Fix GPU getindex (#53)
* delete ambiguous GPU getindex methods, and add tests
* restore special method for row indexing
* similar and copyto! to make convert(AbstractArray{Float32}, cx) work
* copyto! method using Adapt to move to the CPU
* fix & test invoke case
* v0.2.8
1 parent ebed6ab commit 4e2d74a
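For orientation, here is a minimal sketch of the behaviour this commit targets, mirroring the "gpu indexing" testset below. It is written against JLArrays so it runs without a GPU; the `jl` converter stands in for CUDA's `cu`, just as the test harness does when CUDA is not functional.

using OneHotArrays, JLArrays         # JLArray: a CPU-backed stand-in for a GPU array
using JLArrays: @allowscalar
JLArrays.allowscalar(false)          # error on scalar indexing, as on a real GPU

x  = onehotbatch([1, 2, 3, 2], 1:3)  # 3×4 OneHotMatrix
cx = jl(x)                           # move the underlying index vector to the "GPU"

cx[:, 1:2]                           # still a OneHotMatrix, backed by a JLArray
cx[2, :]                             # a row is a plain JLArray{Bool}, not one-hot
@allowscalar cx[2, 2]                # reading a single entry needs @allowscalar
collect(cx)                          # broken on v0.2.7, now matches collect(x)
convert(AbstractArray{Float32}, cx)  # now a JLArray{Float32}, via similar + copyto!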

5 files changed, +47 −8 lines changed
Project.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 name = "OneHotArrays"
 uuid = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
-version = "0.2.7"
+version = "0.2.8"

 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"

src/array.jl

Lines changed: 14 additions & 6 deletions
@@ -64,13 +64,9 @@ Base.size(x::OneHotArray) = (x.nlabels, size(x.indices)...)

 function Base.getindex(x::OneHotArray{<:Any, N}, i::Int, I::Vararg{Int, N}) where N
   @boundscheck (1 <= i <= x.nlabels) || throw(BoundsError(x, (i, I...)))
-  return x.indices[I...] .== i
+  return x.indices[I...] == i
 end
-# the method above is faster on the CPU but will scalar index on the GPU
-# so we define the method below to pass the extra indices directly to GPU array
-function Base.getindex(x::OneHotArray{<:Any, N, <:Any, <:AbstractGPUArray},
-                       i::Int,
-                       I::Vararg{Any, N}) where N
+function Base.getindex(x::OneHotArray{<:Any, N}, i::Int, I::Vararg{Any, N}) where N
   @boundscheck (1 <= i <= x.nlabels) || throw(BoundsError(x, (i, I...)))
   return x.indices[I...] .== i
 end
@@ -80,6 +76,18 @@ end
 Base.getindex(x::OneHotArray, ::Colon) = BitVector(reshape(x, :))
 Base.getindex(x::OneHotArray{<:Any, N}, ::Colon, ::Vararg{Colon, N}) where N = x

+Base.similar(x::OneHotArray{<:Any,<:Any,<:Any,<:AbstractArray}, ::Type{T}, size::Base.Dims) where T =
+  similar(x.indices, T, size)
+
+function Base.copyto!(dst::AbstractArray{T,N}, src::OneHotArray{<:Any,<:Any,N,<:AbstractArray}) where {T,N}
+  size(dst) == size(src) || return invoke(copyto!, Tuple{typeof(dst), AbstractArray{Bool,N}}, dst, src)
+  dst .= reshape(src.indices, 1, size(src.indices)...) .== (1:src.nlabels)
+  return dst
+end
+function Base.copyto!(dst::Array{T,N}, src::OneHotArray{<:Any,<:Any,N,<:AnyGPUArray}) where {T,N}
+  copyto!(dst, adapt(Array, src))
+end
+
 function Base.showarg(io::IO, x::OneHotArray, toplevel)
   print(io, ndims(x) == 1 ? "OneHotVector(" : ndims(x) == 2 ? "OneHotMatrix(" : "OneHotArray(")
   Base.showarg(io, x.indices, false)
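Why `similar` plus `copyto!` is enough to make `convert(AbstractArray{Float32}, cx)` work: Base's generic `AbstractArray{T}` constructor allocates the destination with `similar` and then fills it with `copyto!`, so the two new methods above are exactly the hooks it needs. A rough sketch of the same steps done by hand, on the CPU for simplicity (variable names are illustrative, not from the package):

using OneHotArrays

x = onehotbatch([1, 2, 3, 2], 1:3)   # 3×4 OneHotMatrix, indices are a plain Vector

# What convert(AbstractArray{Float32}, x) roughly boils down to:
dst = similar(x, Float32, size(x))   # new method: similar(x.indices, Float32, size(x))
copyto!(dst, x)                      # new method: a single broadcast, no scalar indexing
# the broadcast it performs:  dst .= reshape(x.indices, 1, 4) .== (1:3)

dst == Float32.(collect(x))          # true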

test/array.jl

Lines changed: 3 additions & 0 deletions
@@ -38,6 +38,9 @@ end
   # linear indexing
   @test om[11] == om[1, 2]
   @test oa[52] == oa[2, 1, 2]
+  @test copyto!(rand(50,1), om) == reshape(om,:,1)  # hits invoke path
+  @test copyto!(rand(51,1), om)[1:50] == vec(om)
+  @test_throws BoundsError copyto!(rand(49,1), om)

   # bounds checks
   @test_throws BoundsError ov[0]
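The "hits invoke path" comment refers to the size-mismatch branch of the new `copyto!` method: when `size(dst) != size(src)` it forwards to Base's generic `copyto!(dst, ::AbstractArray{Bool,N})` via `invoke`, which copies element by element, leaves any extra destination slots untouched, and throws a BoundsError when the destination is too small. A small CPU-only illustration, with shapes chosen to mirror the tests above:

using OneHotArrays

om = onehotbatch(rand(1:10, 5), 1:10)   # 10×5 OneHotMatrix, 50 elements in total

copyto!(rand(50, 1), om)                # sizes differ -> invoke branch fills all 50 slots
copyto!(rand(51, 1), om)                # the 51st slot keeps its old random value
# copyto!(rand(49, 1), om)              # would throw BoundsError: destination too small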

test/gpu.jl

Lines changed: 27 additions & 1 deletion
@@ -1,5 +1,6 @@

 # Tests from Flux, probably not the optimal testset organisation!
+# (When CUDA is not available, these are run with JLArrays)

 @testset "CUDA" begin
   x = randn(5, 5)
@@ -18,14 +19,39 @@
   @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys)
 end

+@testset "gpu indexing" begin
+  x = onehotbatch([1, 2, 3, 2], 1:3)
+  cx = cu(x)
+
+  # These worked on OneHotArrays v0.2.7
+  @test cx[:, 1:2] isa OneHotMatrix
+  @test cx[:, 1:2].indices isa CuArray
+
+  @test @allowscalar cx[:,1] isa OneHotVector  # column, needs @allowscalar on v0.2.7
+  @test @allowscalar cx[:,1].indices isa Integer
+  @test collect(@allowscalar cx[:,end]) == [0,1,0]
+
+  @test cx[2,:] isa CuArray{Bool}  # row, is not onehot!
+  @test sum(cx[2,:]) == 2
+  @test collect(cx[2,:]) == x[2,:]
+
+  # These were broken on OneHotArrays v0.2.7
+  @test @allowscalar cx[2,2] == x[2,2]
+  @test collect(cx) == collect(x)
+  @test Matrix(cx) == Matrix(x) == collect(x)
+  @test Array{Float32}(cx) == Array{Float32}(x) == collect(x)
+  @test convert(AbstractArray{Float32}, cx) isa CuArray{Float32}
+  @test collect(convert(AbstractArray{Float32}, cx)) == collect(x)
+end
+
 @testset "onehot gpu" begin
   y = onehotbatch(ones(3), 1:2) |> cu;
   @test (repr("text/plain", y); true)

   gA = rand(3, 2) |> cu;

   #NOTE: this would require something that can copute gradient... we don't have that here?
-  #@test gradient(A -> sum(A * y), gA)[1] isa CuArray
+  #@test gradient(A -> sum(A * y), gA)[1] isa CuArray

   # some specialized implementations call only mul! and not *, so we must ensure this works
   @test LinearAlgebra.mul!(similar(gA, 3, 3), gA, y) ≈ gA*y

test/runtests.jl

Lines changed: 2 additions & 0 deletions
@@ -19,11 +19,13 @@ import CUDA
 if CUDA.functional()
   using CUDA # exports CuArray, etc
   CUDA.allowscalar(false)
+  using CUDA: @allowscalar
   @info "starting CUDA tests"
 else
   @info "CUDA not functional, testing with JLArrays instead"
   using JLArrays # fake GPU array, for testing
   JLArrays.allowscalar(false)
+  using JLArrays: @allowscalar
   cu = jl
   CuArray{T,N} = JLArray{T,N}
 end
