Optimize accumulate/logical indexing.

maleadt · maleadt · commit 9459fce88bf6 · 2019-10-10T08:14:44.000+02:00
Use the launch configuration API to choose thread and block counts instead of hard-coding 256 threads, and eagerly put temporary arrays back in the pool (via `unsafe_free!`).
diff --git a/src/accumulate.jl b/src/accumulate.jl
@@ -25,33 +25,37 @@ function Base._accumulate!(op::Function, vout::CuVector{T}, v::CuVector, dims::N
     Δ = 1   # Δ = 2^d
     n = ceil(Int, log2(length(v)))
 
-    num_threads = 256
-    num_blocks = ceil(Int, length(v) / num_threads)
+    # partial accumulation pass: reads `vin`, writes `vout`, combining elements Δ apart
+    function kernel(op, vout, vin, Δ)
+        i = threadIdx().x + (blockIdx().x - 1) * blockDim().x  # global thread/element index
 
-    for d in 0:n   # passes through data
-        @cuda blocks=num_blocks threads=num_threads _partial_accumulate!(op, vout, vin, Δ)
+        @inbounds if i <= length(vin)  # threads past the end of `vin` do nothing
+            if i > Δ
+                vout[i] = op(vin[i - Δ], vin[i])  # fold in the element Δ positions earlier
+            else
+                vout[i] = vin[i]  # the first Δ elements are copied through unchanged
+            end
+        end
 
-        vin, vout = vout, vin
-        Δ *= 2
+        return
     end
 
-    return vin
-end
+    function configurator(kernel)  # callback passed to `@cuda config=...` below
+        fun = kernel.fun
+        config = launch_configuration(fun)  # query launch parameters for this kernel
+        blocks = cld(length(v), config.threads)  # one thread per element of `v`, rounded up
 
-function _partial_accumulate!(op, vout, vin, Δ)
-    @inbounds begin
-        k = threadIdx().x + (blockIdx().x - 1) * blockDim().x
+        return (threads=config.threads, blocks=blocks)  # named tuple of launch parameters
+    end
 
-        if k <= length(vin)
-            if k > Δ
-                vout[k] = op(vin[k - Δ], vin[k])
-            else
-                vout[k] = vin[k]
-            end
-        end
+    for d in 0:n   # passes through data
+        @cuda config=configurator kernel(op, vout, vin, Δ)
+
+        vin, vout = vout, vin
+        Δ *= 2
     end
 
-    return
+    return vin
 end
 
 Base.accumulate_pairwise!(op, result::CuVector, v::CuVector) = accumulate!(op, result, v)
diff --git a/src/indexing.jl b/src/indexing.jl
@@ -23,9 +23,6 @@ function Base.getindex(xs::CuArray{T}, bools::CuArray{Bool}) where {T}
   ys = CuArray{T}(undef, n)
 
   if n > 0
-    num_threads = min(n, 256)
-    num_blocks = ceil(Int, length(indices) / num_threads)
-
     function kernel(ys::CuDeviceArray{T}, xs::CuDeviceArray{T}, bools, indices)
         i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
 
@@ -38,9 +35,19 @@ function Base.getindex(xs::CuArray{T}, bools::CuArray{Bool}) where {T}
         return
     end
 
-    @cuda blocks=num_blocks threads=num_threads kernel(ys, xs, bools, indices)
+    function configurator(kernel)  # callback passed to `@cuda config=...` below
+        fun = kernel.fun
+        config = launch_configuration(fun)  # query launch parameters for this kernel
+        blocks = cld(length(indices), config.threads)  # one thread per index, rounded up
+
+        return (threads=config.threads, blocks=blocks)  # named tuple of launch parameters
+    end
+
+    @cuda config=configurator kernel(ys, xs, bools, indices)
   end
 
+  unsafe_free!(indices)
+
   return ys
 end