This repository was archived by the owner on Mar 12, 2021. It is now read-only.

Commit 87733ac: merge with master
1 parent: 034a52c
32 files changed, +569 and -196 lines

.github/FUNDING.yml (1 addition, 0 deletions)

```diff
@@ -0,0 +1 @@
+custom: https://numfocus.salsalabs.org/donate-to-julia/index.html
```

.gitlab-ci.yml (0 additions, 1 deletion)

```diff
@@ -1,6 +1,5 @@
 variables:
   CI_IMAGE_TAG: 'cuda'
-  CI_DEV_PKGS: 'CUDAapi GPUArrays CUDAnative NNlib CUDAdrv'
   JULIA_NUM_THREADS: '4'
 
 include:
```

Project.toml (4 additions, 4 deletions)

```diff
@@ -1,6 +1,6 @@
 name = "CuArrays"
 uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
-version = "1.0.2"
+version = "2.0.0"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
@@ -31,7 +31,7 @@ test = ["Test", "FFTW", "ForwardDiff"]
 julia = "1.0"
 CUDAnative = "2.0"
 CUDAdrv = "3.0"
-CUDAapi = "0.5.3, 0.6"
-NNlib = "0.5, 0.6"
-GPUArrays = "0.7"
+CUDAapi = "0.5.3, 0.6, 1.0"
+NNlib = "0.6"
+GPUArrays = "0.7.1"
 Adapt = "0.4"
```
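This bumps CuArrays to a new major version and adjusts the compat bounds (CUDAapi gains 1.0, NNlib drops 0.5, GPUArrays requires at least 0.7.1). A hedged sketch of picking up the release in a downstream environment, using only the standard Pkg API:

```julia
# Hypothetical downstream upgrade; assumes CuArrays 2.0.0 is available in the registry.
using Pkg
Pkg.add(PackageSpec(name = "CuArrays", version = v"2.0.0"))
Pkg.status()  # confirm which version resolved
```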

README.md (5 additions, 4 deletions)

```diff
@@ -23,10 +23,11 @@ arrays:
 
 ## Installation
 
-CuArrays should work **out-of-the-box** on Julia 1.0. You only need to have a
-proper set-up of CUDA, meaning the rest of the Julia CUDA stack should work
-(notably CUDAapi.jl, CUDAdrv.jl and CUDAnative.jl). If you encounter any issues
-with CuArrays.jl, please make sure those other packages are working as expected.
+CuArrays should work **out-of-the-box** on stable releases of Julia 1.x. You
+only need to have a proper set-up of CUDA, meaning the rest of the Julia CUDA
+stack should work (notably CUDAapi.jl, CUDAdrv.jl and CUDAnative.jl). If you
+encounter any issues with CuArrays.jl, please make sure those other packages are
+working as expected.
 
 Some parts of CuArrays.jl depend on **optional libraries**, such as
 [cuDNN](https://developer.nvidia.com/cudnn). The build process should notify
```
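Given the out-of-the-box claim above, a minimal smoke test is the quickest sanity check; this sketch assumes CUDA and the rest of the Julia CUDA stack are installed and working:

```julia
# Minimal smoke test; assumes a functional CUDA installation.
using CuArrays
a = CuArrays.ones(Float32, 2, 2)  # allocate and fill directly on the GPU
Array(a)                          # copy back to the host; expect a 2×2 matrix of 1.0f0
```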

deps/build.jl (12 additions, 2 deletions)

```diff
@@ -41,11 +41,21 @@ function main()
 
     toolkit = find_toolkit()
 
-    for name in ("cublas", "cusparse", "cusolver", "cufft", "curand", "cudnn")
+    # required libraries that are part of the CUDA toolkit
+    for name in ("cublas", "cusparse", "cusolver", "cufft", "curand")
         lib = Symbol("lib$name")
         config[lib] = find_cuda_library(name, toolkit)
         if config[lib] == nothing
-            build_warning("Could not find library '$name'")
+            build_error("Could not find library '$name' (it should be part of the CUDA toolkit)")
+        end
+    end
+
+    # optional libraries
+    for name in ("cudnn", )
+        lib = Symbol("lib$name")
+        config[lib] = find_cuda_library(name, toolkit)
+        if config[lib] == nothing
+            build_warning("Could not find optional library '$name'")
         end
     end
 
```
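With this split, a missing toolkit library now aborts the build, while a missing cuDNN only produces a warning and disables the corresponding functionality. If the optional library is installed later, rebuilding picks it up; nothing CuArrays-specific is assumed here beyond the standard Pkg workflow:

```julia
# After installing the missing optional library (e.g. cuDNN), re-run the build step.
using Pkg
Pkg.build("CuArrays")
```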

docs/src/tutorials/intro.jl (4 additions, 4 deletions)

```diff
@@ -114,8 +114,8 @@ using BenchmarkTools
 
 using CuArrays
 
-x_d = cufill(1.0f0, N)  # a vector stored on the GPU filled with 1.0 (Float32)
-y_d = cufill(2.0f0, N)  # a vector stored on the GPU filled with 2.0
+x_d = CuArrays.fill(1.0f0, N)  # a vector stored on the GPU filled with 1.0 (Float32)
+y_d = CuArrays.fill(2.0f0, N)  # a vector stored on the GPU filled with 2.0
 
 # Here the `d` means "device," in contrast with "host". Now let's do the increment:
 
@@ -220,8 +220,8 @@ CUDAdrv.@profile bench_gpu1!(y_d, x_d)
 
 # You can see that 100% of the time was spent in `ptxcall_gpu_add1__1`, the name of the
 # kernel that `CUDAnative` assigned when compiling `gpu_add1!` for these inputs. (Had you
-# created arrays of multiple data types, e.g., `xu_d = cufill(0x01, N)`, you might have
-# also seen `ptxcall_gpu_add1__2` and so on. Like the rest of Julia, you can define a
+# created arrays of multiple data types, e.g., `xu_d = CuArrays.fill(0x01, N)`, you might
+# have also seen `ptxcall_gpu_add1__2` and so on. Like the rest of Julia, you can define a
 # single method and it will be specialized at compile time for the particular data types
 # you're using.)
 
```
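The tutorial switches from the removed `cufill` export to the non-exported, module-qualified constructors. A short sketch of the renamed API; `N` is just an illustrative length:

```julia
using CuArrays

N = 2^10
x = CuArrays.fill(1.0f0, N)      # replaces cufill
z = CuArrays.zeros(Float32, N)   # replaces cuzeros
o = CuArrays.ones(N)             # replaces cuones; element type defaults to Float32
```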

src/CuArrays.jl (14 additions, 14 deletions)

```diff
@@ -1,12 +1,10 @@
-__precompile__()
-
 module CuArrays
 
 using CUDAdrv, CUDAnative
 
 using GPUArrays
 
-export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, cuzeros, cuones, cufill
+export CuArray, CuVector, CuMatrix, CuVecOrMat, cu
 
 import LinearAlgebra, SpecialFunctions
 
@@ -45,12 +43,12 @@ include("gpuarray_interface.jl")
 # of CuArrays and/or CUDAnative only use a single context), so keep track of the active one.
 const active_context = Ref{CuContext}()
 
-libcublas !== nothing && include("blas/CUBLAS.jl")
-libcusparse !== nothing && include("sparse/CUSPARSE.jl")
-libcusolver !== nothing && include("solver/CUSOLVER.jl")
-libcufft !== nothing && include("fft/CUFFT.jl")
-libcurand !== nothing && include("rand/CURAND.jl")
-libcudnn !== nothing && include("dnn/CUDNN.jl")
+include("blas/CUBLAS.jl")
+include("sparse/CUSPARSE.jl")
+include("solver/CUSOLVER.jl")
+include("fft/CUFFT.jl")
+include("rand/CURAND.jl")
+libcudnn !== nothing && include("dnn/CUDNN.jl")
 
 include("nnlib.jl")
 
@@ -84,11 +82,13 @@ function __init__()
         active_context[] = ctx
 
         # wipe the active handles
-        isdefined(CuArrays, :CUBLAS) && (CUBLAS._handle[] = C_NULL; CUBLAS._xt_handle[] = C_NULL)
-        isdefined(CuArrays, :CUSOLVER) && (CUSOLVER._dense_handle[] = C_NULL; CUSOLVER._sparse_handle[] = C_NULL)
-        isdefined(CuArrays, :CUSPARSE) && (CUSPARSE._handle[] = C_NULL)
-        isdefined(CuArrays, :CURAND) && (CURAND._generator[] = nothing)
-        isdefined(CuArrays, :CUDNN) && (CUDNN._handle[] = C_NULL)
+        CUBLAS._handle[] = C_NULL
+        CUBLAS._xt_handle[] = C_NULL
+        CUSOLVER._dense_handle[] = C_NULL
+        CUSOLVER._sparse_handle[] = C_NULL
+        CUSPARSE._handle[] = C_NULL
+        CURAND._generator[] = nothing
+        isdefined(CuArrays, :CUDNN) && (CUDNN._handle[] = C_NULL)
     end
     push!(CUDAnative.device!_listeners, callback)
 
```
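The handle reset runs from a listener that fires when CUDAnative switches devices, so the next library call lazily re-creates its handle for the new context. A hedged usage sketch, assuming at least one CUDA device and the CUBLAS-backed `norm` from src/blas/highlevel.jl:

```julia
using CUDAnative, CuArrays, LinearAlgebra

CUDAnative.device!(0)         # selecting a device fires the listener and wipes stale handles
x = CuArrays.fill(1f0, 1024)
norm(x)                       # the next CUBLAS call re-creates the handle for this context
```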

src/array.jl (63 additions, 6 deletions)

```diff
@@ -214,12 +214,12 @@ end
 cu(xs) = adapt(CuArray{Float32}, xs)
 Base.getindex(::typeof(cu), xs...) = CuArray([xs...])
 
-cuzeros(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 0)
-cuones(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 1)
-cuzeros(dims...) = cuzeros(Float32, dims...)
-cuones(dims...) = cuones(Float32, dims...)
-cufill(v, dims...) = fill!(CuArray{typeof(v)}(undef, dims...), v)
-cufill(v, dims::Dims) = fill!(CuArray{typeof(v)}(undef, dims...), v)
+zeros(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 0)
+ones(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 1)
+zeros(dims...) = CuArrays.zeros(Float32, dims...)
+ones(dims...) = CuArrays.ones(Float32, dims...)
+fill(v, dims...) = fill!(CuArray{typeof(v)}(undef, dims...), v)
+fill(v, dims::Dims) = fill!(CuArray{typeof(v)}(undef, dims...), v)
 
 # optimized implementation of `fill!` for types that are directly supported by memset
 const MemsetTypes = Dict(1=>UInt8, 2=>UInt16, 4=>UInt32)
@@ -270,3 +270,60 @@ function LinearAlgebra.triu!(A::CuMatrix{T}, d::Integer = 0) where T
     @cuda blocks=blk threads=thr kernel!(A, d)
     return A
 end
+
+
+## reversing
+
+function _reverse(input::CuVector{T}, output::CuVector{T}) where {T}
+    @assert length(input) == length(output)
+
+    nthreads = 256
+    nblocks = ceil(Int, length(input) / nthreads)
+    shmem = nthreads * sizeof(T)
+
+    function kernel(input::CuDeviceVector{T}, output::CuDeviceVector{T}) where {T}
+        shared = @cuDynamicSharedMem(T, blockDim().x)
+
+        # load one element per thread from device memory and buffer it in reversed order
+
+        offset_in = blockDim().x * (blockIdx().x - 1)
+        index_in = offset_in + threadIdx().x
+
+        if index_in <= length(input)
+            index_shared = blockDim().x - threadIdx().x + 1
+            @inbounds shared[index_shared] = input[index_in]
+        end
+
+        sync_threads()
+
+        # write back in forward order, but to the reversed block offset as before
+
+        offset_out = length(output) - blockDim().x * blockIdx().x
+        index_out = offset_out + threadIdx().x
+
+        if 1 <= index_out <= length(output)
+            index_shared = threadIdx().x
+            @inbounds output[index_out] = shared[index_shared]
+        end
+
+        return
+    end
+
+    @cuda threads=nthreads blocks=nblocks shmem=shmem kernel(input, output)
+
+    return
+end
+
+function Base.reverse!(v::CuVector, start=1, stop=length(v))
+    v′ = view(v, start:stop)
+    _reverse(v′, v′)
+    return v
+end
+
+function Base.reverse(v::CuVector, start=1, stop=length(v))
+    v′ = similar(v)
+    start > 1 && copyto!(v′, 1, v, 1, start-1)
+    _reverse(view(v, start:stop), view(v′, start:stop))
+    stop < length(v) && copyto!(v′, stop+1, v, stop+1)
+    return v′
+end
```
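A usage sketch for the new device-side reversal; the input below is illustrative and a working CUDA setup is assumed. Each block stages its elements in dynamic shared memory in reversed order and writes them back at the mirrored block offset, which is why the launch requests `shmem = nthreads * sizeof(T)`.

```julia
using CuArrays

v = cu(Float32.(1:10))   # 10-element CuVector{Float32}
w = reverse(v)           # out-of-place; dispatches to the Base.reverse method above
reverse!(v)              # in-place variant
Array(w)                 # 10.0f0, 9.0f0, ..., 1.0f0
```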

src/blas/highlevel.jl (2 additions, 2 deletions)

```diff
@@ -30,7 +30,7 @@ function LinearAlgebra.BLAS.dotc(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{
 end
 
 function LinearAlgebra.BLAS.dot(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64}
-    dotc(DX, DY)
+    BLAS.dotc(DX, DY)
 end
 
 function LinearAlgebra.BLAS.dotu(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64}
@@ -43,7 +43,7 @@ LinearAlgebra.norm(x::CublasArray) = nrm2(x)
 LinearAlgebra.BLAS.asum(x::CublasArray) = asum(length(x), x, 1)
 
 function LinearAlgebra.axpy!(alpha::Number, x::CuArray{T}, y::CuArray{T}) where T<:CublasFloat
-    length(x)==length(y) || throw(DimensionMismatch(""))
+    length(x)==length(y) || throw(DimensionMismatch("axpy arguments have lengths $(length(x)) and $(length(y))"))
     axpy!(length(x), convert(T,alpha), x, 1, y, 1)
 end
 
```
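A hedged sketch of the two affected level-1 routines; the inputs are illustrative:

```julia
using CuArrays, LinearAlgebra

x = CuArray(ComplexF32.(1:4))
y = CuArray(ComplexF32.(5:8))
LinearAlgebra.BLAS.dot(x, y)   # forwards to BLAS.dotc for complex element types

a = cu(ones(4)); b = cu(zeros(4))
axpy!(2f0, a, b)               # b .+= 2 .* a; length mismatches now raise a descriptive DimensionMismatch
```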

src/blas/wrappers.jl (4 additions, 4 deletions)

```diff
@@ -1473,7 +1473,7 @@ for (fname, elty) in
         unsafe_free!(Aptrs)
 
         if !Pivot
-            pivotArray = CuArray(zeros(Cint, (n, length(A))))
+            pivotArray = CuArrays.zeros(Cint, (n, length(A)))
         end
         pivotArray, info, A
     end
@@ -1513,7 +1513,7 @@ for (fname, elty) in
         ldc = max(1,stride(C[1],2))
         Aptrs = device_batch(A)
         Cptrs = device_batch(C)
-        info = CuArray(zeros(Cint,length(A)))
+        info = CuArrays.zeros(Cint,length(A))
         $fname(handle(), n, Aptrs, lda, pivotArray, Cptrs, ldc, info, length(A))
         unsafe_free!(Cptrs)
         unsafe_free!(Aptrs)
@@ -1552,7 +1552,7 @@ for (fname, elty) in
         ldc = max(1,stride(C[1],2))
         Aptrs = device_batch(A)
         Cptrs = device_batch(C)
-        info = CuArray(zeros(Cint,length(A)))
+        info = CuArrays.zeros(Cint,length(A))
         $fname(handle(), n, Aptrs, lda, Cptrs, ldc, info, length(A))
         unsafe_free!(Cptrs)
         unsafe_free!(Aptrs)
@@ -1638,7 +1638,7 @@ for (fname, elty) in
         Aptrs = device_batch(A)
         Cptrs = device_batch(C)
         info = zero(Cint)
-        infoarray = CuArray(zeros(Cint, length(A)))
+        infoarray = CuArrays.zeros(Cint, length(A))
         $fname(handle(), cutrans, m, n, nrhs, Aptrs, lda, Cptrs, ldc, [info], infoarray, length(A))
         unsafe_free!(Cptrs)
         unsafe_free!(Aptrs)
```
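These call sites now allocate their integer buffers directly on the device through the typed `CuArrays.zeros` introduced in src/array.jl, rather than filling a host `Array` and uploading it. A small illustrative comparison:

```julia
using CuArrays

info_new = CuArrays.zeros(Cint, 8)   # allocated and zero-filled on the device
info_old = CuArray(zeros(Cint, 8))   # previous pattern: host allocation, then upload
```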
