
Commit 349865a
Update KernelAbstractions to use CUDA 1.0

1 parent: cf8ec8e
18 files changed: +161 / -178 lines

Project.toml
Lines changed: 4 additions & 7 deletions

```diff
@@ -5,19 +5,16 @@ version = "0.2.6"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
-CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
-CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Cassette = "7057c7e9-c182-5462-911a-8362d720325c"
+LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [compat]
-Adapt = "0.4, 1.0"
-CUDAapi = "4.0"
-CUDAdrv = "6.3"
-CUDAnative = "3.0"
+Adapt = "0.4, 1.0, 2.0"
+CUDA = "1.0"
 Cassette = "0.3.2"
 MacroTools = "0.5"
 SpecialFunctions = "0.10"
```
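For downstream users, the practical effect of this manifest change is that the split GPU stack (CUDAapi, CUDAdrv, CUDAnative, plus CuArrays in the examples) is replaced by the single CUDA package. A minimal sketch of the corresponding environment update; the Pkg calls are standard, but running them is an assumption about user setups, not part of this commit:

```julia
using Pkg

# Hypothetical migration for a user environment: remove the split CUDA
# stack and add the merged CUDA.jl package this commit depends on.
foreach(Pkg.rm, ["CUDAapi", "CUDAdrv", "CUDAnative", "CuArrays"])
Pkg.add(Pkg.PackageSpec(name = "CUDA", version = "1.0"))
```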

docs/src/index.md
Lines changed: 2 additions & 2 deletions

```diff
@@ -55,11 +55,11 @@ all(A .== 2.0)
 1. Functions inside kernels are forcefully inlined, except when marked with `@noinline`.
 2. Floating-point multiplication, addition, subtraction are marked contractable.
 
-## Important differences to CUDAnative
+## Important differences to CUDA
 
 1. The kernels are automatically bounds-checked against either the dynamic or statically
    provided `ndrange`.
-2. Functions like `Base.sin` are mapped to `CUDAnative.sin`.
+2. Functions like `Base.sin` are mapped to `CUDA.sin`.
 
 ## Important differences to GPUifyLoops
 
```
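Point 2 above is the user-facing side of the rename: kernels are written against plain `Base` math, and the GPU path substitutes device intrinsics. A minimal sketch using the `@kernel` API from this repo; the kernel name is hypothetical:

```julia
using KernelAbstractions

# Hypothetical kernel: `sin` is spelled as Base.sin, and on the CUDA
# backend the Cassette pass rewrites it to CUDA.sin as documented above.
@kernel function sin_kernel!(A, @Const(B))
    I = @index(Global)
    @inbounds A[I] = sin(B[I])
end
```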
examples/matmul.jl
Lines changed: 5 additions & 5 deletions

```diff
@@ -1,7 +1,7 @@
-using KernelAbstractions, Test, CUDAapi
-if CUDAapi.has_cuda_gpu()
-    using CuArrays
-    CuArrays.allowscalar(false)
+using KernelAbstractions, Test, CUDA
+
+if has_cuda_gpu()
+    CUDA.allowscalar(false)
 end
 
 # Simple kernel for matrix multiplication
@@ -26,7 +26,7 @@ function matmul!(a, b, c)
     if isa(a, Array)
         kernel! = matmul_kernel!(CPU(),4)
     else
-        kernel! = matmul_kernel!(CUDA(),256)
+        kernel! = matmul_kernel!(CUDAGPU(),256)
     end
     kernel!(a, b, c, ndrange=size(c))
 end
```
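Calling the updated example is unchanged apart from the device name. A sketch of the CPU path, assuming (as in the other examples here) that the kernel launch returns an event; the array sizes are arbitrary:

```julia
using KernelAbstractions, Test

a = rand(Float32, 64, 32)
b = rand(Float32, 32, 64)
c = zeros(Float32, 64, 64)

# matmul! selects CPU() for Arrays and CUDAGPU() otherwise; wait on the
# returned event before reading c.
event = matmul!(a, b, c)
wait(event)
@test c ≈ a * b
```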

examples/memcopy.jl
Lines changed: 3 additions & 4 deletions

```diff
@@ -1,5 +1,5 @@
 using KernelAbstractions
-using CUDAapi
+using CUDA
 using Test
 
 @kernel function copy_kernel!(A, @Const(B))
@@ -21,15 +21,14 @@ wait(event)
 
 
 if has_cuda_gpu()
-    using CuArrays
 
     function mycopy!(A::CuArray, B::CuArray)
         @assert size(A) == size(B)
-        copy_kernel!(CUDA(), 256)(A, B, ndrange=length(A))
+        copy_kernel!(CUDAGPU(), 256)(A, B, ndrange=length(A))
    end
 
     A = CuArray{Float32}(undef, 1024)
-    B = CuArrays.ones(Float32, 1024)
+    B = CUDA.ones(Float32, 1024)
     event = mycopy!(A, B)
     wait(event)
     @test A == B
```
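The hunk preserves the two-step launch pattern used throughout these examples; spelled out with the new device name (a restatement of the code above, not new API):

```julia
# 1. Instantiate the kernel for a device and a workgroup size.
kernel! = copy_kernel!(CUDAGPU(), 256)

# 2. Launch over the dynamic ndrange; the call returns an event.
event = kernel!(A, B, ndrange = length(A))
wait(event)
```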

examples/memcopy_static.jl
Lines changed: 3 additions & 4 deletions

```diff
@@ -1,5 +1,5 @@
 using KernelAbstractions
-using CUDAapi
+using CUDA
 using Test
 
 @kernel function copy_kernel!(A, @Const(B))
@@ -20,16 +20,15 @@ wait(event)
 @test A == B
 
 if has_cuda_gpu()
-    using CuArrays
 
     function mycopy_static!(A::CuArray, B::CuArray)
         @assert size(A) == size(B)
-        kernel = copy_kernel!(CUDA(), 32, size(A)) # if size(A) varies this will cause recompilation
+        kernel = copy_kernel!(CUDAGPU(), 32, size(A)) # if size(A) varies this will cause recompilation
         kernel(A, B, ndrange=size(A))
     end
 
     A = CuArray{Float32}(undef, 1024)
-    B = CuArrays.ones(Float32, 1024)
+    B = CUDA.ones(Float32, 1024)
     event = mycopy_static!(A, B)
     wait(event)
     @test A == B
```
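As the inline comment warns, the static `ndrange` is baked in at construction time, so each distinct `size(A)` triggers a recompile. One hypothetical way to amortize that is to cache one kernel per shape; the cache below is an illustration, not part of this commit:

```julia
# Hypothetical per-shape cache for the static-ndrange kernel.
const KERNEL_CACHE = Dict{Dims, Any}()

function cached_copy_kernel!(A::CuArray)
    get!(KERNEL_CACHE, size(A)) do
        copy_kernel!(CUDAGPU(), 32, size(A))
    end
end
```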

examples/mpi.jl
Lines changed: 7 additions & 6 deletions

```diff
@@ -1,15 +1,16 @@
 # EXCLUDE FROM TESTING
 using KernelAbstractions
-using CUDAapi
-if CUDAapi.has_cuda_gpu()
-    using CuArrays
-    CuArrays.allowscalar(false)
+using CUDA
+
+if has_cuda_gpu()
+    CUDA.allowscalar(false)
 else
     exit()
 end
+
 using MPI
 
-device(A) = typeof(A) <: Array ? CPU() : CUDA()
+device(A) = typeof(A) <: Array ? CPU() : CUDAGPU()
 
 function mpiyield()
     MPI.Iprobe(MPI.MPI_ANY_SOURCE, MPI.MPI_ANY_TAG, MPI.COMM_WORLD)
@@ -62,7 +63,7 @@ function main()
     T = Int64
     M = 10
 
-    d_recv_buf = CuArrays.zeros(T, M)
+    d_recv_buf = CUDA.zeros(T, M)
     h_send_buf = zeros(T, M)
     h_recv_buf = zeros(T, M)
 
```
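The `device(A)` helper keeps the rest of the MPI example backend-agnostic: the launch site derives the device from the buffer's type. A sketch; `pack_kernel!` is hypothetical:

```julia
# Hypothetical launch that works for host and device buffers alike:
# device(A) yields CPU() for an Array and CUDAGPU() for a CuArray.
kernel! = pack_kernel!(device(d_recv_buf), 256)
event = kernel!(d_recv_buf, ndrange = length(d_recv_buf))
wait(event)
```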
examples/naive_transpose.jl
Lines changed: 6 additions & 6 deletions

```diff
@@ -1,7 +1,7 @@
-using KernelAbstractions, Test, CUDAapi
-if CUDAapi.has_cuda_gpu()
-    using CuArrays
-    CuArrays.allowscalar(false)
+using KernelAbstractions, Test, CUDA
+
+if has_cuda_gpu()
+    CUDA.allowscalar(false)
 end
 
 @kernel function naive_transpose_kernel!(a, b)
@@ -19,7 +19,7 @@ function naive_transpose!(a, b)
     if isa(a, Array)
         kernel! = naive_transpose_kernel!(CPU(),4)
     else
-        kernel! = naive_transpose_kernel!(CUDA(),256)
+        kernel! = naive_transpose_kernel!(CUDAGPU(),256)
     end
     kernel!(a, b, ndrange=size(a))
 end
@@ -39,7 +39,7 @@ wait(event)
 # beginning GPU tests
 if has_cuda_gpu()
     d_a = CuArray(a)
-    d_b = CuArrays.zeros(Float32, res, res)
+    d_b = CUDA.zeros(Float32, res, res)
 
     ev = naive_transpose!(d_a, d_b)
     wait(ev)
```
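The GPU branch mirrors the CPU test earlier in the file; a sketch of the check that plausibly follows the waited event (`a` and `d_b` come from the hunk context, the comparison itself is an assumption about the rest of the file):

```julia
# Hypothetical tail of the GPU test: copy the result back to the host
# and compare against the CPU transpose.
@test Array(d_b) == transpose(a)
```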

examples/performance.jl
Lines changed: 8 additions & 13 deletions

```diff
@@ -1,12 +1,7 @@
 using KernelAbstractions
-using CUDAapi
+using CUDA
 
-CUDAapi.has_cuda_gpu() || exit()
-
-using CuArrays
-using CUDAdrv
-using CUDAnative
-using CUDAnative.NVTX
+has_cuda_gpu() || exit()
 
 @kernel function transpose_kernel_naive!(b, a)
     i, j = @index(Global, NTuple)
@@ -24,12 +19,12 @@ const nreps = 1
 NVTX.@range "Naive transpose ($block_dim, $block_dim)" let
     a = CuArray(rand(T, shape))
     b = similar(a, shape[2], shape[1])
-    kernel! = transpose_kernel_naive!(CUDA(), (block_dim, block_dim), size(b))
+    kernel! = transpose_kernel_naive!(CUDAGPU(), (block_dim, block_dim), size(b))
 
     event = kernel!(b, a)
     wait(event)
     @assert Array(b) == Array(a)'
-    @CUDAdrv.profile begin
+    CUDA.@profile begin
         for rep in 1:nreps
             event = kernel!(b, a, dependencies=(event,))
         end
@@ -40,12 +35,12 @@ end
 NVTX.@range "Naive transpose ($(block_dim^2), 1)" let
     a = CuArray(rand(T, shape))
     b = similar(a, shape[2], shape[1])
-    kernel! = transpose_kernel_naive!(CUDA(), (block_dim*block_dim, 1), size(b))
+    kernel! = transpose_kernel_naive!(CUDAGPU(), (block_dim*block_dim, 1), size(b))
 
     event = kernel!(b, a)
     wait(event)
     @assert Array(b) == Array(a)'
-    @CUDAdrv.profile begin
+    CUDA.@profile begin
         for rep in 1:nreps
             event = kernel!(b, a, dependencies=(event,))
         end
@@ -56,12 +51,12 @@ end
 NVTX.@range "Naive transpose (1, $(block_dim^2))" let
     a = CuArray(rand(T, shape))
     b = similar(a, shape[2], shape[1])
-    kernel! = transpose_kernel_naive!(CUDA(), (1, block_dim*block_dim), size(b))
+    kernel! = transpose_kernel_naive!(CUDAGPU(), (1, block_dim*block_dim), size(b))
 
     event = kernel!(b, a)
     wait(event)
     @assert Array(b) == Array(a)'
-    @CUDAdrv.profile begin
+    CUDA.@profile begin
         for rep in 1:nreps
             event = kernel!(b, a, dependencies=(event,))
         end
```
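One subtlety of the import cleanup: the old `using CUDAnative.NVTX` is deleted, yet the hunks still call `NVTX.@range` unqualified, which assumes `NVTX` is reachable after `using CUDA`. If it were not exported, an explicit binding would restore it; this is an assumption about CUDA.jl's module layout, not something shown in the diff:

```julia
using CUDA

# Assumption: CUDA.jl carries the NVTX wrapper as a submodule; bind it
# explicitly so the NVTX.@range annotations keep working.
const NVTX = CUDA.NVTX
```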

src/KernelAbstractions.jl
Lines changed: 2 additions & 2 deletions

```diff
@@ -2,7 +2,7 @@ module KernelAbstractions
 
 export @kernel
 export @Const, @localmem, @private, @uniform, @synchronize, @index, groupsize, @print
-export Device, GPU, CPU, CUDA, Event, MultiEvent, NoneEvent
+export Device, GPU, CPU, CUDAGPU, Event, MultiEvent, NoneEvent
 export async_copy!
 
 
@@ -303,7 +303,7 @@ abstract type Device end
 abstract type GPU <: Device end
 
 struct CPU <: Device end
-struct CUDA <: GPU end
+struct CUDAGPU <: GPU end
 # struct AMD <: GPU end
 # struct Intel <: GPU end
 
```
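The rename is presumably what makes the rest of this commit possible: the package now does `using CUDA`, so an exported `CUDA` device type would clash with the package name itself. Downstream device dispatch changes only in the type name; a minimal sketch, with hypothetical helper names:

```julia
using KernelAbstractions

# Hypothetical backend selection mirroring the examples in this commit:
# dispatch on the array type, with CUDAGPU replacing the old CUDA device.
backend(::Array) = CPU()
backend(A) = CUDAGPU()   # assume anything else is a device array here

launch!(A) = some_kernel!(backend(A), 256)(A, ndrange = length(A))  # some_kernel! is hypothetical
```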
