JuliaGPU
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
Lines changed: 2 additions & 22 deletions
diff --git a/.ci/test.jl b/.ci/test.jl
Lines changed: 3 additions & 1 deletion
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/ci-ka-cuda.yml b/.github/workflows/ci-ka-cuda.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/ci-ka-rocm.yml b/.github/workflows/ci-ka-rocm.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/ci-ka.yml b/.github/workflows/ci-ka.yml
Lines changed: 2 additions & 2 deletions
diff --git a/Project.toml b/Project.toml
Lines changed: 2 additions & 6 deletions
diff --git a/lib/CUDAKernels/Project.toml b/lib/CUDAKernels/Project.toml
Lines changed: 4 additions & 8 deletions
diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
Lines changed: 14 additions & 81 deletions
diff --git a/lib/KernelGradients/Project.toml b/lib/KernelGradients/Project.toml
Lines changed: 2 additions & 7 deletions
@@ -19,26 +19,6 @@ steps:
       KERNELABSTRACTIONS_TEST_BACKEND: "CUDA"
     timeout_in_minutes: 60
 
-  - label: "CUDA Julia 1.6"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.6"
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-          dirs:
-            - src
-            - lib
-    commands:
-      - julia .ci/develop.jl
-      - julia .ci/test.jl
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    env:
-      JULIA_CUDA_USE_BINARYBUILDER: "true"
-      KERNELABSTRACTIONS_TEST_BACKEND: "CUDA"
-    timeout_in_minutes: 60
-
   - label: "CUDA Julia 1.7"
     plugins:
       - JuliaCI/julia#v1:
@@ -79,10 +59,10 @@ steps:
       KERNELABSTRACTIONS_TEST_BACKEND: "CUDA"
     timeout_in_minutes: 60
 
-  - label: "ROCm Julia 1.6"
+  - label: "ROCm Julia 1.7"
     plugins:
       - JuliaCI/julia#v1:
-          version: "1.6"
+          version: "1.7"
       - JuliaCI/julia-coverage#v1:
           codecov: true
           dirs:
 
@@ -12,6 +12,8 @@ end
 if !CI || BACKEND == "CUDA"
     push!(pkgs, "CUDAKernels")
 end
-# push!(pkgs, "KernelGradients")
+if !CI || haskey(ENV, "TEST_KERNELGRADIENTS")
+    push!(pkgs, "KernelGradients")
+end
 
 Pkg.test(pkgs; coverage = true)
@@ -8,7 +8,7 @@ jobs:
     steps:
       - uses: julia-actions/setup-julia@latest
         with:
-          version: 1.3
+          version: 1.6
       - name: Pkg.add("CompatHelper")
         run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
       - name: CompatHelper.main()
 
@@ -22,7 +22,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
+          - '1.7'
           - '1' # automatically expands to the latest stable 1.x release of Julia.
           - 'nightly'
         os:
 
@@ -22,7 +22,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
+          - '1.7'
         os:
           - ubuntu-latest
           - macOS-latest
 
@@ -22,7 +22,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
+          - '1.7'
           - '1' # automatically expands to the latest stable 1.x release of Julia.
           - 'nightly'
         os:
@@ -76,7 +76,7 @@ jobs:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1
         with:
-          version: 'nightly'
+          version: '1'
       - run: julia .ci/add-general-registry.jl
         env:
           JULIA_PKG_SERVER: ""
 
@@ -1,21 +1,17 @@
 name = "KernelAbstractions"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 authors = ["Valentin Churavy <v.churavy@gmail.com>"]
-version = "0.7.2"
+version = "0.8.0-dev"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-Cassette = "7057c7e9-c182-5462-911a-8362d720325c"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
-SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
 [compat]
 Adapt = "0.4, 1.0, 2.0, 3.0"
-Cassette = "0.3.3"
 MacroTools = "0.5"
-SpecialFunctions = "0.10, 1.0, 2.0"
 StaticArrays = "0.12, 1.0"
-julia = "1.6"
+julia = "1.7"
@@ -1,21 +1,17 @@
 name = "CUDAKernels"
 uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 authors = ["Valentin Churavy <v.churavy@gmail.com>"]
-version = "0.3.3"
+version = "0.4.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-Cassette = "7057c7e9-c182-5462-911a-8362d720325c"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [compat]
 Adapt = "3.0"
-CUDA = "3.5"
-Cassette = "0.3.3"
-KernelAbstractions = "0.7"
-SpecialFunctions = "0.10, 1.0, 2.0"
+CUDA = "3.8.2"
+KernelAbstractions = "0.8"
 StaticArrays = "0.12, 1.0"
-julia = "1.6"
+julia = "1.7"
@@ -1,10 +1,8 @@
 module CUDAKernels
 
 import CUDA
-import SpecialFunctions
 import StaticArrays
 import StaticArrays: MArray
-import Cassette
 import Adapt
 import KernelAbstractions
 
@@ -191,7 +189,7 @@ function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(
     ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)
     # this might not be the final context, since we may tune the workgroupsize
     ctx = mkcontext(obj, ndrange, iterspace)
-    kernel = CUDA.@cuda launch=false name=String(nameof(obj.f)) Cassette.overdub(CUDACTX, obj.f, ctx, args...)
+    kernel = CUDA.@cuda launch=false obj.f(ctx, args...)
 
     # figure out the optimal workgroupsize automatically
     if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing
@@ -220,52 +218,49 @@ function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(
 
     # Launch kernel
     event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
-    kernel(CUDACTX, obj.f, ctx, args...; threads=threads, blocks=nblocks, stream=stream)
+    kernel(ctx, args...; threads=threads, blocks=nblocks, stream=stream)
 
     CUDA.record(event, stream)
     return CudaEvent(event)
 end
 
-Cassette.@context CUDACtx
+import CUDA: @device_override
 
 import KernelAbstractions: CompilerMetadata, CompilerPass, DynamicCheck, LinearIndices
 import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
 import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
 
-const CUDACTX = Cassette.disablehooks(CUDACtx(pass = CompilerPass))
-KernelAbstractions.cassette(::Kernel{CUDADevice}) = CUDACTX
-
 function mkcontext(kernel::Kernel{CUDADevice}, _ndrange, iterspace)
     CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace)
 end
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Local_Linear), ctx)
+@device_override @inline function __index_Local_Linear(ctx)
     return CUDA.threadIdx().x
 end
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Group_Linear), ctx)
+@device_override @inline function __index_Group_Linear(ctx)
     return CUDA.blockIdx().x
 end
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Global_Linear), ctx)
+@device_override @inline function __index_Global_Linear(ctx)
     I =  @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x)
     # TODO: This is unfortunate, can we get the linear index cheaper
     @inbounds LinearIndices(__ndrange(ctx))[I]
 end
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Local_Cartesian), ctx)
+@device_override @inline function __index_Local_Cartesian(ctx)
     @inbounds workitems(__iterspace(ctx))[CUDA.threadIdx().x]
 end
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Group_Cartesian), ctx)
+@device_override @inline function __index_Group_Cartesian(ctx)
     @inbounds blocks(__iterspace(ctx))[CUDA.blockIdx().x]
 end
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Global_Cartesian), ctx)
+@device_override @inline function __index_Global_Cartesian(ctx)
     return @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x)
 end
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(__validindex), ctx)
+@device_override @inline function __validindex(ctx)
     if __dynamic_checkbounds(ctx)
         I = @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x)
         return I in __ndrange(ctx)
@@ -276,43 +271,6 @@ end
 
 import KernelAbstractions: groupsize, __groupsize, __workitems_iterspace, add_float_contract, sub_float_contract, mul_float_contract
 
-KernelAbstractions.generate_overdubs(@__MODULE__, CUDACtx)
-
-###
-# CUDA specific method rewrites
-###
-
-@inline Cassette.overdub(::CUDACtx, ::typeof(^), x::Float64, y::Float64) = ^(x, y)
-@inline Cassette.overdub(::CUDACtx, ::typeof(^), x::Float32, y::Float32) = ^(x, y)
-@inline Cassette.overdub(::CUDACtx, ::typeof(^), x::Float64, y::Int32)   = ^(x, y)
-@inline Cassette.overdub(::CUDACtx, ::typeof(^), x::Float32, y::Int32)   = ^(x, y)
-@inline Cassette.overdub(::CUDACtx, ::typeof(^), x::Union{Float32, Float64}, y::Int64) = ^(x, y)
-
-# libdevice.jl
-const cudafuns = (:cos, :cospi, :sin, :sinpi, :tan,
-          :acos, :asin, :atan,
-          :cosh, :sinh, :tanh,
-          :acosh, :asinh, :atanh,
-          :log, :log10, :log1p, :log2,
-          :exp, :exp2, :exp10, :expm1, :ldexp,
-          # :isfinite, :isinf, :isnan, :signbit,
-          :abs,
-          :sqrt, :cbrt,
-          :ceil, :floor,)
-for f in cudafuns
-    @eval function Cassette.overdub(ctx::CUDACtx, ::typeof(Base.$f), x::Union{Float32, Float64})
-        @Base._inline_meta
-        return Base.$f(x)
-    end
-end
-
-@inline Cassette.overdub(::CUDACtx, ::typeof(sincos), x::Union{Float32, Float64}) = (Base.sin(x), Base.cos(x))
-@inline Cassette.overdub(::CUDACtx, ::typeof(exp), x::Union{ComplexF32, ComplexF64}) = Base.exp(x)
-
-@inline Cassette.overdub(::CUDACtx, ::typeof(SpecialFunctions.gamma), x::Union{Float32, Float64}) = CUDA.tgamma(x)
-@inline Cassette.overdub(::CUDACtx, ::typeof(SpecialFunctions.erf), x::Union{Float32, Float64}) = SpecialFunctions.erf(x)
-@inline Cassette.overdub(::CUDACtx, ::typeof(SpecialFunctions.erfc), x::Union{Float32, Float64}) = SpecialFunctions.erfc(x)
-
 @static if Base.isbindingresolved(CUDA, :emit_shmem) && Base.isdefined(CUDA, :emit_shmem)
     const emit_shmem = CUDA.emit_shmem
 else
@@ -325,7 +283,7 @@ import KernelAbstractions: ConstAdaptor, SharedMemory, Scratchpad, __synchronize
 # GPU implementation of shared memory
 ###
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(SharedMemory), ::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+@device_override @inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
     ptr = emit_shmem(T, Val(prod(Dims)))
     CUDA.CuDeviceArray(Dims, ptr)
 end
@@ -335,15 +293,15 @@ end
 # - private memory for each workitem
 ###
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(Scratchpad), ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
+@device_override @inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
     MArray{__size(Dims), T}(undef)
 end
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(__synchronize))
+@device_override @inline function __synchronize()
     CUDA.sync_threads()
 end
 
-@inline function Cassette.overdub(::CUDACtx, ::typeof(__print), args...)
+@device_override @inline function __print(args...)
     CUDA._cuprint(args...)
 end
 
@@ -356,29 +314,4 @@ Adapt.adapt_storage(to::ConstAdaptor, a::CUDA.CuDeviceArray) = Base.Experimental
 # Argument conversion
 KernelAbstractions.argconvert(k::Kernel{CUDADevice}, arg) = CUDA.cudaconvert(arg)
 
-# Cassette.jl#195
-# Device intrinsics are inferred in a different World (1.6) or using MethodOverlay tables (1.7)
-# Cassette sees neither of them and thus overdubbing them fails.
-@inline function Cassette.overdub(::CUDACtx, ::typeof(CUDA.arrayref), args...)
-    CUDA.arrayref(args...)
-end
-@inline function Cassette.overdub(::CUDACtx, ::typeof(CUDA.arrayset), args...)
-    CUDA.arrayset(args...)
-end
-@inline function Cassette.overdub(::CUDACtx, ::typeof(CUDA.const_arrayref), args...)
-    CUDA.const_arrayref(args...)
-end
-@inline function Cassette.overdub(::CUDACtx, ::typeof(CUDA.logb), args...)
-    CUDA.logb(args...)
-end
-# @inline function Cassette.overdub(::CUDACtx, ::typeof(CUDA.tgamma), args...)
-#     CUDA.tgamma(args...)
-# end
-@inline function Cassette.overdub(::CUDACtx, ::typeof(CUDA.compute_capability), args...)
-    CUDA.compute_capability(args...)
-end
-@inline function Cassette.overdub(::CUDACtx, ::typeof(CUDA.ptx_isa_version), args...)
-    CUDA.ptx_isa_version(args...)
-end
-
 end
@@ -4,15 +4,10 @@ authors = ["Valentin Churavy <v.churavy@gmail.com>"]
 version = "0.1.0"
 
 [deps]
-Cassette = "7057c7e9-c182-5462-911a-8362d720325c"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-Requires = "ae029012-a4dd-5104-9daa-d747884805df"
-
 
 [compat]
-Cassette = "0.3"
-KernelAbstractions = "0.7"
-Requires = "1.1"
 Enzyme = "0.7"
-julia = "1.6"
+KernelAbstractions = "0.8"
+julia = "1.7"