Create a CUDA context #406
@@ -0,0 +1,146 @@
using IRTools: isexpr, IR, @dynamo, postwalk

using IRTools: meta, Pipe, finish, Variable, self
using MacroTools: @forward

import Base.Broadcast.broadcasted
import Base.Broadcast.materialize
import Base.Broadcast.Broadcasted

# TODO use a WeakKeyDict
struct CUDACtx
  array_bank::IdDict{Array,CuArray}
end

CUDACtx() = CUDACtx(IdDict{Array,CuArray}())

# Display fns for debugging, remove before committing
function Base.summary(io::IO, cx::CUDACtx)
  print(io, "IR Context for CUDA ")
  summary(io, cx.array_bank)
end

function Base.show(io::IO, cx::CUDACtx)
  print(io, "IR Context for CUDA ")
  display(cx.array_bank)
end

@forward CUDACtx.array_bank Base.getindex, Base.iterate,
        Base.setindex!, Base.empty!,
        Base.length, Base.get!,
        Base.first, Base.last, Base.haskey

function _resize!(a::Array, sz::NTuple{<:Any,Integer})
  ccall(:jl_array_grow_end, Cvoid, (Any, UInt), a, prod(sz))
  ptr = convert(Ptr{Csize_t}, pointer_from_objref(a))
  for i = 1:length(sz)
    unsafe_store!(ptr+8*(i+2), sz[i])
  end
  return a
end

function refill!(a::Array, b::CuArray)
  _resize!(a, size(b))
  copy!(a, b)
end

function cache(cx, x::CuArray{T,N})::Array{T,N} where {T,N}
  cpu = Array{T,N}(undef, ntuple(_->0,N))
  cx[cpu] = x
  return cpu
end
cache(cx, f) = f

for f in (:+, :-, :*, :/)
  @eval function (cx::CUDACtx)(::typeof($f), a::AbstractArray, b::AbstractArray)
    ga = get_cached(cx, a)
    gb = get_cached(cx, b)
    cache(cx, $f(ga, gb))
  end
end

function get_cached(cx::CUDACtx, arr::Array{T,N})::CuArray{T,N} where {T,N}
  get!(cx, arr, CuArray(arr))
end
get_cached(cx::CUDACtx, x) = x

function (cx::CUDACtx)(::typeof(broadcasted), f, args...)
  gargs = map(x -> get_cached(cx, x), args)
  broadcasted(f, gargs...) |> x -> cache(cx, x)
end

function (cx::CUDACtx)(::typeof(broadcast), f, args...)
  gargs = map(x -> get_cached(cx, x), args)
  broadcast(f, gargs...) |> x -> cache(cx, x)
end

function wrap_cuize(f)
  @eval function (cx::CUDACtx)(::typeof($f), args...)
    gargs = map(x -> get_cached(cx, x), args)
    cache(cx, $f(gargs...))
  end
end

wrap_cuize.((sum, similar, materialize))

function (cx::CUDACtx)(::typeof(reshape), arr, args...)
  r = reshape(get_cached(cx, arr), args...)
  cache(cx, r)
end

@dynamo function (cx::CUDACtx)(meta...)
  ir = IR(meta...)
  ir == nothing && return

  pr = Pipe(ir)
Review comment: You could replace this code with … (see the sketch after this `@dynamo` block).
  for (v,st) in pr
    isexpr(st.expr, :call) || continue
    ex = st.expr

    pr[v] = Expr(:call, self, ex.args...)
  end
  return finish(pr)
end

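The comment above is cut off; a minimal sketch of one plausible simplification, assuming (this is a guess, not confirmed by the thread) that it refers to IRTools' `recurse!` helper, which performs the same call-to-`self` rewrite as the manual `Pipe` loop:

```julia
# Sketch only: replaces the hand-written Pipe loop with IRTools.recurse!,
# which rewrites every call in the IR into a call through `self`.
import IRTools
using IRTools: @dynamo, IR

@dynamo function (cx::CUDACtx)(meta...)
  ir = IR(meta...)
  ir === nothing && return
  IRTools.recurse!(ir)   # equivalent to `pr[v] = Expr(:call, self, ex.args...)` per call
  return ir
end
```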
""" | ||
Disable `CUDACtx` for a function | ||
""" | ||
function noop_pass(f) | ||
@eval (c::CUDACtx)(::typeof($f), args...) = $f(args...) | ||
end | ||
|
||
noop_pass.((get_cached, NNlib.check_spdf,
          ))

Review comment: Probably best if these are macros. It'd be nice to add an …
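A rough sketch of the macro form suggested in the comment above; the name `@noop` is hypothetical and not part of this PR:

```julia
# Hypothetical macro version of noop_pass: defines the same pass-through
# method at macro-expansion time instead of via runtime @eval.
macro noop(f)
  quote
    (c::CUDACtx)(::typeof($(esc(f))), args...) = $(esc(f))(args...)
  end
end

# Usage sketch:
# @noop get_cached
# @noop NNlib.check_spdf
```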
for f in names(NNlib)
  getfield(NNlib, f) isa Function || continue
  @eval function (cx::CUDACtx)(::typeof($f), args...)
    gargs = map(x -> get_cached(cx, x), args)
    cache(cx, $f(gargs...))
  end
end

Review comment: Better to do this explicitly per function.
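A sketch of the explicit per-function alternative the reviewer asks for, reusing `wrap_cuize` from above; the particular list of NNlib functions is illustrative only:

```julia
# Wrap a hand-picked set of NNlib functions rather than everything in
# names(NNlib); the selection below is an example, not the PR's final list.
wrap_cuize.((NNlib.conv, NNlib.softmax, NNlib.maxpool, NNlib.meanpool))
```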
# Hold all the arrays related to the op
# BitArray and friends would like an AbstractArray construct

""" | ||
Creates a `cuda` context within which we travel | ||
through the entire callstack to find matrix/vector | ||
operations and try to offload them to a GPU. | ||
|
||
Example: | ||
``` | ||
cuda() do | ||
# do something | ||
end | ||
``` | ||
""" | ||
function cuda(f, ctx = CUDACtx()) | ||
out = ctx(f) | ||
for (x, cx) in ctx | ||
length(x) == length(cx) && continue | ||
refill!(x, cx) | ||
end | ||
empty!(ctx) | ||
return out | ||
end |
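A short usage sketch of the API defined above, mirroring the tests below (assumes CuArrays and this file are loaded):

```julia
# Inside the block, calls are intercepted by CUDACtx: Array arguments are
# uploaded via get_cached, and results come back to the caller as plain Arrays.
W, b = rand(5, 5), rand(5)

y = cuda() do
  W * b
end

y isa Array   # true — outputs are CPU arrays again
y ≈ W * b     # true — same result as running on the CPU
```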
@@ -0,0 +1,46 @@
using CuArrays, Test
using CuArrays.NNlib

# Check simple ops work and broadcast
@testset "simple ops" begin
  W = rand(5, 5)
  b = rand(5)
  @test cuda(() -> W*b) ≈ W*b

Review comment: Good to check types here as well, e.g. that the output is still an `Array` (a sketch of such checks follows after this testset).

Review comment: Would it be worthwhile to have a way to switch off emptying the context? I'd like to be able to say if Arrays were in fact also allocated on the GPU; and a crude way might be to check the types in the context dict after the fact.

  a = rand(10)
  b = rand(10)

  r = cuda() do
    a + b
  end
  @test r isa Array

  r = cuda() do
    a .+ b
  end
  @test r isa Array
end

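A sketch of the extra checks the comments above ask for: asserting that the element type survives the round trip, and passing an explicit context. Note that `cuda` currently empties the context before returning, so inspecting what was uploaded would need a (hypothetical) option to keep it alive:

```julia
# Illustrative checks only, not part of the PR.
a, b = rand(10), rand(10)

r = cuda() do
  a .+ b
end
@test r isa Array{Float64}   # still a CPU Array with the original eltype
@test !(r isa CuArray)

# With an explicit context: it is emptied before `cuda` returns, so the
# array_bank is empty afterwards; checking GPU allocation would need more.
ctx = CUDACtx()
cuda(() -> a .+ b, ctx)
@test isempty(ctx.array_bank)
```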
# Check that functions happen
@testset "linear" begin
  linear(x, W, b) = (x * W) .+ b
  w = rand(10, 10)
  b = zeros(10)
  x = rand(10, 10)
  r = cuda() do
    linear(x, w, b)
  end
  @test r isa Array{Float64}
end

# check that NNlib is wrapped correctly
@testset "conv Context" begin
  w = rand(Float32, 3, 3, 3, 16)
  r = rand(Float32, 32, 32, 3, 1)
  c = cuda() do
    conv(r, w)
  end
  g = conv(r, w)
  @test c ≈ g
  @test c isa Array
end
Review comment: `cuda` seems like a pretty generic function to export (both `cu` and `cuda` are bound to confuse users). Why not something that implies its action, e.g. `on_cuda`? Or `@cuda`, along the lines of `@async`?

Review comment: I agree about not exporting this for now. In the longer term, if this is successful it should replace `cu` entirely (alongside all the other APIs, for most users), so a generic name seems appropriate. I think `cuda() do ...` reads right, and provides an obvious space for options (`cuda(device=2) do ...`), but `@cuda` could work well too (especially in that it's a bit nicer for one-liners).
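For reference, the macro spelling floated here could be as small as the sketch below. The name `@on_cuda` is made up for illustration (note that `@cuda` is already CUDAnative's kernel-launch macro):

```julia
# Hypothetical one-liner macro over the do-block API; illustrative only.
macro on_cuda(ex)
  :(cuda(() -> $(esc(ex))))
end

# Usage sketch:
# @on_cuda W * b
```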