This repository was archived by the owner on Mar 12, 2021. It is now read-only.

Create a CUDA context #406

Draft
wants to merge 34 commits into base: master
1 change: 1 addition & 0 deletions Project.toml
@@ -10,6 +10,7 @@ CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
IRTools = "7869d1d1-7146-5819-86e3-90919afe41df"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
4 changes: 3 additions & 1 deletion src/CuArrays.jl
@@ -4,7 +4,7 @@ using CUDAapi, CUDAdrv, CUDAnative

using GPUArrays

export CuArray, CuVector, CuMatrix, CuVecOrMat, cu
export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, cuda
Member:

Seems like a pretty generic function to export (both cu and cuda are bound to confuse users). Why not something that implies its action, e.g. on_cuda? Or @cuda, in the spirit of @async?

Collaborator:

I agree about not exporting this for now. In the longer term, if this is successful it should replace cu entirely (alongside all the other APIs, for most users), so a generic name seems appropriate.

I think cuda() do ... reads right, and provides an obvious space for options (cuda(device=2) do ...), but @cuda could work well too (especially in that it's a bit nicer for one-liners).
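
For concreteness, a minimal sketch of how a device option could be threaded through — purely hypothetical, not part of this PR; the cuda_on name and the device keyword are made up, and it assumes CUDAnative.device! is the appropriate call for switching devices:

```julia
# Hypothetical sketch only: a keyword-accepting front end over `cuda`,
# assuming `CUDAnative.device!` selects the active device.
function cuda_on(f; device = nothing, ctx = CUDACtx())
  device === nothing || CUDAnative.device!(device)  # switch devices before tracing
  cuda(f, ctx)
end

# Usage would mirror the proposed API:
# cuda_on(device = 2) do
#   W * b
# end
```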


import LinearAlgebra

@@ -81,6 +81,8 @@ include("tensor/CUTENSOR.jl")

include("nnlib.jl")

include("contextual.jl")

include("deprecated.jl")


152 changes: 152 additions & 0 deletions src/contextual.jl
@@ -0,0 +1,152 @@
using IRTools: isexpr, IR, @dynamo, postwalk
using IRTools: meta, Pipe, finish, Variable, self
using MacroTools: @forward

import Base.Broadcast.broadcasted
Collaborator:

These imports are redundant now

import Base.Broadcast.materialize
import Base.Broadcast.Broadcasted

# TODO use a WeakKeyDict
struct CUDACtx
  array_bank::IdDict{Array,CuArray}
end

CUDACtx() = CUDACtx(IdDict{Array,CuArray}())

# Display fns for debugging, remove before committing
function Base.summary(io::IO, cx::CUDACtx)
  print(io, "IR Context for CUDA ")
  summary(io, cx.array_bank)
end

function Base.show(io::IO, cx::CUDACtx)
  print(io, "IR Context for CUDA ")
  display(cx.array_bank)
end

@forward CUDACtx.array_bank Base.getindex, Base.iterate,
                            Base.setindex!, Base.empty!,
                            Base.length, Base.get!,
                            Base.first, Base.last, Base.haskey

function _resize!(a::Array, sz::NTuple{<:Any,Integer})
  # Grow the underlying buffer, then poke the new dimensions directly into the
  # jl_array struct so the zero-shaped placeholder takes on the real size.
  ccall(:jl_array_grow_end, Cvoid, (Any, UInt), a, prod(sz))
  ptr = convert(Ptr{Csize_t}, pointer_from_objref(a))
  for i = 1:length(sz)
    unsafe_store!(ptr + 8*(i + 2), sz[i])
  end
  return a
end

function refill!(a::Array, b::CuArray)
  # Resize the CPU placeholder to match the GPU result, then copy the data back.
  _resize!(a, size(b))
  copy!(a, b)
end

function cache(cx, x::CuArray{T,N})::Array{T,N} where {T,N}
  cpu = Array{T,N}(undef, ntuple(_->0,N))
  cx[cpu] = x
  return cpu
end

Member Author:

In cases like BatchNorm, before any compute is hit, the dimension check sees the cpu placeholder, which has zero shape, and errors. Returning the data also seems wasteful. Thoughts?

This does seem to make things work (including the backwards pass with Zygote on Flux models), but it's hitting some bad code paths currently.
cache(cx, f) = f
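
Regarding the shape problem raised above, one possible direction, sketched here only as a hedged suggestion (the cache_with_shape name is hypothetical, and it reuses this file's _resize! hack): give the placeholder the real dimensions without copying the device data back.

```julia
# Hypothetical alternative to `cache` (not part of this PR): the placeholder gets
# the CuArray's shape so dimension checks (e.g. in BatchNorm) pass, but its
# contents are left undefined, avoiding an eager device-to-host copy.
function cache_with_shape(cx, x::CuArray{T,N})::Array{T,N} where {T,N}
  cpu = Array{T,N}(undef, ntuple(_->0, N))
  _resize!(cpu, size(x))  # fix up the dims fields only; data stays uninitialized
  cx[cpu] = x
  return cpu
end
```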

# TODO: BitArray and friends would like an AbstractArray construct
function get_cached(cx::CUDACtx, arr::Array{T,N})::CuArray{T,N} where {T,N}
  get!(cx, arr, CuArray(arr))
end
get_cached(cx::CUDACtx, x) = x

function (cx::CUDACtx)(::typeof(Base._mapreducedim!), f, op, args...)
  gargs = map(x -> get_cached(cx, x), args)
  Base._mapreducedim!(f, op, gargs...) |> x -> cache(cx, x)
end

macro contextual(fs...)
  ex = Expr[]
  for f in fs
    # Quoted symbols (e.g. `:+`) arrive as QuoteNodes; unwrap them so the
    # generated method dispatches on the function (e.g. `typeof(+)`) rather
    # than on `Symbol`.
    f isa QuoteNode && (f = f.value)
    q = quote
      function (cx::CUDACtx)(::typeof($f), args...)
        gargs = map(x -> get_cached(cx, x), args)
        cache(cx, $f(gargs...))
      end
    end
    push!(ex, q)
  end

  quote
    $(ex...)
  end
end

@contextual :+ :- :* :/ sum similar materialize
Collaborator:

I think we should set these things up to explicitly call whatever lower-level bindings we have; it should show what it would look like if we got rid of CuArray altogether.
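
To make that suggestion concrete, a hedged sketch of what one such rule might look like if it called a lower-level binding directly — illustrative only, not part of the PR, and it assumes CuArrays.CUBLAS.gemm with character transpose flags:

```julia
# Hypothetical sketch: dispatch matrix-matrix multiply straight to CUBLAS rather
# than going through CuArray's generic `*` method.
function (cx::CUDACtx)(::typeof(*), A::Matrix{T}, B::Matrix{T}) where T<:Union{Float32,Float64}
  gA, gB = get_cached(cx, A), get_cached(cx, B)
  cache(cx, CuArrays.CUBLAS.gemm('N', 'N', gA, gB))
end
```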


function (cx::CUDACtx)(::typeof(reshape), arr, args...)
  r = reshape(get_cached(cx, arr), args...)
  cache(cx, r)
end

@dynamo function (cx::CUDACtx)(meta...)
  ir = IR(meta...)
  ir === nothing && return

  pr = Pipe(ir)
  for (v, st) in pr
    isexpr(st.expr, :call) || continue
    ex = st.expr
    # Rewrite every call `f(args...)` to `cx(f, args...)`, so the context
    # recurses through the whole call stack.
    pr[v] = Expr(:call, self, ex.args...)
  end
  return finish(pr)
end

"""
Disable `CUDACtx` for a function
"""
macro noop_pass(fs...)
ex = [:( (cx::CUDACtx)(::typeof($f), args...) = $f(args...) ) for f in fs]

quote
$(ex...)
end
end

@noop_pass get_cached NNlib.check_spdf
Collaborator:

Why do we need a noop for get_cached? That shouldn't ever be called in code that we're transforming, right?


for f in names(NNlib)
  getfield(NNlib, f) isa Function || continue
  @eval function (cx::CUDACtx)(::typeof($f), args...)
    gargs = map(x -> get_cached(cx, x), args)
    cache(cx, $f(gargs...))
  end
end

for f in names(LinearAlgebra)
  getfield(LinearAlgebra, f) isa Function || continue
  @eval function (cx::CUDACtx)(::typeof($f), args...)
    gargs = map(x -> get_cached(cx, x), args)
    cache(cx, $f(gargs...))
  end
end

"""
Creates a `cuda` context within which we travel
through the entire callstack to find matrix/vector
operations and try to offload them to a GPU.

Example:
```
cuda() do
# do something
end
```
"""
function cuda(f, ctx = CUDACtx())
out = ctx(f)
for (x, cx) in ctx
length(x) == length(cx) && continue
refill!(x, cx)
end
empty!(ctx)
return out
end
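
As an illustration of the intended workflow (the variable and function names here are hypothetical): ordinary CPU code goes in, its operations are intercepted and offloaded, and an ordinary Array comes back, matching the tests below.

```julia
# Hypothetical usage sketch: `predict` is plain CPU code; inside `cuda` its matrix
# multiply and broadcast are intercepted and offloaded, and the result is copied
# back into a regular Array.
predict(W, b, x) = (W * x) .+ b

W, b, x = rand(5, 5), rand(5), rand(5)

y = cuda() do
  predict(W, b, x)
end

y isa Array  # true — results are refilled back into CPU arrays
```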
48 changes: 48 additions & 0 deletions test/contextual.jl
@@ -0,0 +1,48 @@
using CuArrays, Test
using CuArrays.NNlib

# Check simple ops work and broadcast
@testset "simple ops" begin
W = rand(5, 5)
b = rand(5)
op = cuda(() -> W*b)
@test op ≈ W*b
@test op isa Array

a = rand(10)
b = rand(10)

r = cuda() do
a + b
end
@test r isa Array

r = cuda() do
a .+ b
end
@test r isa Array
end

# Check that user-defined functions are traversed
@testset "linear" begin
  linear(x, W, b) = (x * W) .+ b
  w = rand(10, 10)
  b = zeros(10)
  x = rand(10, 10)
  r = cuda() do
    linear(x, w, b)
  end
  @test r isa Array
end

# Check that NNlib is wrapped correctly
@testset "conv Context" begin
  w = rand(Float32, 3, 3, 3, 16)
  r = rand(Float32, 32, 32, 3, 1)
  c = cuda() do
    conv(r, w)
  end
  g = conv(r, w)
  @test c ≈ g
  @test c isa Array
end
1 change: 1 addition & 0 deletions test/runtests.jl
@@ -27,6 +27,7 @@ include("sparse_solver.jl")
include("dnn.jl")
include("tensor.jl")
include("forwarddiff.jl")
include("contextual.jl")

CuArrays.memory_status()
CuArrays.pool_timings()