This repository was archived by the owner on Mar 12, 2021. It is now read-only.

Commit b8b1c4e

Merge pull request #404 from JuliaGPU/tb/flux
WIP: CUDNN improvements
2 parents 616fb49 + 9ae7b88 commit b8b1c4e

33 files changed: +3204 −821 lines

.gitlab-ci.yml

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ flux:
                 Pkg.develop(PackageSpec(path=pwd()));
                 Pkg.build();'
     - julia -e 'using Pkg;
-                Pkg.add("Flux");
+                Pkg.add(PackageSpec(name="Flux", rev="tb/cuarrays_dnn"));
                 Pkg.test("Flux");'
   allow_failure: true

src/CuArrays.jl

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ include("broadcast.jl")
 include("matmul.jl")
 include("mapreduce.jl")
 include("accumulate.jl")
+include("linalg.jl")

 include("gpuarray_interface.jl")

src/array.jl

Lines changed: 0 additions & 39 deletions
@@ -239,45 +239,6 @@ function Base.fill!(A::CuArray{T}, x) where T <: MemsetCompatTypes
 end


-## generic linear algebra routines
-
-function LinearAlgebra.tril!(A::CuMatrix{T}, d::Integer = 0) where T
-    function kernel!(_A, _d)
-        li = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-        m, n = size(_A)
-        if 0 < li <= m*n
-            i, j = Tuple(CartesianIndices(_A)[li])
-            if i < j - _d
-                _A[i, j] = 0
-            end
-        end
-        return nothing
-    end
-
-    blk, thr = cudims(A)
-    @cuda blocks=blk threads=thr kernel!(A, d)
-    return A
-end
-
-function LinearAlgebra.triu!(A::CuMatrix{T}, d::Integer = 0) where T
-    function kernel!(_A, _d)
-        li = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-        m, n = size(_A)
-        if 0 < li <= m*n
-            i, j = Tuple(CartesianIndices(_A)[li])
-            if j < i + _d
-                _A[i, j] = 0
-            end
-        end
-        return nothing
-    end
-
-    blk, thr = cudims(A)
-    @cuda blocks=blk threads=thr kernel!(A, d)
-    return A
-end
-
-
 ## reversing

 # the kernel works by treating the array as 1d. after reversing by dimension x an element at

src/blas/CUBLAS.jl

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ module CUBLAS

 using CUDAapi

-import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL, devices
+using CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL, devices

 import CUDAnative

src/deprecated.jl

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # Deprecated functionality

-import Base: @deprecate_binding
+using Base: @deprecate_binding

 @deprecate_binding BLAS CUBLAS
 @deprecate_binding FFT CUFFT

src/dnn/CUDNN.jl

Lines changed: 24 additions & 13 deletions
@@ -1,21 +1,22 @@
 module CUDNN

-import CUDAapi
+using CUDAapi
+using CUDAapi: libraryPropertyType

-import CUDAdrv: CUDAdrv, CuContext, CuPtr, CU_NULL
+using CUDAdrv
+using CUDAdrv: CuContext, CuPtr, PtrOrCuPtr, CU_NULL, CuStream_t

 import CUDAnative

+using CEnum
+
 using ..CuArrays
-using ..CuArrays: @libcudnn, active_context, unsafe_free!
-using ..CuArrays: CuVecOrMat, CuVector
+using ..CuArrays: @libcudnn, active_context, CuVecOrMat, CuVector
+import ..CuArrays.unsafe_free!

-using NNlib
-import NNlib: conv!, ∇conv_filter!, ∇conv_data!, stride, dilation, flipkernel,
-  maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, spatial_dims, padding, kernel_size,
-  softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax
+import NNlib

-include("libcudnn_types.jl")
+include("libcudnn_common.jl")
 include("error.jl")

 const _handles = Dict{CuContext,cudnnHandle_t}()
@@ -35,13 +36,23 @@ function handle()
     return _handle[]
 end

+include("base.jl")
 include("libcudnn.jl")
+
 include("helpers.jl")
+include("tensor.jl")
+include("conv.jl")
+include("pooling.jl")
+include("activation.jl")
+include("filter.jl")
+include("softmax.jl")
+include("batchnorm.jl")
+include("dropout.jl")
+include("rnn.jl")
+
+# interfaces with other software
 include("nnlib.jl")
-include("compat.jl")

-version() = VersionNumber(cudnnGetProperty(CUDAapi.MAJOR_VERSION),
-                          cudnnGetProperty(CUDAapi.MINOR_VERSION),
-                          cudnnGetProperty(CUDAapi.PATCH_LEVEL))
+include("compat.jl")

 end

src/dnn/activation.jl

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+# descriptor
+
+mutable struct ActivationDesc
+    ptr::cudnnActivationDescriptor_t
+end
+
+unsafe_free!(ad::ActivationDesc)=cudnnDestroyActivationDescriptor(ad.ptr)
+
+Base.unsafe_convert(::Type{cudnnActivationDescriptor_t}, ad::ActivationDesc)=ad.ptr
+
+function ActivationDesc(mode, coeff, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN)
+    ad = Ref{cudnnActivationDescriptor_t}()
+    cudnnCreateActivationDescriptor(ad)
+    cudnnSetActivationDescriptor(ad[],mode,reluNanOpt,coeff)
+    this = ActivationDesc(ad[])
+    finalizer(unsafe_free!, this)
+    return this
+end
+
+
+# wrappers
+
+function cudnnActivationForward(y::CuArray{T,N}, x::CuArray{T,N}; mode=CUDNN_ACTIVATION_RELU, #CUDNN_ACTIVATION_IDENTITY will not work
+                                coeff=0.0, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0) where {T,N}
+    ad = ActivationDesc(mode, T(coeff), reluNanOpt)
+    cudnnActivationForward(handle(), ad, Ref(T(alpha)), TensorDesc(x), x, Ref(T(beta)), TensorDesc(y), y)
+    return y
+end
+
+function cudnnActivationBackward(dx::CuArray{T,N}, x::CuArray{T,N}, y::CuArray{T,N}, dy::CuArray{T,N};
+                                 mode=CUDNN_ACTIVATION_RELU, #CUDNN_ACTIVATION_IDENTITY will not work
+                                 coeff=0.0, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0) where {T,N}
+    ad = ActivationDesc(mode, T(coeff), reluNanOpt)
+    cudnnActivationBackward(handle(), ad, Ref(T(alpha)), TensorDesc(y), y, TensorDesc(dy), dy, TensorDesc(x), x, Ref(T(beta)), TensorDesc(dx), dx)
+    return dx
+end
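
For orientation, a minimal usage sketch of the new activation wrappers follows. It is not part of the commit; it assumes CuArrays is loaded with this CUDNN submodule built against a working cuDNN installation and a CUDA device.

# Hypothetical usage, not from the commit: ReLU forward and backward via the wrappers above.
using CuArrays
using CuArrays.CUDNN: cudnnActivationForward, cudnnActivationBackward

x  = CuArray(rand(Float32, 8, 8, 4, 2))   # W×H×C×N input tensor
y  = similar(x)
cudnnActivationForward(y, x)              # defaults to CUDNN_ACTIVATION_RELU

dy = CuArray(ones(Float32, size(y)))      # upstream gradient
dx = similar(x)
cudnnActivationBackward(dx, x, y, dy)     # gradient with respect to x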

src/dnn/base.jl

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+function cudnnCreate()
+    handle = Ref{cudnnHandle_t}()
+    cudnnCreate(handle)
+    return handle[]
+end
+
+function cudnnGetProperty(property::CUDAapi.libraryPropertyType)
+    value_ref = Ref{Cint}()
+    cudnnGetProperty(property, value_ref)
+    value_ref[]
+end
+
+version() = VersionNumber(cudnnGetProperty(CUDAapi.MAJOR_VERSION),
+                          cudnnGetProperty(CUDAapi.MINOR_VERSION),
+                          cudnnGetProperty(CUDAapi.PATCH_LEVEL))
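
With version() now living in base.jl, a short sketch of how it is queried (assumed usage, not part of the commit; the value reported depends on the locally installed cuDNN):

# Hypothetical usage, not from the commit.
using CuArrays
CuArrays.CUDNN.version()   # a VersionNumber assembled from MAJOR/MINOR/PATCH_LEVEL, e.g. v"7.x.y"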

src/dnn/batchnorm.jl

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+mutable struct BNCache
+    mean
+    ivar
+end
+
+BNCache() = BNCache(nothing, nothing)
+
+@inline _wsize(y) = (map(_ -> 1, size(y)[1:end-2])..., size(y)[end-1], 1)
+
+@inline _reddims(y) = (collect(1:ndims(y)-2)..., ndims(y))
+
+# NOTE: CuDNN supports only 4D and 5D Tensors for BatchNorm Operations
+# so reshape a 2D Tensor into 4D
+batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2},
+          running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
+          cache = nothing, alpha = T(1), beta = T(0),
+          eps = T(1e-5), training = true) where T<:Union{Float32, Float64} =
+    dropdims(batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), running_mean, running_var, momentum,
+                       cache = cache, alpha = alpha, beta = beta, eps = eps, training = training), dims = (1, 2))
+
+function batchnorm(g::CuArray{T}, b::CuArray{T}, x::Union{CuArray{T, 4},CuArray{T,5}},
+                   running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
+                   cache = nothing, alpha = T(1), beta = T(0),
+                   eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
+    y = similar(x)
+    cudnnBNForward!(y, g, b, x, running_mean, running_var, momentum, cache = cache,
+                    alpha = alpha, beta = beta, eps = eps, training = training)
+    y
+end
+
+function cudnnBNForward!(y::CuArray{T}, g::CuArray{T}, b::CuArray{T}, x::CuArray{T},
+                         running_mean::CuArray{T}, running_var::CuArray{T},
+                         momentum; cache = nothing,
+                         alpha = T(1), beta = T(0),
+                         eps = T(1e-5), training = true) where T<:Union{Float32, Float64}
+    dims = _wsize(x)
+    if eps < CUDNN_BN_MIN_EPSILON
+        # warn("eps ",eps," is too small for CuDNN so eps has been assigned the value ", CUDNN_BN_MIN_EPSILON)
+        eps = CUDNN_BN_MIN_EPSILON
+    end
+    xd = TensorDesc(x)
+    yd = TensorDesc(y)
+    gd = TensorDesc(T, dims)
+
+    if training
+
+        if cache !== nothing
+            mean = zeros(CuArray{T}, dims...)
+            ivar = ones(CuArray{T}, dims...)
+        else
+            mean = CU_NULL
+            ivar = CU_NULL
+        end
+
+        cudnnBatchNormalizationForwardTraining(handle(), CUDNN_BATCHNORM_SPATIAL, Ref(T(alpha)), Ref(T(beta)), xd, x, yd, y, gd, g, b, momentum, running_mean, running_var, eps, mean, ivar)
+
+        if cache !== nothing
+            cache.mean = mean
+            cache.ivar = ivar
+        end
+    else
+        cudnnBatchNormalizationForwardInference(handle(), CUDNN_BATCHNORM_SPATIAL, Ref(T(alpha)), Ref(T(beta)), xd, x, yd, y, gd, g, b, running_mean, running_var, eps)
+    end
+end
+
+function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T, 2}, dy::CuArray{T, 2},
+                    running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
+                    cache = nothing, eps = T(1e-5), alpha = T(1),
+                    beta = T(0), training = true) where T<:Union{Float32, Float64}
+    dg, db, dx = ∇batchnorm(g, b, reshape(x, 1, 1, size(x, 1), size(x, 2)), reshape(dy, 1, 1, size(dy, 1),
+                            size(dy, 2)), running_mean, running_var, momentum, cache = cache, eps = eps,
+                            alpha = alpha, beta = beta, training = training)
+    (dg, db, dropdims(dx, dims = (1, 2)))
+end
+
+function ∇batchnorm(g::CuArray{T}, b::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
+                    running_mean::CuArray{T}, running_var::CuArray{T}, momentum;
+                    cache = nothing, eps = T(1e-5), alpha = T(1),
+                    beta = T(0), training = true) where T<:Union{Float32, Float64}
+    dg = similar(g)
+    db = similar(b)
+    dx = similar(x)
+    cudnnBNBackward!(dg, g, db, dx, x, dy, running_mean, running_var, T(momentum),
+                     training = training, cache = cache, eps = eps, alpha = alpha, beta = beta)
+    (dg, db, dx)
+end
+
+function cudnnBNBackward!(dg::CuArray{T}, g::CuArray{T}, db::CuArray{T},
+                          dx::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
+                          running_mean::CuArray{T}, running_var::CuArray{T},
+                          momentum; cache = nothing, eps = T(1e-5),
+                          alpha = T(1), beta = T(0),
+                          dalpha = T(1), dbeta = T(0), training = true) where T<:Union{Float32, Float64}
+    if training
+        xd = TensorDesc(x)
+        dyd = TensorDesc(dy)
+        dxd = TensorDesc(dx)
+        gd = TensorDesc(T, _wsize(x))
+        if cache !== nothing
+            mean, ivar = cache.mean, cache.ivar
+            info("mean and ivar are fetched from the cache")
+        else
+            mean, ivar = CU_NULL, CU_NULL
+        end
+
+        if eps < CUDNN_BN_MIN_EPSILON
+            eps = CUDNN_BN_MIN_EPSILON
+        end
+
+        cudnnBatchNormalizationBackward(handle(), CUDNN_BATCHNORM_SPATIAL, Ref(T(alpha)), Ref(T(beta)), Ref(T(dalpha)), Ref(T(dbeta)), xd, x, dyd, dy, dxd, dx, gd, g, dg, db, eps, mean, ivar)
+    else
+        ivar = 1 ./ sqrt.(reshape(running_var, _wsize(x)) .+ eps)
+        dx .= dy .* reshape(g, _wsize(x)) .* ivar
+        dg .= squeeze(sum(dy .* (x .- reshape(running_mean, _wsize(x))) .* ivar, _reddims(dy)), dims = (1,2,4))
+        db .= squeeze(sum(dy, _reddims(dy)), dims = (1,2,4))
+    end
+end
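
As a rough usage sketch of the high-level entry points above (assumed example, not part of the commit; it presumes per-channel scale/bias and running statistics of length C, a built CUDNN submodule, and a CUDA device):

# Hypothetical usage: batch-normalize a 4-D activation tensor on the GPU.
using CuArrays
using CuArrays.CUDNN: batchnorm, ∇batchnorm

x     = CuArray(rand(Float32, 8, 8, 3, 4))   # W×H×C×N
g     = CuArray(ones(Float32, 3))            # per-channel scale
b     = CuArray(zeros(Float32, 3))           # per-channel bias
rmean = CuArray(zeros(Float32, 3))           # running mean
rvar  = CuArray(ones(Float32, 3))            # running variance

y = batchnorm(g, b, x, rmean, rvar, 0.1f0; training = true)

dy = CuArray(ones(Float32, size(y)))         # upstream gradient
dg, db, dx = ∇batchnorm(g, b, x, dy, rmean, rvar, 0.1f0; training = true)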
