Skip to content
This repository was archived by the owner on Mar 12, 2021. It is now read-only.

Commit cdcd4cc

Browse files
authored
Merge pull request #504 from JuliaGPU/tb/reclaim
API for manually reclaiming memory
2 parents 1f03ea7 + 6cc14ea commit cdcd4cc

File tree

12 files changed

+141
-91
lines changed

12 files changed

+141
-91
lines changed

.gitlab-ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ flux:
111111
tags:
112112
- nvidia
113113
script:
114-
- export FLUX="$HOME/.julia/dev/Flux"
114+
- export FLUX=".julia/dev/Flux"
115115
- julia -e 'using Pkg;
116116
Pkg.develop("Flux");'
117117
- julia --project -e 'using Pkg;

Manifest.toml

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,6 @@ git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb"
5050
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
5151
version = "2.2.0"
5252

53-
[[Crayons]]
54-
deps = ["Test"]
55-
git-tree-sha1 = "f621b8ef51fd2004c7cf157ea47f027fdeac5523"
56-
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
57-
version = "4.0.0"
58-
5953
[[DataStructures]]
6054
deps = ["InteractiveUtils", "OrderedCollections"]
6155
git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a"
@@ -176,10 +170,10 @@ deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
176170
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
177171

178172
[[TimerOutputs]]
179-
deps = ["Crayons", "Printf", "Test", "Unicode"]
180-
git-tree-sha1 = "b80671c06f8f8bae08c55d67b5ce292c5ae2660c"
173+
deps = ["Printf"]
174+
git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03"
181175
uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
182-
version = "0.5.0"
176+
version = "0.5.3"
183177

184178
[[Tokenize]]
185179
git-tree-sha1 = "dfcdbbfb2d0370716c815cbd6f8a364efb6f42cf"

src/indexing.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ function Base.findfirst(vals::CuArray, xs::CuArray)
192192

193193
# blocks to cover the remaining dimensions
194194
dev = CUDAdrv.device(kernel.fun.mod.ctx)
195-
max_other_blocks = attribute(dev, CUDAdrv.MAX_GRID_DIM_Y)
195+
max_other_blocks = attribute(dev, CUDAdrv.DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)
196196
blocks_other = (min(length(Rother), max_other_blocks),
197197
cld(length(Rother), max_other_blocks))
198198

src/memory.jl

Lines changed: 76 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ function actual_free(ptr::CuPtr{Nothing})
8686
end
8787

8888

89-
## memory pool
89+
## memory pools
9090

9191
const pool_to = TimerOutput()
9292

@@ -96,11 +96,11 @@ end
9696

9797
pool_timings() = (show(pool_to; allocations=false, sortby=:name); println())
9898

99-
# API:
99+
# pool API:
100100
# - init()
101-
# - deinit()
102101
# - alloc(sz)::CuPtr{Nothing}
103102
# - free(::CuPtr{Nothing})
103+
# - reclaim(nb::Int=typemax(Int))::Int
104104
# - used_memory()
105105
# - cached_memory()
106106

@@ -111,8 +111,22 @@ include("memory/dummy.jl")
111111

112112
const pool = Ref{Module}(BinnedPool)
113113

114+
115+
## interface
116+
117+
export OutOfGPUMemoryError
118+
114119
const requested = Dict{CuPtr{Nothing},Int}()
115120

121+
struct OutOfGPUMemoryError <: Exception
122+
sz::Int
123+
end
124+
125+
function Base.showerror(io::IO, err::OutOfGPUMemoryError)
126+
println(io, "Out of GPU memory trying to allocate $(Base.format_bytes(err.sz))")
127+
memory_status(io)
128+
end
129+
116130
@inline function alloc(sz)::CuPtr{Nothing}
117131
# 0-byte allocations shouldn't hit the pool
118132
sz == 0 && return CU_NULL
@@ -121,10 +135,7 @@ const requested = Dict{CuPtr{Nothing},Int}()
121135
@pool_timeit "pooled alloc" ptr = pool[].alloc(sz)
122136
end
123137
if ptr === nothing
124-
@error "Out of GPU memory trying to allocate $(Base.format_bytes(sz))"
125-
pool[].dump()
126-
memory_status()
127-
throw(OutOfMemoryError())
138+
throw(OutOfGPUMemoryError(sz))
128139
end
129140

130141
alloc_stats.pool_nalloc += 1
@@ -151,7 +162,7 @@ end
151162
return
152163
end
153164

154-
pool_dump() = pool[].dump()
165+
reclaim(sz::Int=typemax(Int)) = pool[].reclaim(sz)
155166

156167

157168
## utilities
@@ -238,35 +249,35 @@ macro time(ex)
238249
end
239250
end
240251

241-
function memory_status()
252+
function memory_status(io::IO=stdout)
242253
free_bytes, total_bytes = CUDAdrv.Mem.info()
243254
used_bytes = total_bytes - free_bytes
244255
used_ratio = used_bytes / total_bytes
245256

246-
@printf("Effective GPU memory usage: %.2f%% (%s/%s)\n",
247-
100*used_ratio, Base.format_bytes(used_bytes),
248-
Base.format_bytes(total_bytes))
257+
@printf(io, "Effective GPU memory usage: %.2f%% (%s/%s)\n",
258+
100*used_ratio, Base.format_bytes(used_bytes),
259+
Base.format_bytes(total_bytes))
249260

250-
@printf("CuArrays GPU memory usage: %s", Base.format_bytes(usage[]))
261+
@printf(io, "CuArrays GPU memory usage: %s", Base.format_bytes(usage[]))
251262
if usage_limit[] !== nothing
252-
@printf(" (capped at %s)", Base.format_bytes(usage_limit[]))
263+
@printf(io, " (capped at %s)", Base.format_bytes(usage_limit[]))
253264
end
254-
println()
265+
println(io)
255266

256267
alloc_used_bytes = pool[].used_memory()
257268
alloc_cached_bytes = pool[].cached_memory()
258269
alloc_total_bytes = alloc_used_bytes + alloc_cached_bytes
259270

260-
@printf("%s usage: %s (%s allocated, %s cached)\n", nameof(pool[]),
261-
Base.format_bytes(alloc_total_bytes), Base.format_bytes(alloc_used_bytes),
262-
Base.format_bytes(alloc_cached_bytes))
271+
@printf(io, "%s usage: %s (%s allocated, %s cached)\n", nameof(pool[]),
272+
Base.format_bytes(alloc_total_bytes), Base.format_bytes(alloc_used_bytes),
273+
Base.format_bytes(alloc_cached_bytes))
263274

264275
requested_bytes = reduce(+, values(requested); init=0)
265276

266-
@printf("%s efficiency: %.2f%% (%s requested, %s allocated)\n", nameof(pool[]),
267-
100*requested_bytes/usage[],
268-
Base.format_bytes(requested_bytes),
269-
Base.format_bytes(usage[]))
277+
@printf(io, "%s efficiency: %.2f%% (%s requested, %s allocated)\n", nameof(pool[]),
278+
100*requested_bytes/usage[],
279+
Base.format_bytes(requested_bytes),
280+
Base.format_bytes(usage[]))
270281

271282
# check if the memory usage as counted by the CUDA allocator wrapper
272283
# matches what is reported by the pool implementation
@@ -276,6 +287,49 @@ function memory_status()
276287
end
277288
end
278289

290+
"""
291+
extalloc(f::Function; check::Function=ex->isa(ex,OutOfGPUMemoryError), nb::Integer=typemax(Int))
292+
293+
Run a function `f` repeatedly until it successfully allocates the memory it needs. Only
294+
out-of-memory exceptions that pass `check` are considered for retry; this defaults to
295+
checking for the CuArrays out-of-memory exception, but should be customized to match how
296+
an out-of-memory situation is reported by the function `f`. The argument `nb` indicates how
297+
many bytes of memory `f` requires, and serves as a hint for how much memory to reclaim
298+
before trying `f` again.
299+
"""
300+
function extalloc(f::Function; check::Function=ex->isa(ex,OutOfGPUMemoryError), nb::Integer=typemax(Int))
301+
phase = 0
302+
while true
303+
phase += 1
304+
return try
305+
f()
306+
catch ex
307+
check(ex) || rethrow()
308+
309+
# incrementally costly reclaim of more and more memory
310+
if phase == 1
311+
reclaim(nb)
312+
elseif phase == 2
313+
GC.gc(false)
314+
reclaim(nb)
315+
elseif phase == 3
316+
GC.gc(true)
317+
reclaim(nb)
318+
elseif phase == 4
319+
# maybe the user lied, so try reclaiming all memory
320+
GC.gc(true)
321+
reclaim()
322+
else
323+
# give up
324+
rethrow()
325+
end
326+
327+
# try again
328+
continue
329+
end
330+
end
331+
end
332+
279333

280334
## init
281335

src/memory/binned.jl

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ module BinnedPool
1717
# or just use unified memory for all allocations.
1818
# - per-device pools
1919

20+
# TODO: move the management thread one level up, to be shared by all allocators
21+
2022
using ..CuArrays
2123
using ..CuArrays: @pool_timeit
2224

@@ -132,7 +134,7 @@ function scan()
132134
end
133135

134136
# reclaim unused buffers
135-
function reclaim(full::Bool=false, target_bytes::Int=typemax(Int))
137+
function reclaim(target_bytes::Int=typemax(Int); full::Bool=true)
136138
# find inactive buffers
137139
@pool_timeit "scan" begin
138140
pools_inactive = Vector{Int}(undef, length(pools_avail)) # pid => buffers that can be freed
@@ -161,6 +163,7 @@ function reclaim(full::Bool=false, target_bytes::Int=typemax(Int))
161163

162164
# reclaim buffers (in reverse, to discard largest buffers first)
163165
@pool_timeit "reclaim" begin
166+
freed = 0
164167
for pid in reverse(eachindex(pools_inactive))
165168
bytes = poolsize(pid)
166169
avail = pools_avail[pid]
@@ -172,13 +175,14 @@ function reclaim(full::Bool=false, target_bytes::Int=typemax(Int))
172175

173176
actual_free(block)
174177

175-
target_bytes -= bytes
176-
target_bytes <= 0 && return true
178+
freed += bytes
179+
if freed >= target_bytes
180+
return freed
181+
end
177182
end
178183
end
184+
return freed
179185
end
180-
181-
return false
182186
end
183187

184188

@@ -208,7 +212,7 @@ function pool_alloc(bytes, pid=-1)
208212
# would require proper block splitting + compaction to be at all efficient.
209213

210214
@pool_timeit "3. reclaim unused" begin
211-
reclaim(true, bytes)
215+
reclaim(bytes)
212216
end
213217

214218
@pool_timeit "4. try alloc" begin
@@ -226,7 +230,7 @@ function pool_alloc(bytes, pid=-1)
226230
end
227231

228232
@pool_timeit "6. reclaim unused" begin
229-
reclaim(true, bytes)
233+
reclaim(bytes)
230234
end
231235

232236
@pool_timeit "7. try alloc" begin
@@ -236,7 +240,7 @@ function pool_alloc(bytes, pid=-1)
236240
end
237241

238242
@pool_timeit "8. reclaim everything" begin
239-
reclaim(true)
243+
reclaim()
240244
end
241245

242246
@pool_timeit "9. try alloc" begin
@@ -266,7 +270,7 @@ function init()
266270
delay = min(delay*2, MAX_DELAY)
267271
end
268272

269-
reclaim()
273+
reclaim(full=false)
270274
end
271275

272276
sleep(delay)
@@ -275,13 +279,6 @@ function init()
275279
end
276280
end
277281

278-
function deinit()
279-
@assert sum(length, pools_used) == 0 "Cannot deinitialize memory pool with outstanding allocations"
280-
reclam(fulll)
281-
282-
return
283-
end
284-
285282
function alloc(bytes)
286283
# only manage small allocations in the pool
287284
if bytes <= MAX_POOL

src/memory/dummy.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ using CUDAdrv
88

99
init() = return
1010

11-
deinit() = @assert isempty(allocated) "Cannot deinitialize memory pool with outstanding allocations"
12-
1311
const allocated = Dict{CuPtr{Nothing},Int}()
1412

1513
function alloc(sz)
@@ -42,6 +40,8 @@ function free(ptr)
4240
return
4341
end
4442

43+
reclaim(target_bytes::Int=typemax(Int)) = return 0
44+
4545
used_memory() = isempty(allocated) ? 0 : sum(sizeof, values(allocated))
4646

4747
cached_memory() = 0

src/memory/simple.jl

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,13 @@ function scan(sz)
6060
return
6161
end
6262

63-
function reclaim(sz)
63+
function reclaim(sz::Int=typemax(Int))
6464
freed = 0
6565
while freed < sz && !isempty(available)
6666
block = pop!(available)
67-
actual_free(block)
6867
freed += sizeof(block)
68+
actual_free(block)
6969
end
70-
7170
return freed
7271
end
7372

@@ -110,17 +109,6 @@ end
110109

111110
init() = return
112111

113-
function deinit()
114-
@assert isempty(allocated) "Cannot deinitialize memory pool with outstanding allocations"
115-
116-
for block in available
117-
actual_free(block)
118-
end
119-
empty!(available)
120-
121-
return
122-
end
123-
124112
function alloc(sz)
125113
block = pool_alloc(sz)
126114
if block !== nothing

src/memory/split.jl

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -402,23 +402,6 @@ end
402402

403403
init() = return
404404

405-
function deinit()
406-
@assert isempty(allocated) "Cannot deinitialize memory pool with outstanding allocations"
407-
408-
repopulate(freed)
409-
incremental_compact!(Set(freed))
410-
empty!(freed)
411-
412-
for available in (available_small, available_large, available_huge)
413-
while !isempty(available)
414-
block = pop!(available)
415-
actual_free(block)
416-
end
417-
end
418-
419-
return
420-
end
421-
422405
function alloc(sz)
423406
block = pool_alloc(sz)
424407
if block !== nothing
@@ -440,6 +423,15 @@ function free(ptr)
440423
return
441424
end
442425

426+
function reclaim(sz::Int=typemax(Int))
427+
freed = 0
428+
for available in (available_huge, available_large, available_small)
429+
freed >= sz && break
430+
freed += reclaim!(available, sz-freed)
431+
end
432+
return freed
433+
end
434+
443435
used_memory() = mapreduce(sizeof, +, values(allocated); init=0)
444436

445437
cached_memory() = mapreduce(sizeof, +, union(available_small, available_large, available_huge); init=0)

0 commit comments

Comments
 (0)