Skip to content

Commit d57508a

Browse files
committed
Simplify device selection.
1 parent 8824c24 commit d57508a

File tree

1 file changed

+39
-52
lines changed

1 file changed

+39
-52
lines changed

test/runtests.jl

Lines changed: 39 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ if do_help
4444
--thorough Don't allow skipping tests that are not supported.
4545
--quickfail Fail the entire run as soon as a single test errored.
4646
--jobs=N Launch `N` processes to perform tests (default: Sys.CPU_THREADS).
47-
--gpus=N Expose `N` GPUs to test processes (default: 1).
47+
--gpu=1,2,... Which GPUs to use (comma-separated list of indices, default: all)
4848
--sanitize[=tool] Run the tests under `compute-sanitizer`.
4949
5050
Remaining arguments filter the tests that will be executed.""")
@@ -54,14 +54,15 @@ set_jobs, jobs = extract_flag!(ARGS, "--jobs"; typ=Int)
5454
do_sanitize, sanitize_tool = extract_flag!(ARGS, "--sanitize", "memcheck")
5555
do_thorough, _ = extract_flag!(ARGS, "--thorough")
5656
do_quickfail, _ = extract_flag!(ARGS, "--quickfail")
57+
do_gpu_list, gpu_list = extract_flag!(ARGS, "--gpu")
58+
do_list, _ = extract_flag!(ARGS, "--list")
59+
## no options should remain
60+
optlike_args = filter(startswith("-"), ARGS)
61+
if !isempty(optlike_args)
62+
error("Unknown test options `$(join(optlike_args, " "))` (try `--help` for usage instructions)")
63+
end
5764

5865
include("setup.jl") # make sure everything is precompiled
59-
_, gpus = extract_flag!(ARGS, "--gpus", ndevices())
60-
if !set_jobs
61-
cpu_jobs = Sys.CPU_THREADS
62-
memory_jobs = Int(Sys.free_memory()) ÷ (2 * 2^30)
63-
jobs = min(cpu_jobs, memory_jobs)
64-
end
6566

6667
# choose tests
6768
const tests = ["core/initialization"] # needs to run first
@@ -100,22 +101,16 @@ for (rootpath, dirs, files) in walkdir(@__DIR__)
100101
end
101102
unique!(tests)
102103

103-
# parse some more command-line arguments
104-
## --list to list all available tests
105-
do_list, _ = extract_flag!(ARGS, "--list")
104+
# list tests, if requested
106105
if do_list
107106
println("Available tests:")
108107
for test in sort(tests)
109108
println(" - $test")
110109
end
111110
exit(0)
112111
end
113-
## no options should remain
114-
optlike_args = filter(startswith("-"), ARGS)
115-
if !isempty(optlike_args)
116-
error("Unknown test options `$(join(optlike_args, " "))` (try `--help` for usage instructions)")
117-
end
118-
## the remaining args filter tests
112+
113+
# filter tests
119114
if !isempty(ARGS)
120115
filter!(tests) do test
121116
any(arg->startswith(test, arg), ARGS)
@@ -128,46 +123,30 @@ label_match = match(r"^CUDA ([\d.]+)$", get(ENV, "BUILDKITE_LABEL", ""))
128123
if label_match !== nothing
129124
@test toolkit_release == VersionNumber(label_match.captures[1])
130125
end
131-
132-
# find suitable devices
133126
@info "System information:\n" * sprint(io->CUDA.versioninfo(io))
134-
candidates = []
135-
for (index,dev) in enumerate(devices())
136-
# fetch info that doesn't require a context
127+
128+
# select devices
129+
function gpu_entry(dev)
137130
id = deviceid(dev)
138-
mig = CUDA.uuid(dev) != CUDA.parent_uuid(dev)
139-
uuid = CUDA.uuid(dev)
140131
name = CUDA.name(dev)
141-
cap = capability(dev)
142-
143-
mem = try
144-
device!(dev)
145-
mem = CUDA.available_memory()
146-
# immediately reset the device. this helps to reduce memory usage,
147-
# and is needed for systems that only provide exclusive access to the GPUs
148-
CUDA.device_reset!()
149-
mem
150-
catch err
151-
if isa(err, OutOfGPUMemoryError)
152-
# the device doesn't even have enough memory left to instantiate a context...
153-
0
154-
else
155-
rethrow()
156-
end
132+
uuid = CUDA.uuid(dev)
133+
cap = CUDA.capability(dev)
134+
mig = uuid != CUDA.parent_uuid(dev)
135+
(; id, name, cap, uuid="$(mig ? "MIG" : "GPU")-$uuid")
136+
end
137+
gpus = if do_gpu_list
138+
# parse the list of GPUs
139+
map(gpu_list) do str
140+
id = parse(Int, str)
141+
gpu_entry(CuDevice(id))
157142
end
158-
159-
push!(candidates, (; id, uuid, mig, name, cap, mem))
160-
161-
# NOTE: we don't use NVML here because it doesn't respect CUDA_VISIBLE_DEVICES
143+
else
144+
# find all GPUs
145+
map(gpu_entry, CUDA.devices())
162146
end
163-
## order by available memory, but also by capability if testing needs to be thorough
164-
sort!(candidates, by=x->x.mem)
165-
## apply
166-
picks = reverse(candidates[end-gpus+1:end]) # best GPU first
167-
ENV["CUDA_VISIBLE_DEVICES"] = join(map(pick->"$(pick.mig ? "MIG" : "GPU")-$(pick.uuid)", picks), ",")
168-
@info "Testing using $(length(picks)) device(s): " * join(map(pick->"$(pick.id). $(pick.name) (UUID $(pick.uuid))", picks), ", ")
169-
170-
@info "Running $jobs tests in parallel. If this is too many, specify the `--jobs` argument to the tests, or set the JULIA_CPU_THREADS environment variable."
147+
@info("Testing using device(s) " * join(map(gpu->"$(gpu.id) ($(gpu.name))", gpus), ", ", " and ") *
148+
". To change this, specify the `--gpu` argument to the test, or set the `CUDA_VISIBLE_DEVICES` environment variable.")
149+
ENV["CUDA_VISIBLE_DEVICES"] = join(map(gpu->gpu.uuid, gpus), ",")
171150

172151
# determine tests to skip
173152
skip_tests = []
@@ -181,7 +160,7 @@ if do_sanitize
181160
# XXX: these hang for some reason
182161
append!(skip_tests, ["base/sorting"])
183162
end
184-
if first(picks).cap < v"7.0"
163+
if first(gpus).cap < v"7.0"
185164
push!(skip_tests, "core/device/intrinsics/wmma")
186165
end
187166
if Sys.ARCH == :aarch64
@@ -212,6 +191,14 @@ else
212191
all_tests = copy(tests)
213192
end
214193

194+
# determine parallelism
195+
if !set_jobs
196+
cpu_jobs = Sys.CPU_THREADS
197+
memory_jobs = Int(Sys.free_memory()) ÷ (2 * 2^30)
198+
jobs = min(cpu_jobs, memory_jobs)
199+
end
200+
@info "Running $jobs tests in parallel. If this is too many, specify the `--jobs` argument to the tests, or set the JULIA_CPU_THREADS environment variable."
201+
215202
# add workers
216203
const test_exeflags = Base.julia_cmd()
217204
filter!(test_exeflags.exec) do c

0 commit comments

Comments
 (0)