Skip to content

Commit 29afec2

Browse files
authored
Add a utility to download artifacts without a functional driver. (#1539)
1 parent c1ef162 commit 29afec2

File tree

3 files changed

+83
-8
lines changed

3 files changed

+83
-8
lines changed

.buildkite/pipeline.yml

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -146,6 +146,31 @@ steps:
146146
# if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
147147
# timeout_in_minutes: 120
148148

149+
- label: "GPU-less environment"
150+
plugins:
151+
- JuliaCI/julia#v1:
152+
version: 1.6
153+
- JuliaCI/julia-coverage#v1:
154+
codecov: true
155+
dirs:
156+
- src
157+
- lib
158+
- examples
159+
- JuliaCI/julia-test#v1:
160+
run_tests: false
161+
command: |
162+
julia --project -e 'using CUDA;
163+
@assert !CUDA.functional();
164+
CUDA.download_artifacts()'
165+
env:
166+
CUDA_VISIBLE_DEVICES: ''
167+
JULIA_CUDA_VERSION: '11.6'
168+
agents:
169+
queue: "juliagpu"
170+
cuda: "*"
171+
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
172+
timeout_in_minutes: 60
173+
149174
- label: "NNlibCUDA.jl"
150175
plugins:
151176
- JuliaCI/julia#v1:

deps/bindeps.jl

Lines changed: 56 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,22 @@ function toolkit()
106106
end
107107

108108
# workaround @artifact_str eagerness on unsupported platforms by passing a variable
109+
"""
    generic_artifact(id)

Look up the artifact named `id` and return the path of its directory.

Return `nothing` when the artifact cannot be loaded; the failure is only logged at debug
level. Throw an error when the artifact directory exists but contains no entries.
"""
function generic_artifact(id)
    local path
    try
        # `id` is passed as a variable rather than a literal, so the lookup happens when
        # this function runs.
        path = @artifact_str(id)
    catch err
        @debug "Could not load artifact '$id'" exception=(err,catch_backtrace())
        return nothing
    end

    # sometimes artifact downloads fail (e.g. JuliaGPU/CUDA.jl#1003),
    # which leaves an empty directory behind
    isempty(readdir(path)) || return path

    error("The artifact at $path is empty.\nThis is probably caused by a failed download. Remove the directory and try again.")
end
109125
function cuda_artifact(id, cuda::VersionNumber)
110126
platform = Base.BinaryPlatforms.HostPlatform()
111127
platform.tags["cuda"] = "$(cuda.major).$(cuda.minor)"
@@ -604,7 +620,7 @@ function libcutensormg(; throw_error::Bool=true)
604620
# CUTENSORMg additionally depends on CUDARt
605621
libcudart()
606622

607-
if CUDA.CUTENSOR.version() < v"1.4"
623+
if CUTENSOR.version() < v"1.4"
608624
nothing
609625
else
610626
find_cutensor(toolkit(), "cutensorMg", v"1")
@@ -682,6 +698,11 @@ function find_nccl(cuda::LocalToolkit, name, version)
682698
return path
683699
end
684700

701+
702+
#
703+
# CUQUANTUM
704+
#
705+
685706
export libcutensornet, has_cutensornet, libcustatevec, has_custatevec
686707

687708
const __libcutensornet = Ref{Union{String,Nothing}}()
@@ -712,7 +733,7 @@ end
712733
has_custatevec() = libcustatevec(throw_error=false) !== nothing
713734

714735
function find_cutensornet(cuda::ArtifactToolkit, name, version)
715-
artifact_dir = cuda_artifact("cuQuantum", v"0.1.3")
736+
artifact_dir = generic_artifact("cuQuantum")
716737
if artifact_dir === nothing
717738
return nothing
718739
end
@@ -757,3 +778,36 @@ function find_custatevec(cuda::LocalToolkit, name, version)
757778
return path
758779
end
759780

781+
782+
#
783+
# Utilities
784+
#
785+
786+
export download_artifacts
787+
788+
"""
789+
download_artifacts()
790+
791+
Downloads the artifacts you will need to run CUDA.jl. This can be used to pre-populate the
792+
artifacts directory from, e.g., a container build script.
793+
794+
If you want this function to not require a CUDA driver (which wouldn't be available from
795+
said container build environment) be sure to set the `JULIA_CUDA_VERSION` environment
796+
variable to an appropriate CUDA release number. This environment variable should then also
797+
be set at run-time, and should be compatible with the NVIDIA driver that will be available
798+
in that environment.
799+
800+
!!! warning
801+
802+
This function is a temporary hack, and will be removed once CUDA.jl uses JLLs for
803+
downloading and installing artifacts.
804+
"""
805+
function download_artifacts()
    # Resolve the CUDA toolkit artifact first: its release selects which build of the
    # CUDA-version-specific libraries below is requested.
    cuda = find_artifact_cuda()
    # NOTE(review): `find_artifact_cuda` is defined outside this block; this guard assumes
    # it may return `nothing` when no compatible toolkit is available -- confirm.
    cuda === nothing &&
        error("Could not find a compatible CUDA toolkit artifact.")

    # Perform every lookup unconditionally and check the result explicitly. The lookups
    # are what trigger the downloads, so they must not live inside `@assert`, which Julia
    # documents as something that may be disabled.
    for id in ("CUDNN", "CUTENSOR", "NCCL")
        cuda_artifact(id, cuda.release) === nothing &&
            error("Could not download artifact '$id' for CUDA $(cuda.release).")
    end

    # These artifacts are looked up by name only, independent of the toolkit release.
    for id in ("CUDA_compat", "cuQuantum")
        generic_artifact(id) === nothing &&
            error("Could not download artifact '$id'.")
    end

    return nothing
end

lib/cudadrv/error.jl

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,7 @@ function Base.showerror(io::IO, err::CuError)
6363
print(io, "CUDA error (code $(reinterpret(Int32, err.code)), $(err.code))")
6464
end
6565

66-
if err.meta != nothing
66+
if err.meta !== nothing
6767
print(io, "\n")
6868
print(io, err.meta)
6969
end
@@ -82,8 +82,6 @@ Base.show(io::IO, ::MIME"text/plain", err::CuError) = print(io, "CuError($(err.c
8282
end
8383

8484
# outlined functionality to avoid GC frame allocation
85-
@noinline throw_stub_error() =
86-
error("Cannot use the CUDA stub libraries. You either don't have the NVIDIA driver installed, or it is not properly discoverable.")
8785
@noinline function throw_api_error(res)
8886
if res == ERROR_OUT_OF_MEMORY
8987
throw(OutOfGPUMemoryError())
@@ -95,9 +93,7 @@ end
9593
macro check(ex)
9694
quote
9795
res = $(esc(ex))
98-
if res == 0xffffffff
99-
throw_stub_error()
100-
elseif res != SUCCESS
96+
if res != SUCCESS
10197
throw_api_error(res)
10298
end
10399

0 commit comments

Comments
 (0)