Skip to content

Commit bb9aa81

Browse files
authored
Fix restarts with GPUs (#2309)
The ClimaLand Buildkite pipeline started failing with ClimaCore 0.14.31 with the following error:

```julia
Got exception outside of a @test
LoadError: ArgumentError: Illegal conversion of a CUDA.DeviceMemory to a Ptr{Bool}
Stacktrace:
 [1] convert(T::Type{Ptr{Bool}}, mem::CUDA.DeviceMemory)
   @ CUDA /central/scratch/esm/slurm-buildkite/climaland-ci/depot/default/packages/CUDA/oymHm/lib/cudadrv/memory.jl:16
 [2] convert(::Type{Ptr{Bool}}, managed::CUDA.Managed{CUDA.DeviceMemory})
   @ CUDA /central/scratch/esm/slurm-buildkite/climaland-ci/depot/default/packages/CUDA/oymHm/src/memory.jl:583
 [3] unsafe_convert(typ::Type{Ptr{Bool}}, x::CUDA.CuArray{Bool, 4, CUDA.DeviceMemory})
   @ CUDA /central/scratch/esm/slurm-buildkite/climaland-ci/depot/default/packages/CUDA/oymHm/src/array.jl:448
 [4] unsafe_convert(::Type{Ptr{Nothing}}, a::CUDA.CuArray{Bool, 4, CUDA.DeviceMemory})
   @ Base ./pointer.jl:66
 [5] h5a_write(attr_hid::HDF5.Attribute, mem_type_id::HDF5.Datatype, buf::CUDA.CuArray{Bool, 4, CUDA.DeviceMemory})
   @ HDF5.API /central/scratch/esm/slurm-buildkite/climaland-ci/depot/default/packages/HDF5/Z859u/src/api/functions.jl:438
 [6] write_attribute(attr::HDF5.Attribute, memtype::HDF5.Datatype, x::CUDA.CuArray{Bool, 4, CUDA.DeviceMemory})
   @ HDF5 /central/scratch/esm/slurm-buildkite/climaland-ci/depot/default/packages/HDF5/Z859u/src/attributes.jl:154
 [7] write_attribute(parent::HDF5.Dataset, name::String, data::CUDA.CuArray{Bool, 4, CUDA.DeviceMemory}; pv::@kwargs{})
   @ HDF5 /central/scratch/esm/slurm-buildkite/climaland-ci/depot/default/packages/HDF5/Z859u/src/attributes.jl:205
 [8] write_attribute(parent::HDF5.Dataset, name::String, data::CUDA.CuArray{Bool, 4, CUDA.DeviceMemory})
   @ HDF5 /central/scratch/esm/slurm-buildkite/climaland-ci/depot/default/packages/HDF5/Z859u/src/attributes.jl:202
```

I tracked this down to trying to write a `CuArray` as an attribute of an HDF5 file. To fix this, I first move the array to the CPU.
As far as I understand, attributes are meant to be small (`h5py` recommends keeping them under 64 KB), so maybe the best solution would be to write them as datasets instead. This also exposed that InputOutput is not tested on GPU (or not tested with a mask).

- Add CUDA tests for InputOutput
- Set mask to avoid integer division
- Print debug
- Add more debug printing
- Fix mask initialization
- Remove debug prints
1 parent 75e958e commit bb9aa81

File tree

4 files changed

+21
-9
lines changed

4 files changed

+21
-9
lines changed

.buildkite/pipeline.yml

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1179,6 +1179,14 @@ steps:
11791179
key: unit_spectralelement2d
11801180
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_spectralelement2d.jl"
11811181

1182+
- label: "Unit: spectralelement2d"
1183+
key: unit_spectralelement2d_gpu
1184+
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_spectralelement2d.jl"
1185+
env:
1186+
CLIMACOMMS_DEVICE: "CUDA"
1187+
agents:
1188+
slurm_gpus: 1
1189+
11821190
- label: "Unit: hybrid2dbox"
11831191
key: unit_hybrid2dbox
11841192
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_hybrid2dbox.jl"
@@ -1203,6 +1211,14 @@ steps:
12031211
key: unit_hybrid3dcubedsphere_topography
12041212
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_hybrid3dcubedsphere_topography.jl"
12051213

1214+
- label: "Unit: hybrid3dcubedsphere topography"
1215+
key: unit_hybrid3dcubedsphere_topography_gpu
1216+
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_hybrid3dcubedsphere_topography.jl"
1217+
env:
1218+
CLIMACOMMS_DEVICE: "CUDA"
1219+
agents:
1220+
slurm_gpus: 1
1221+
12061222
- label: "Unit: finitedifference"
12071223
key: unit_finitedifference
12081224
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_finitedifference.jl"

ext/cuda/data_layouts_threadblock.jl

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@ end
5959

6060
##### linear partition
6161
@inline function linear_partition(nitems::Integer, n_max_threads::Integer)
62+
@assert nitems > 0
6263
threads = min(nitems, n_max_threads)
6364
blocks = cld(nitems, threads)
6465
return (; threads, blocks)

src/DataLayouts/DataLayouts.jl

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2340,16 +2340,13 @@ function IJHMask(is_active::Union{IJFH, IJHF})
23402340
DA = unionall_type(typeof(parent(is_active)))
23412341
(Ni, Nj, _, _, Nh) = size(is_active)
23422342
Nijh = Ni * Nj * Nh
2343+
N = zeros(Int, 1)
23432344
i_map = zeros(Int, Nijh)
23442345
j_map = zeros(Int, Nijh)
23452346
h_map = zeros(Int, Nijh)
2346-
return IJHMask(
2347-
rebuild(is_active, DA),
2348-
zeros(Int, 1), # N
2349-
DA(i_map),
2350-
DA(j_map),
2351-
DA(h_map),
2352-
)
2347+
mask = IJHMask(rebuild(is_active, DA), N, DA(i_map), DA(j_map), DA(h_map))
2348+
set_mask_maps!(mask)
2349+
return mask
23532350
end
23542351

23552352
"""

src/InputOutput/writers.jl

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -604,7 +604,6 @@ function _write_mpi!(
604604
dxpl_mpio = :collective,
605605
)
606606
dataset[localidx...] = array
607-
write_attribute(dataset, "array", array)
608607
write_attribute(dataset, "data_layout", string(nameof(typeof(values))))
609608
write_attribute(dataset, "data_eltype", string(eltype(values)))
610609
return name
@@ -626,7 +625,6 @@ function _write!(group, values::DataLayouts.AbstractData, name::AbstractString;)
626625
h_dim = DataLayouts.h_dim(DataLayouts.singleton(values))
627626
array = parent(values)
628627
dataset = write_plain_array!(group, array, name)
629-
write_attribute(dataset, "array", array)
630628
write_attribute(dataset, "type", string(nameof(typeof(values))))
631629
write_attribute(dataset, "data_eltype", string(eltype(values)))
632630
return name

0 commit comments

Comments
 (0)