Interpolation on GPU #812


Merged: 20 commits, Jun 5, 2025
8 changes: 8 additions & 0 deletions src/general/gpu.jl
@@ -29,3 +29,11 @@ end
@inline function PointNeighbors.parallel_foreach(f, iterator, semi::Semidiscretization)
PointNeighbors.parallel_foreach(f, iterator, semi.parallelization_backend)
end

function allocate(backend::KernelAbstractions.Backend, ELTYPE, size)
return KernelAbstractions.allocate(backend, ELTYPE, size)
end

function allocate(backend, ELTYPE, size)
return Array{ELTYPE, length(size)}(undef, size)
end
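
The new `allocate` helper gives the rest of the code a single, backend-agnostic entry point: for a `KernelAbstractions.Backend` it returns a device array, otherwise a plain `Array`. Below is a minimal CPU-only sketch of how this dispatch behaves; the `SerialBackend` marker type is hypothetical and only stands in for any non-KernelAbstractions backend.

```julia
using KernelAbstractions

struct SerialBackend end  # hypothetical stand-in for a CPU backend

allocate(backend::KernelAbstractions.Backend, ELTYPE, size) =
    KernelAbstractions.allocate(backend, ELTYPE, size)
allocate(backend, ELTYPE, size) = Array{ELTYPE, length(size)}(undef, size)

v = allocate(SerialBackend(), Float32, 10)        # Vector{Float32} (uninitialized)
m = allocate(SerialBackend(), Float32, (2, 10))   # Matrix{Float32} (uninitialized)

# With a GPU backend, e.g. `CUDABackend()` from CUDA.jl, the same call
# would return a device array instead:
# v_gpu = allocate(CUDABackend(), Float32, 10)
```

Note that the result is uninitialized in both cases, which is why the callers below follow each `allocate` with `set_zero!`.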
126 changes: 96 additions & 30 deletions src/general/interpolation.jl
@@ -504,20 +504,31 @@ function process_neighborhood_searches(semi, u_ode, ref_system, smoothing_length
end

# Interpolate points with given neighborhood searches
@inline function interpolate_points(point_coords, semi, ref_system, v_ode, u_ode,
@inline function interpolate_points(point_coords_, semi, ref_system, v_ode, u_ode,
neighborhood_searches;
smoothing_length=initial_smoothing_length(ref_system),
cut_off_bnd=true, clip_negative_pressure=false)
(; parallelization_backend) = semi

if semi.parallelization_backend isa KernelAbstractions.Backend
point_coords = Adapt.adapt(semi.parallelization_backend, point_coords_)
else
point_coords = point_coords_
end

n_points = size(point_coords, 2)
ELTYPE = eltype(point_coords)
computed_density = zeros(ELTYPE, n_points)
other_density = zeros(ELTYPE, n_points)
neighbor_count = zeros(Int, n_points)
shepard_coefficient = zeros(ELTYPE, n_points)
computed_density = allocate(semi.parallelization_backend, ELTYPE, n_points)
other_density = allocate(semi.parallelization_backend, ELTYPE, n_points)
shepard_coefficient = allocate(semi.parallelization_backend, ELTYPE, n_points)
neighbor_count = allocate(semi.parallelization_backend, Int, n_points)

cache = create_cache_interpolation(ref_system, n_points)
set_zero!(computed_density)
set_zero!(other_density)
set_zero!(shepard_coefficient)
set_zero!(neighbor_count)

cache = create_cache_interpolation(ref_system, n_points, semi)

ref_id = system_indices(ref_system, semi)
ref_smoothing_kernel = ref_system.smoothing_kernel
@@ -565,42 +576,52 @@ end
computed_density[point] = NaN
neighbor_count[point] = 0

foreach(cache) do field
if field isa AbstractVector
field[point] = NaN
else
field[:, point] .= NaN
end
# We need to convert the `NamedTuple` to a `Tuple` for GPU compatibility
foreach(Tuple(cache)) do field
set_nan!(field, point)
end
else
# Normalize all quantities by the shepard coefficient
foreach(cache) do field
if field isa AbstractVector
field[point] /= shepard_coefficient[point]
else
field[:, point] ./= shepard_coefficient[point]
end
# Normalize all quantities by the shepard coefficient.
# We need to convert the `NamedTuple` to a `Tuple` for GPU compatibility.
foreach(Tuple(cache)) do field
divide_by_shepard_coefficient!(field, shepard_coefficient, point)
end
end
end

return (; computed_density=computed_density, neighbor_count, point_coords, cache...)
return (; computed_density=computed_density, point_coords=point_coords,
neighbor_count=neighbor_count, cache...)
end

@inline function create_cache_interpolation(ref_system::FluidSystem, n_points)
velocity = zeros(eltype(ref_system), ndims(ref_system), n_points)
pressure = zeros(eltype(ref_system), n_points)
density = zeros(eltype(ref_system), n_points)
@inline function create_cache_interpolation(ref_system::FluidSystem, n_points, semi)
(; parallelization_backend) = semi

velocity = allocate(parallelization_backend, eltype(ref_system),
(ndims(ref_system), n_points))
pressure = allocate(parallelization_backend, eltype(ref_system), n_points)
density = allocate(parallelization_backend, eltype(ref_system), n_points)

set_zero!(velocity)
set_zero!(pressure)
set_zero!(density)

return (; velocity, pressure, density)
end

@inline function create_cache_interpolation(ref_system::SolidSystem, n_points)
velocity = zeros(eltype(ref_system), ndims(ref_system), n_points)
jacobian = zeros(eltype(ref_system), n_points)
von_mises_stress = zeros(eltype(ref_system), n_points)
cauchy_stress = zeros(eltype(ref_system), ndims(ref_system), ndims(ref_system),
n_points)
@inline function create_cache_interpolation(ref_system::SolidSystem, n_points, semi)
(; parallelization_backend) = semi

velocity = allocate(parallelization_backend, eltype(ref_system),
(ndims(ref_system), n_points))
jacobian = allocate(parallelization_backend, eltype(ref_system), n_points)
von_mises_stress = allocate(parallelization_backend, eltype(ref_system), n_points)
cauchy_stress = allocate(parallelization_backend, eltype(ref_system),
(ndims(ref_system), ndims(ref_system), n_points))

set_zero!(velocity)
set_zero!(jacobian)
set_zero!(von_mises_stress)
set_zero!(cauchy_stress)

return (; velocity, jacobian, von_mises_stress, cauchy_stress)
end
@@ -643,3 +664,48 @@ end

return cache
end

function set_nan!(field::AbstractVector, point)
field[point] = NaN

return field
end

function set_nan!(field::AbstractMatrix, point)
for dim in axes(field, 1)
field[dim, point] = NaN
end

return field
end

function set_nan!(field::AbstractArray{T, 3}, point) where {T}
for j in axes(field, 2), i in axes(field, 1)
field[i, j, point] = NaN
end

return field
end

function divide_by_shepard_coefficient!(field::AbstractVector, shepard_coefficient, point)
field[point] /= shepard_coefficient[point]

return field
end

function divide_by_shepard_coefficient!(field::AbstractMatrix, shepard_coefficient, point)
for dim in axes(field, 1)
field[dim, point] /= shepard_coefficient[point]
end

return field
end

function divide_by_shepard_coefficient!(field::AbstractArray{T, 3}, shepard_coefficient,
point) where {T}
for j in axes(field, 2), i in axes(field, 1)
field[i, j, point] /= shepard_coefficient[point]
end

return field
end
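
Taken together, these helpers replace the earlier `if field isa AbstractVector ... else ... end` branches with one method per array dimensionality, and the cache `NamedTuple` is converted to a `Tuple` before `foreach` for GPU compatibility. A CPU-only sketch of that traversal pattern (the `set_nan!` methods here just mirror the vector and matrix helpers above; the cache layout matches the fluid case):

```julia
set_nan!(field::AbstractVector, point) = (field[point] = NaN; field)

function set_nan!(field::AbstractMatrix, point)
    for dim in axes(field, 1)
        field[dim, point] = NaN
    end
    return field
end

n_points = 3
cache = (; velocity=zeros(2, n_points), pressure=zeros(n_points),
         density=zeros(n_points))

# `Tuple(cache)` drops the field names; iterating a plain tuple of arrays
# is what keeps the closure GPU-compatible.
foreach(Tuple(cache)) do field
    set_nan!(field, 2)   # interpolation point 2 found no neighbors
end

cache.velocity   # column 2 is now NaN
```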
17 changes: 8 additions & 9 deletions src/general/semidiscretization.jl
@@ -282,19 +282,18 @@ function semidiscretize(semi, tspan; reset_threads=true)
sizes_u = (u_nvariables(system) * n_moving_particles(system) for system in systems)
sizes_v = (v_nvariables(system) * n_moving_particles(system) for system in systems)

# Use either the specified backend, e.g., `CUDABackend` or `MetalBackend`,
# or CPU vectors for all CPU backends.
u0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_u))
v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_v))

if semi.parallelization_backend isa KernelAbstractions.Backend
# Use the specified backend, e.g., `CUDABackend` or `MetalBackend`
u0_ode = KernelAbstractions.allocate(semi.parallelization_backend, ELTYPE,
sum(sizes_u))
v0_ode = KernelAbstractions.allocate(semi.parallelization_backend, ELTYPE,
sum(sizes_v))
u0_ode = u0_ode_
v0_ode = v0_ode_
else
# Use CPU vectors for all CPU backends.
# These are wrapped in `ThreadedBroadcastArray`s
# CPU vectors are wrapped in `ThreadedBroadcastArray`s
# to make broadcasting (which is done by OrdinaryDiffEq.jl) multithreaded.
# See https://github.com/trixi-framework/TrixiParticles.jl/pull/722 for more details.
u0_ode_ = Vector{ELTYPE}(undef, sum(sizes_u))
v0_ode_ = Vector{ELTYPE}(undef, sum(sizes_v))
u0_ode = ThreadedBroadcastArray(u0_ode_;
parallelization_backend=semi.parallelization_backend)
v0_ode = ThreadedBroadcastArray(v0_ode_;
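Since the old and new lines are interleaved in the view above, here is the post-PR allocation branch of `semidiscretize` restated in one piece (reconstructed from this diff, not runnable on its own):

```julia
# Use either the specified backend, e.g., `CUDABackend` or `MetalBackend`,
# or CPU vectors for all CPU backends.
u0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_u))
v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_v))

if semi.parallelization_backend isa KernelAbstractions.Backend
    # Device arrays can be used directly
    u0_ode = u0_ode_
    v0_ode = v0_ode_
else
    # CPU vectors are wrapped in `ThreadedBroadcastArray`s to make broadcasting
    # (which is done by OrdinaryDiffEq.jl) multithreaded.
    u0_ode = ThreadedBroadcastArray(u0_ode_;
                                    parallelization_backend=semi.parallelization_backend)
    v0_ode = ThreadedBroadcastArray(v0_ode_;
                                    parallelization_backend=semi.parallelization_backend)
end
```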
65 changes: 65 additions & 0 deletions test/examples/gpu.jl
@@ -391,4 +391,69 @@ end
@test backend == Main.parallelization_backend
end
end

@testset verbose=true "Postprocessing $TRIXIPARTICLES_TEST_" begin
@testset verbose=true "Interpolation" begin
# Run the dam break example to get a solution
trixi_include_changeprecision(Float32, @__MODULE__,
joinpath(examples_dir(), "fluid",
"dam_break_2d_gpu.jl");
fluid_particle_spacing=0.05f0,
tspan=(0.0f0, 0.01f0),
parallelization_backend=Main.parallelization_backend)

semi_new = sol.prob.p

@testset verbose=true "Line" begin
# Interpolation parameters
n_interpolation_points = 10
start_point = Float32[0.5, 0.0]
end_point = Float32[0.5, 0.5]

result = interpolate_line(start_point, end_point, n_interpolation_points,
semi_new, semi_new.systems[1], sol;
cut_off_bnd=false)

@test isapprox(Array(result.computed_density),
Float32[500.33255, 893.09766, 997.7032, 1001.14355, 1001.234,
1001.0098, 1000.4352, 999.7572, 999.1139, 989.6319])

@test isapprox(Array(result.density),
Float32[1002.3152, 1002.19653, 1001.99915, 1001.7685, 1001.5382,
1001.3093, 1001.0836, 1000.8649, 1000.635, 1000.4053])

@test isapprox(Array(result.pressure),
Float32[5450.902, 5171.2856, 4706.551, 4163.9185, 3621.5042,
3082.6948, 2551.5725, 2036.1208, 1494.8608, 954.14355])
end

@testset verbose=true "Plane" begin
interpolation_start = Float32[0.0, 0.0]
interpolation_end = Float32[1.0, 1.0]
resolution = 0.4f0

result = interpolate_plane_2d(interpolation_start, interpolation_end,
resolution, semi_new, semi_new.systems[1], sol;
cut_off_bnd=false)

@test isapprox(Array(result.computed_density),
Float32[250.18625, 500.34482, 499.77225, 254.3632, 499.58026,
999.1413, 998.6351, 503.0122])

@test isapprox(Array(result.density),
Float32[1002.34467, 1002.3365, 1001.5021, 999.7109, 1000.84863,
1000.8373, 1000.3423, 1000.20734])

@test isapprox(Array(result.pressure),
Float32[5520.0513, 5501.1846, 3536.2256, -680.5194, 1997.7814,
1971.0717, 805.8584, 488.4068])
end
end
end
end
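
For reference, the user-facing workflow exercised by these tests: interpolation accepts host-side point coordinates, adapts them to the active backend internally, and returns device arrays that are brought back to the host with `Array(...)`. A hedged sketch, assuming `semi` and `sol` come from a simulation run with a GPU backend such as `CUDABackend()`:

```julia
# Interpolate along a vertical line through the dam-break fluid
result = interpolate_line(Float32[0.5, 0.0], Float32[0.5, 0.5], 10,
                          semi, semi.systems[1], sol; cut_off_bnd=false)

# Result fields live on the device; convert before plotting or testing
density_host = Array(result.density)
pressure_host = Array(result.pressure)
```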