diff --git a/src/general/gpu.jl b/src/general/gpu.jl
index 87b5f95aa..9898da475 100644
--- a/src/general/gpu.jl
+++ b/src/general/gpu.jl
@@ -29,3 +29,11 @@ end
 @inline function PointNeighbors.parallel_foreach(f, iterator, semi::Semidiscretization)
     PointNeighbors.parallel_foreach(f, iterator, semi.parallelization_backend)
 end
+
+function allocate(backend::KernelAbstractions.Backend, ELTYPE, size)
+    return KernelAbstractions.allocate(backend, ELTYPE, size)
+end
+
+function allocate(backend, ELTYPE, size)
+    return Array{ELTYPE, length(size)}(undef, size)
+end
diff --git a/src/general/interpolation.jl b/src/general/interpolation.jl
index a17cddddb..9b0ac40d7 100644
--- a/src/general/interpolation.jl
+++ b/src/general/interpolation.jl
@@ -504,20 +504,31 @@ function process_neighborhood_searches(semi, u_ode, ref_system, smoothing_length
 end
 
 # Interpolate points with given neighborhood searches
-@inline function interpolate_points(point_coords, semi, ref_system, v_ode, u_ode,
+@inline function interpolate_points(point_coords_, semi, ref_system, v_ode, u_ode,
                                     neighborhood_searches;
                                     smoothing_length=initial_smoothing_length(ref_system),
                                     cut_off_bnd=true, clip_negative_pressure=false)
     (; parallelization_backend) = semi
+    if semi.parallelization_backend isa KernelAbstractions.Backend
+        point_coords = Adapt.adapt(semi.parallelization_backend, point_coords_)
+    else
+        point_coords = point_coords_
+    end
+
     n_points = size(point_coords, 2)
     ELTYPE = eltype(point_coords)
 
-    computed_density = zeros(ELTYPE, n_points)
-    other_density = zeros(ELTYPE, n_points)
-    neighbor_count = zeros(Int, n_points)
-    shepard_coefficient = zeros(ELTYPE, n_points)
+    computed_density = allocate(semi.parallelization_backend, ELTYPE, n_points)
+    other_density = allocate(semi.parallelization_backend, ELTYPE, n_points)
+    shepard_coefficient = allocate(semi.parallelization_backend, ELTYPE, n_points)
+    neighbor_count = allocate(semi.parallelization_backend, Int, n_points)
 
-    cache = create_cache_interpolation(ref_system, n_points)
+    set_zero!(computed_density)
+    set_zero!(other_density)
+    set_zero!(shepard_coefficient)
+    set_zero!(neighbor_count)
+
+    cache = create_cache_interpolation(ref_system, n_points, semi)
 
     ref_id = system_indices(ref_system, semi)
     ref_smoothing_kernel = ref_system.smoothing_kernel
@@ -565,42 +576,52 @@ end
             computed_density[point] = NaN
             neighbor_count[point] = 0
 
-            foreach(cache) do field
-                if field isa AbstractVector
-                    field[point] = NaN
-                else
-                    field[:, point] .= NaN
-                end
+            # We need to convert the `NamedTuple` to a `Tuple` for GPU compatibility
+            foreach(Tuple(cache)) do field
+                set_nan!(field, point)
             end
         else
-            # Normalize all quantities by the shepard coefficient
-            foreach(cache) do field
-                if field isa AbstractVector
-                    field[point] /= shepard_coefficient[point]
-                else
-                    field[:, point] ./= shepard_coefficient[point]
-                end
+            # Normalize all quantities by the shepard coefficient.
+            # We need to convert the `NamedTuple` to a `Tuple` for GPU compatibility.
+            foreach(Tuple(cache)) do field
+                divide_by_shepard_coefficient!(field, shepard_coefficient, point)
             end
         end
     end
 
-    return (; computed_density=computed_density, neighbor_count, point_coords, cache...)
+    return (; computed_density=computed_density, point_coords=point_coords,
+            neighbor_count=neighbor_count, cache...)
 end
 
-@inline function create_cache_interpolation(ref_system::FluidSystem, n_points)
-    velocity = zeros(eltype(ref_system), ndims(ref_system), n_points)
-    pressure = zeros(eltype(ref_system), n_points)
-    density = zeros(eltype(ref_system), n_points)
+@inline function create_cache_interpolation(ref_system::FluidSystem, n_points, semi)
+    (; parallelization_backend) = semi
+
+    velocity = allocate(parallelization_backend, eltype(ref_system),
+                        (ndims(ref_system), n_points))
+    pressure = allocate(parallelization_backend, eltype(ref_system), n_points)
+    density = allocate(parallelization_backend, eltype(ref_system), n_points)
+
+    set_zero!(velocity)
+    set_zero!(pressure)
+    set_zero!(density)
 
     return (; velocity, pressure, density)
 end
 
-@inline function create_cache_interpolation(ref_system::SolidSystem, n_points)
-    velocity = zeros(eltype(ref_system), ndims(ref_system), n_points)
-    jacobian = zeros(eltype(ref_system), n_points)
-    von_mises_stress = zeros(eltype(ref_system), n_points)
-    cauchy_stress = zeros(eltype(ref_system), ndims(ref_system), ndims(ref_system),
-                          n_points)
+@inline function create_cache_interpolation(ref_system::SolidSystem, n_points, semi)
+    (; parallelization_backend) = semi
+
+    velocity = allocate(parallelization_backend, eltype(ref_system),
+                        (ndims(ref_system), n_points))
+    jacobian = allocate(parallelization_backend, eltype(ref_system), n_points)
+    von_mises_stress = allocate(parallelization_backend, eltype(ref_system), n_points)
+    cauchy_stress = allocate(parallelization_backend, eltype(ref_system),
+                             (ndims(ref_system), ndims(ref_system), n_points))
+
+    set_zero!(velocity)
+    set_zero!(jacobian)
+    set_zero!(von_mises_stress)
+    set_zero!(cauchy_stress)
 
     return (; velocity, jacobian, von_mises_stress, cauchy_stress)
 end
@@ -643,3 +664,48 @@ end
 
     return cache
 end
+
+function set_nan!(field::AbstractVector, point)
+    field[point] = NaN
+
+    return field
+end
+
+function set_nan!(field::AbstractMatrix, point)
+    for dim in axes(field, 1)
+        field[dim, point] = NaN
+    end
+
+    return field
+end
+
+function set_nan!(field::AbstractArray{T, 3}, point) where {T}
+    for j in axes(field, 2), i in axes(field, 1)
+        field[i, j, point] = NaN
+    end
+
+    return field
+end
+
+function divide_by_shepard_coefficient!(field::AbstractVector, shepard_coefficient, point)
+    field[point] /= shepard_coefficient[point]
+
+    return field
+end
+
+function divide_by_shepard_coefficient!(field::AbstractMatrix, shepard_coefficient, point)
+    for dim in axes(field, 1)
+        field[dim, point] /= shepard_coefficient[point]
+    end
+
+    return field
+end
+
+function divide_by_shepard_coefficient!(field::AbstractArray{T, 3}, shepard_coefficient,
+                                        point) where {T}
+    for j in axes(field, 2), i in axes(field, 1)
+        field[i, j, point] /= shepard_coefficient[point]
+    end
+
+    return field
+end
diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl
index 5b7eaa530..0bc2cef57 100644
--- a/src/general/semidiscretization.jl
+++ b/src/general/semidiscretization.jl
@@ -282,19 +282,18 @@ function semidiscretize(semi, tspan; reset_threads=true)
     sizes_u = (u_nvariables(system) * n_moving_particles(system) for system in systems)
     sizes_v = (v_nvariables(system) * n_moving_particles(system) for system in systems)
 
+    # Use either the specified backend, e.g., `CUDABackend` or `MetalBackend`, or
+    # use CPU vectors for all CPU backends.
+    u0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_u))
+    v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_v))
+
     if semi.parallelization_backend isa KernelAbstractions.Backend
-        # Use the specified backend, e.g., `CUDABackend` or `MetalBackend`
-        u0_ode = KernelAbstractions.allocate(semi.parallelization_backend, ELTYPE,
-                                             sum(sizes_u))
-        v0_ode = KernelAbstractions.allocate(semi.parallelization_backend, ELTYPE,
-                                             sum(sizes_v))
+        u0_ode = u0_ode_
+        v0_ode = v0_ode_
     else
-        # Use CPU vectors for all CPU backends.
-        # These are wrapped in `ThreadedBroadcastArray`s
+        # CPU vectors are wrapped in `ThreadedBroadcastArray`s
         # to make broadcasting (which is done by OrdinaryDiffEq.jl) multithreaded.
         # See https://github.com/trixi-framework/TrixiParticles.jl/pull/722 for more details.
-        u0_ode_ = Vector{ELTYPE}(undef, sum(sizes_u))
-        v0_ode_ = Vector{ELTYPE}(undef, sum(sizes_v))
         u0_ode = ThreadedBroadcastArray(u0_ode_;
                                         parallelization_backend=semi.parallelization_backend)
         v0_ode = ThreadedBroadcastArray(v0_ode_;
diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl
index 7aca5f83d..9d551a527 100644
--- a/test/examples/gpu.jl
+++ b/test/examples/gpu.jl
@@ -391,4 +391,69 @@ end
            @test backend == Main.parallelization_backend
        end
    end
+
+    @testset verbose=true "Postprocessing $TRIXIPARTICLES_TEST_" begin
+        @testset verbose=true "Interpolation" begin
+            # Run the dam break example to get a solution
+            trixi_include_changeprecision(Float32, @__MODULE__,
+                                          joinpath(examples_dir(), "fluid",
+                                                   "dam_break_2d_gpu.jl");
+                                          fluid_particle_spacing=0.05f0,
+                                          tspan=(0.0f0, 0.01f0),
+                                          parallelization_backend=Main.parallelization_backend)
+
+            semi_new = sol.prob.p
+
+            @testset verbose=true "Line" begin
+                # Interpolation parameters
+                n_interpolation_points = 10
+                start_point = Float32[0.5, 0.0]
+                end_point = Float32[0.5, 0.5]
+
+                result = interpolate_line(start_point, end_point, n_interpolation_points,
+                                          semi_new, semi_new.systems[1], sol;
+                                          cut_off_bnd=false)
+
+                @test isapprox(Array(result.computed_density),
+                               Float32[500.33255, 893.09766, 997.7032, 1001.14355, 1001.234,
+                                       1001.0098, 1000.4352, 999.7572, 999.1139, 989.6319])
+
+                @test isapprox(Array(result.density),
+                               Float32[1002.3152, 1002.19653, 1001.99915, 1001.7685,
+                                       1001.5382,
+                                       1001.3093, 1001.0836, 1000.8649, 1000.635,
+                                       1000.4053])
+
+                @test isapprox(Array(result.pressure),
+                               Float32[5450.902, 5171.2856, 4706.551, 4163.9185, 3621.5042,
+                                       3082.6948, 2551.5725, 2036.1208, 1494.8608,
+                                       954.14355])
+            end
+
+            @testset verbose=true "Plane" begin
+                interpolation_start = Float32[0.0, 0.0]
+                interpolation_end = Float32[1.0, 1.0]
+                resolution = 0.4f0
+
+                result = interpolate_plane_2d(interpolation_start, interpolation_end,
+                                              resolution, semi_new, semi_new.systems[1],
+                                              sol;
+                                              cut_off_bnd=false)
+
+                @test isapprox(Array(result.computed_density),
+                               Float32[250.18625, 500.34482, 499.77225, 254.3632, 499.58026,
+                                       999.1413, 998.6351, 503.0122])
+
+                @test isapprox(Array(result.density),
+                               Float32[1002.34467, 1002.3365, 1001.5021, 999.7109,
+                                       1000.84863,
+                                       1000.8373, 1000.3423, 1000.20734])
+
+                @test isapprox(Array(result.pressure),
+                               Float32[5520.0513, 5501.1846, 3536.2256, -680.5194,
+                                       1997.7814,
+                                       1971.0717, 805.8584, 488.4068])
+            end
+        end
+    end
 end
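
Usage sketch (not part of the diff): with the allocation changes above, the interpolation buffers live on the same backend as the simulation data, so interpolation can be called directly on a GPU solution. The snippet below is a minimal, non-authoritative example assuming CUDA.jl and a CUDA-capable device; it mirrors the new test by running the dam break example with `parallelization_backend=CUDABackend()` and then interpolating along a line. The local names `semi_gpu`, `density`, and `pressure` are illustrative only.

# Minimal sketch (assumes CUDA.jl and a CUDA-capable GPU; mirrors the test above)
using TrixiParticles, CUDA

# Run the 2D dam break example on the GPU backend
trixi_include(joinpath(examples_dir(), "fluid", "dam_break_2d_gpu.jl");
              fluid_particle_spacing=0.05, tspan=(0.0, 0.01),
              parallelization_backend=CUDABackend())

# The interpolation buffers are now allocated on the same backend,
# so the interpolation itself runs on the GPU
semi_gpu = sol.prob.p
result = interpolate_line([0.5, 0.0], [0.5, 0.5], 10,
                          semi_gpu, semi_gpu.systems[1], sol; cut_off_bnd=false)

# Copy device arrays back to the host for inspection
density = Array(result.density)
pressure = Array(result.pressure)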