Skip to content

Commit 6dbe186

Browse files
JackAKirkSeanst98ProGTXsteffenlarsen
authored
[SYCL][DOC][bindless][cuda] gather_image extension doc and impl (#17322)
This PR presents a portable `gather_image` API for the bindless images extension, with an initial cuda implementation for 2D images only. Cubemap support is also possible but not yet added at this stage, since - it is unclear whether this can map to the spirv cube case. - l0 doesn't yet even support cubemaps anyway For 2D images the mapping to `__spirv_SampledImageGather` is straightforward and unambiguous. For complete details see the extension documentation in this PR. --------- Signed-off-by: JackAKirk <jack.kirk@codeplay.com> Co-authored-by: Sean Stirling <sean.a.stirling@gmail.com> Co-authored-by: Peter Žužek <peterzuzek@gmail.com> Co-authored-by: Steffen Larsen <steffen.larsen@intel.com>
1 parent 110e7bb commit 6dbe186

File tree

23 files changed

+502
-12
lines changed

23 files changed

+502
-12
lines changed

libclc/libspirv/lib/ptx-nvidiacl/images/image.cl

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2727,6 +2727,69 @@ _CLC_DEFINE_SAMPLEDIMAGE_BINDLESS_FETCH_BUILTIN(half4, 1, Dv4_DF16_, v4f16, i, i
27272727
_CLC_DEFINE_SAMPLEDIMAGE_BINDLESS_FETCH_BUILTIN(half4, 2, Dv4_DF16_, v4f16, Dv2_i, int2 coord, coord.x, coord.y)
27282728
_CLC_DEFINE_SAMPLEDIMAGE_BINDLESS_FETCH_BUILTIN(half4, 3, Dv4_DF16_, v4f16, Dv3_i, int4 coord, coord.x, coord.y, coord.z)
27292729

2730+
float4
2731+
__nvvm_tld4_r_2d_v4f32_f32(long, float,
2732+
float) __asm("__clc_llvm_nvvm_tld4_r_2d_v4f32_f32");
2733+
float4
2734+
__nvvm_tld4_g_2d_v4f32_f32(long, float,
2735+
float) __asm("__clc_llvm_nvvm_tld4_g_2d_v4f32_f32");
2736+
float4
2737+
__nvvm_tld4_b_2d_v4f32_f32(long, float,
2738+
float) __asm("__clc_llvm_nvvm_tld4_b_2d_v4f32_f32");
2739+
float4
2740+
__nvvm_tld4_a_2d_v4f32_f32(long, float,
2741+
float) __asm("__clc_llvm_nvvm_tld4_a_2d_v4f32_f32");
2742+
2743+
int4 __nvvm_tld4_r_2d_v4s32_f32(long, float, float) __asm(
2744+
"__clc_llvm_nvvm_tld4_r_2d_v4s32_f32");
2745+
int4 __nvvm_tld4_g_2d_v4s32_f32(long, float, float) __asm(
2746+
"__clc_llvm_nvvm_tld4_g_2d_v4s32_f32");
2747+
int4 __nvvm_tld4_b_2d_v4s32_f32(long, float, float) __asm(
2748+
"__clc_llvm_nvvm_tld4_b_2d_v4s32_f32");
2749+
int4 __nvvm_tld4_a_2d_v4s32_f32(long, float, float) __asm(
2750+
"__clc_llvm_nvvm_tld4_a_2d_v4s32_f32");
2751+
2752+
uint4 __nvvm_tld4_r_2d_v4u32_f32(long, float, float) __asm(
2753+
"__clc_llvm_nvvm_tld4_r_2d_v4u32_f32");
2754+
uint4 __nvvm_tld4_g_2d_v4u32_f32(long, float, float) __asm(
2755+
"__clc_llvm_nvvm_tld4_g_2d_v4u32_f32");
2756+
uint4 __nvvm_tld4_b_2d_v4u32_f32(long, float, float) __asm(
2757+
"__clc_llvm_nvvm_tld4_b_2d_v4u32_f32");
2758+
uint4 __nvvm_tld4_a_2d_v4u32_f32(long, float, float) __asm(
2759+
"__clc_llvm_nvvm_tld4_a_2d_v4u32_f32");
2760+
2761+
#define _CLC_DEFINE_SAMPLEDIMAGE_BINDLESS_GATHER_BUILTIN( \
2762+
elem_t, elem_t_mangled, vec_size, type) \
2763+
_CLC_DEF elem_t MANGLE_FUNC_IMG_HANDLE( \
2764+
26, __spirv_SampledImageGather, I##elem_t_mangled, \
2765+
Dv2_f##ET_T0_T1_j)(ulong imageHandle, float2 coord, unsigned i) { \
2766+
elem_t result; \
2767+
switch (i) { \
2768+
case 0: \
2769+
result = __nvvm_tld4_r_2d_v4##type##_f32(imageHandle, coord.x, coord.y); \
2770+
break; \
2771+
case 1: \
2772+
result = __nvvm_tld4_g_2d_v4##type##_f32(imageHandle, coord.x, coord.y); \
2773+
break; \
2774+
case 2: \
2775+
result = __nvvm_tld4_b_2d_v4##type##_f32(imageHandle, coord.x, coord.y); \
2776+
break; \
2777+
case 3: \
2778+
result = __nvvm_tld4_a_2d_v4##type##_f32(imageHandle, coord.x, coord.y); \
2779+
break; \
2780+
default: \
2781+
__builtin_trap(); \
2782+
__builtin_unreachable(); \
2783+
} \
2784+
return result; \
2785+
}
2786+
2787+
_CLC_DEFINE_SAMPLEDIMAGE_BINDLESS_GATHER_BUILTIN(float4, Dv4_f, v4f32, f32)
2788+
_CLC_DEFINE_SAMPLEDIMAGE_BINDLESS_GATHER_BUILTIN(int4, Dv4_i, v4i32, s32)
2789+
_CLC_DEFINE_SAMPLEDIMAGE_BINDLESS_GATHER_BUILTIN(uint4, Dv4_j, v4j32, u32)
2790+
2791+
#undef _CLC_DEFINE_SAMPLEDIMAGE_BINDLESS_GATHER_BUILTIN
2792+
27302793
// <--- MIPMAP --->
27312794

27322795
// Define functions to call intrinsic

libclc/libspirv/lib/ptx-nvidiacl/images/image_helpers.ll

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,103 @@ entry:
345345
ret <4 x float> %1
346346
}
347347

348+
; <--- TEXTURE GATHER --->
349+
declare {float,float,float,float} @llvm.nvvm.tld4.unified.r.2d.v4f32.f32(i64, float, float)
350+
define <4 x float> @__clc_llvm_nvvm_tld4_r_2d_v4f32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
351+
entry:
352+
%0 = tail call {float,float,float,float} @llvm.nvvm.tld4.unified.r.2d.v4f32.f32(i64 %img, float %x, float %y);
353+
%1 = tail call <4 x float>@__clc_structf32_to_vector({float,float,float,float} %0)
354+
ret <4 x float> %1
355+
}
356+
357+
declare {float,float,float,float} @llvm.nvvm.tld4.unified.g.2d.v4f32.f32(i64, float, float)
358+
define <4 x float> @__clc_llvm_nvvm_tld4_g_2d_v4f32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
359+
entry:
360+
%0 = tail call {float,float,float,float} @llvm.nvvm.tld4.unified.g.2d.v4f32.f32(i64 %img, float %x, float %y);
361+
%1 = tail call <4 x float>@__clc_structf32_to_vector({float,float,float,float} %0)
362+
ret <4 x float> %1
363+
}
364+
365+
declare {float,float,float,float} @llvm.nvvm.tld4.unified.b.2d.v4f32.f32(i64, float, float)
366+
define <4 x float> @__clc_llvm_nvvm_tld4_b_2d_v4f32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
367+
entry:
368+
%0 = tail call {float,float,float,float} @llvm.nvvm.tld4.unified.b.2d.v4f32.f32(i64 %img, float %x, float %y);
369+
%1 = tail call <4 x float>@__clc_structf32_to_vector({float,float,float,float} %0)
370+
ret <4 x float> %1
371+
}
372+
373+
declare {float,float,float,float} @llvm.nvvm.tld4.unified.a.2d.v4f32.f32(i64, float, float)
374+
define <4 x float> @__clc_llvm_nvvm_tld4_a_2d_v4f32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
375+
entry:
376+
%0 = tail call {float,float,float,float} @llvm.nvvm.tld4.unified.a.2d.v4f32.f32(i64 %img, float %x, float %y);
377+
%1 = tail call <4 x float>@__clc_structf32_to_vector({float,float,float,float} %0)
378+
ret <4 x float> %1
379+
}
380+
381+
declare {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.r.2d.v4s32.f32(i64, float, float)
382+
define <4 x i32> @__clc_llvm_nvvm_tld4_r_2d_v4s32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
383+
entry:
384+
%0 = tail call {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.r.2d.v4s32.f32(i64 %img, float %x, float %y);
385+
%1 = tail call <4 x i32>@__clc_struct32_to_vector({i32,i32,i32,i32} %0)
386+
ret <4 x i32> %1
387+
}
388+
389+
declare {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.g.2d.v4s32.f32(i64, float, float)
390+
define <4 x i32> @__clc_llvm_nvvm_tld4_g_2d_v4s32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
391+
entry:
392+
%0 = tail call {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.g.2d.v4s32.f32(i64 %img, float %x, float %y);
393+
%1 = tail call <4 x i32>@__clc_struct32_to_vector({i32,i32,i32,i32} %0)
394+
ret <4 x i32> %1
395+
}
396+
397+
declare {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.b.2d.v4s32.f32(i64, float, float)
398+
define <4 x i32> @__clc_llvm_nvvm_tld4_b_2d_v4s32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
399+
entry:
400+
%0 = tail call {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.b.2d.v4s32.f32(i64 %img, float %x, float %y);
401+
%1 = tail call <4 x i32>@__clc_struct32_to_vector({i32,i32,i32,i32} %0)
402+
ret <4 x i32> %1
403+
}
404+
405+
declare {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.a.2d.v4s32.f32(i64, float, float)
406+
define <4 x i32> @__clc_llvm_nvvm_tld4_a_2d_v4s32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
407+
entry:
408+
%0 = tail call {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.a.2d.v4s32.f32(i64 %img, float %x, float %y);
409+
%1 = tail call <4 x i32>@__clc_struct32_to_vector({i32,i32,i32,i32} %0)
410+
ret <4 x i32> %1
411+
}
412+
413+
declare {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.r.2d.v4u32.f32(i64, float, float)
414+
define <4 x i32> @__clc_llvm_nvvm_tld4_r_2d_v4u32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
415+
entry:
416+
%0 = tail call {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.r.2d.v4u32.f32(i64 %img, float %x, float %y);
417+
%1 = tail call <4 x i32>@__clc_struct32_to_vector({i32,i32,i32,i32} %0)
418+
ret <4 x i32> %1
419+
}
420+
421+
declare {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.g.2d.v4u32.f32(i64, float, float)
422+
define <4 x i32> @__clc_llvm_nvvm_tld4_g_2d_v4u32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
423+
entry:
424+
%0 = tail call {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.g.2d.v4u32.f32(i64 %img, float %x, float %y);
425+
%1 = tail call <4 x i32>@__clc_struct32_to_vector({i32,i32,i32,i32} %0)
426+
ret <4 x i32> %1
427+
}
428+
429+
declare {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.b.2d.v4u32.f32(i64, float, float)
430+
define <4 x i32> @__clc_llvm_nvvm_tld4_b_2d_v4u32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
431+
entry:
432+
%0 = tail call {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.b.2d.v4u32.f32(i64 %img, float %x, float %y);
433+
%1 = tail call <4 x i32>@__clc_struct32_to_vector({i32,i32,i32,i32} %0)
434+
ret <4 x i32> %1
435+
}
436+
437+
declare {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.a.2d.v4u32.f32(i64, float, float)
438+
define <4 x i32> @__clc_llvm_nvvm_tld4_a_2d_v4u32_f32(i64 %img, float %x, float %y) nounwind alwaysinline {
439+
entry:
440+
%0 = tail call {i32,i32,i32,i32} @llvm.nvvm.tld4.unified.a.2d.v4u32.f32(i64 %img, float %x, float %y);
441+
%1 = tail call <4 x i32>@__clc_struct32_to_vector({i32,i32,i32,i32} %0)
442+
ret <4 x i32> %1
443+
}
444+
348445
; <--- TEXTURE FETCHING (integer coordinates) --->
349446
declare {i32,i32,i32,i32} @llvm.nvvm.tex.unified.1d.v4s32.s32(i64, i32)
350447
define <4 x i32> @__clc_llvm_nvvm_tex_1d_v4i32_s32(i64 %img, i32 %x) nounwind alwaysinline {

llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ def AspectExt_intel_fpga_task_sequence : Aspect<"ext_intel_fpga_task_sequence">;
8787
def AspectExt_oneapi_atomic16 : Aspect<"ext_oneapi_atomic16">;
8888
def AspectExt_oneapi_virtual_functions : Aspect<"ext_oneapi_virtual_functions">;
8989
def AspectExt_intel_spill_memory_size : Aspect<"ext_intel_spill_memory_size">;
90+
def AspectExt_oneapi_bindless_images_gather : Aspect<"ext_oneapi_bindless_images_gather">;
9091
// Deprecated aspects
9192
def AspectInt64_base_atomics : Aspect<"int64_base_atomics">;
9293
def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">;
@@ -144,6 +145,7 @@ def : TargetInfo<"__TestAspectList",
144145
AspectExt_oneapi_bindless_sampled_image_fetch_1d_usm, AspectExt_oneapi_bindless_sampled_image_fetch_1d,
145146
AspectExt_oneapi_bindless_sampled_image_fetch_2d_usm, AspectExt_oneapi_bindless_sampled_image_fetch_2d,
146147
AspectExt_oneapi_bindless_sampled_image_fetch_3d,
148+
AspectExt_oneapi_bindless_images_gather,
147149
AspectExt_intel_esimd,
148150
AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group,
149151
AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component,

sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc

Lines changed: 68 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ permission by Khronos.
3030

3131
== Dependencies
3232

33-
This extension is written against the SYCL 2020 revision 6 specification. All
33+
This extension is written against the SYCL 2020 revision 9 specification. All
3434
references below to the "core SYCL specification" or to section numbers in the
3535
SYCL specification refer to that revision.
3636

@@ -181,6 +181,7 @@ enum class image_type : /* unspecified */ {
181181
mipmap,
182182
array,
183183
cubemap,
184+
gather,
184185
};
185186

186187
struct image_descriptor {
@@ -1329,10 +1330,15 @@ DataT fetch_image(const unsampled_image_handle &ImageHandle,
13291330
template <typename DataT, typename HintT = DataT, typename CoordT>
13301331
DataT fetch_image(const sampled_image_handle &ImageHandle,
13311332
const CoordT &Coords);
1333+
13321334
template <typename DataT, typename HintT = DataT, typename CoordT>
1333-
DataT sample_image(const sampled_image_handle &ImageHandle,
1335+
DataT sample_image(const sampled_image_handle &ImageHandle,
13341336
const CoordT &Coords);
13351337

1338+
template <typename DataT>
1339+
DataT gather_image(const sampled_image_handle &ImageHandle,
1340+
const sycl::float2 &Coords, const unsigned Component);
1341+
13361342
template <typename DataT, typename CoordT>
13371343
void write_image(unsampled_image_handle ImageHandle,
13381344
const CoordT &Coords, const DataT &Color);
@@ -1342,9 +1348,10 @@ void write_image(unsampled_image_handle ImageHandle,
13421348
Inside a kernel, it's possible to retrieve data from an image via `fetch_image`
13431349
or `sample_image`, passing the appropriate image handle. The `fetch_image` API
13441350
is applicable to sampled and unsampled images, and the data will be fetched
1345-
exactly as is in device memory. The `sample_image` API is only applicable to
1346-
sampled images, the image data will be sampled according to the
1347-
`bindless_image_sampler` that was passed to the image upon construction.
1351+
exactly as is in device memory. The `sample_image` and `gather_image` APIs are
1352+
only applicable to sampled images. For both APIs the image data will be sampled
1353+
according to the `bindless_image_sampler` that was passed to the image upon
1354+
construction.
13481355

13491356
When fetching from a sampled image handle, data exactly as is in memory, no
13501357
sampling operations will be performed, and the `bindless_image_sampler` passed
@@ -1406,6 +1413,58 @@ Attempting to sample a standard sampled image with `sample_mipmap` or any other
14061413
defined sampling function will result in undefined behaviour.
14071414
====
14081415

1416+
==== `gather_image`
1417+
1418+
We provide the following device aspect to retrieve support information for
1419+
`gather_image`.
1420+
1421+
The device aspect descriptor for this query is:
1422+
1423+
[frame="none",options="header"]
1424+
|======================
1425+
|Device descriptor |Description
1426+
|`aspect::ext_oneapi_bindless_images_gather` | Indicates if the device supports
1427+
`gather_image`.
1428+
|======================
1429+
1430+
`DataT` is a vector type with four components. DataT can only be `sycl::float4`
1431+
, `sycl::int4`, or `sycl::uint4`. `ImageHandle` must be a two dimensional image
1432+
instantiated using an `image_descriptor` with non-zero `width` and `height`
1433+
members. The `image_descriptor::depth` member must be zero. If these conditions
1434+
are not satisfied then a `sycl::exception` is thrown with error code
1435+
`sycl::errc::invalid`. The return values, `DataT`, consist of the four texture,
1436+
`T`, coordinates that would be used for linear filtering of `ImageHandle` at
1437+
the coordinate specified by `Coords`, and for channel component `Component`.
1438+
1439+
These coordinates are returned as follows:
1440+
1441+
- `DataT.x = T[i, j + 1]`
1442+
- `DataT.y = T[i + 1, j + 1]`
1443+
- `DataT.z = T[i + 1, j]`
1444+
- `DataT.w = T[i, j]`
1445+
1446+
where `i` and `j` are calculated depending on the
1447+
`sycl::coordinate_normalization_mode` used to constuct `ImageHandle` via a
1448+
`bindless_image_sampler`. For
1449+
`sycl::coordinate_normalization_mode::unnormalized` (in units of the image
1450+
discretization) they are calculated as:
1451+
1452+
- `i = floor(Coords.x - 0.5)`
1453+
- `j = floor(Coords.y - 0.5)`
1454+
1455+
[NOTE]
1456+
=====
1457+
In the CUDA backend there are different restrictions on the sizes of an image
1458+
that can be created for a given device when `gather_image` is being used.
1459+
=====
1460+
1461+
For guaranteed correctness and portability across backends, it is necessary to
1462+
construct `ImageHandle` using an `image_descriptor` with member
1463+
`image_type::gather`. Calling `gather_image` on an `ImageHandle` constructed
1464+
with `image_descriptor::num_channels < 4` will result in undefined behavior.
1465+
1466+
====
1467+
14091468
=== Recognized standard types [[recognized_standard_types]]
14101469
14111470
For the purposes of this extension, the following are classified as recognized
@@ -1418,6 +1477,8 @@ standard types.
14181477
14191478
Any other types are classified as user-defined types.
14201479
1480+
===
1481+
14211482
==== User-defined types
14221483
14231484
Some examples of a user-defined types may be:
@@ -2576,4 +2637,5 @@ These features still need to be handled:
25762637
- Add support for USM to USM copies and sub-copies.
25772638
- Add support for host to host copies and sub-copies.
25782639
|6.8|2025-03-13| - Add support for importing timeline semaphores.
2579-
|======================
2640+
|6.9|2025-03-18| - Add new `gather` image type and the accompanying
2641+
`gather_image` function.

sycl/include/sycl/__spirv/spirv_ops.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,10 @@ template <class RetT, typename ImageT, typename TempArgT>
196196
extern __DPCPP_SYCL_EXTERNAL RetT __spirv_SampledImageArrayFetch(ImageT,
197197
TempArgT, int);
198198

199+
template <class RetT, typename ImageT, typename TempArgT>
200+
extern __DPCPP_SYCL_EXTERNAL RetT __spirv_SampledImageGather(ImageT, TempArgT,
201+
unsigned);
202+
199203
template <class RetT, typename ImageT, typename TempArgT>
200204
extern __DPCPP_SYCL_EXTERNAL RetT __spirv_ImageArrayRead(ImageT, TempArgT, int);
201205

sycl/include/sycl/detail/image_ocl_types.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,22 @@ static RetType __invoke__SampledImageFetch(ImageT Img, CoordT Coords) {
103103
Img, TmpCoords));
104104
}
105105

106+
template <typename RetType, typename ImageT, typename CoordT>
107+
static std::enable_if_t<std::is_same_v<RetType, sycl::vec<float, 4>> ||
108+
std::is_same_v<RetType, sycl::vec<int, 4>> ||
109+
std::is_same_v<RetType, sycl::vec<unsigned int, 4>>,
110+
RetType>
111+
__invoke__SampledImageGather(ImageT Img, CoordT Coords, unsigned Component) {
112+
113+
// Convert from sycl types to builtin types to get correct function mangling.
114+
using TempRetT = sycl::detail::ConvertToOpenCLType_t<RetType>;
115+
auto TmpCoords = sycl::detail::convertToOpenCLType(Coords);
116+
117+
return sycl::detail::convertFromOpenCLTypeFor<RetType>(
118+
__spirv_SampledImageGather<TempRetT, ImageT, decltype(TmpCoords)>(
119+
Img, TmpCoords, Component));
120+
}
121+
106122
template <typename RetType, typename ImageT, typename CoordT>
107123
static RetType __invoke__ImageArrayFetch(ImageT Img, CoordT Coords,
108124
int ArrayLayer) {

sycl/include/sycl/ext/oneapi/bindless_images.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#pragma once
1010

11+
#include <sycl/aspects.hpp>
1112
#include <sycl/context.hpp> // for context
1213
#include <sycl/detail/export.hpp> // for __SYCL_EXPORT
1314
#include <sycl/device.hpp> // for device
@@ -910,6 +911,31 @@ DataT fetch_image(const sampled_image_handle &imageHandle [[maybe_unused]],
910911
#endif
911912
}
912913

914+
template <typename DataT>
915+
#ifdef __SYCL_DEVICE_ONLY__
916+
[[__sycl_detail__::__uses_aspects__(
917+
sycl::aspect::ext_oneapi_bindless_images_gather)]]
918+
#endif
919+
std::enable_if_t<std::is_same_v<DataT, float4> || std::is_same_v<DataT, int4> ||
920+
std::is_same_v<DataT, uint4>,
921+
DataT> gather_image(const sampled_image_handle &imageHandle
922+
[[maybe_unused]],
923+
const float2 &coords [[maybe_unused]],
924+
const unsigned i [[maybe_unused]]) {
925+
#if defined(__SYCL_DEVICE_ONLY__)
926+
#if defined(__NVPTX__)
927+
return __invoke__SampledImageGather<DataT>(
928+
CONVERT_HANDLE_TO_SAMPLED_IMAGE(imageHandle.raw_handle, float2::size()),
929+
coords, i);
930+
#else
931+
return {0, 0, 0, 0};
932+
#endif
933+
#else
934+
throw exception{make_error_code(errc::feature_not_supported),
935+
"gather_image is not supported on the host"};
936+
#endif
937+
}
938+
913939
/**
914940
* @brief Sample data from a sampled image using its handle
915941
*

0 commit comments

Comments
 (0)