[SYCL][Bindless][E2E] Workaround backend/type divergent integer rounding (#17017)

DBDuncan · web-flow · commit 63d688c5c7bf · 2025-02-21T15:30:42.000Z
Enable read_sampled.cpp and read_sampled_array.cpp to handle the rounding of
linearly sampled integer results, which differs depending on the backend and
the integer type of the image data.
diff --git a/sycl/test-e2e/bindless_images/array/read_sampled_array.cpp b/sycl/test-e2e/bindless_images/array/read_sampled_array.cpp
@@ -1,4 +1,7 @@
-// REQUIRES: aspect-ext_oneapi_image_array
+// REQUIRES: aspect-ext_oneapi_bindless_images
+
+// UNSUPPORTED: hip
+// UNSUPPORTED-INTENDED: Image arrays are unimplemented in the HIP adapter.
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
@@ -30,7 +33,8 @@ static void
 runNDimTestHost(sycl::range<NDims> globalSize, float offset,
                 syclexp::bindless_image_sampler &samp,
                 std::vector<sycl::vec<DType, NChannels>> &inputImage,
-                std::vector<sycl::vec<DType, NChannels>> &output) {
+                std::vector<sycl::vec<DType, NChannels>> &output,
+                sycl::backend backend) {
 
   using VecType = sycl::vec<DType, NChannels>;
   bool isNorm =
@@ -67,7 +71,7 @@ runNDimTestHost(sycl::range<NDims> globalSize, float offset,
             inputImage.begin() + arr_idx * globalSizeTwoComp.size(),
             inputImage.begin() + (arr_idx + 1) * globalSizeTwoComp.size());
         VecType result = sampling_helpers::read<NDims - 1, DType, NChannels>(
-            globalSizeTwoComp, coords, offset, samp, layer);
+            globalSizeTwoComp, coords, offset, samp, layer, backend);
 
         output[arr_idx * globalSizeTwoComp.size() + i + (globalSize[0] * j)] =
             result;
@@ -151,6 +155,8 @@ static bool runTest(sycl::range<NDims> dims, sycl::range<NDims> localSize,
   sycl::queue q(dev);
   auto ctxt = q.get_context();
 
+  sycl::backend backend = dev.get_backend();
+
   size_t numElems = dims.size();
   auto image_array_dims = bindless_helpers::ImageArrayDims<NDims>(dims);
 
@@ -164,7 +170,7 @@ static bool runTest(sycl::range<NDims> dims, sycl::range<NDims> localSize,
   {
     sycl::range<NDims> globalSize = dims;
     runNDimTestHost<NDims, DType, NChannels>(globalSize, offset, samp, input,
-                                             expected);
+                                             expected, backend);
   }
 
   try {
@@ -319,9 +325,42 @@ bool runTests(sycl::range<2> dims, sycl::range<2> localSize, float offset,
       syclexp::bindless_image_sampler samp(addrMode, normMode, filtMode);
 
 #if defined(VERBOSE_LV2) || defined(VERBOSE_LV3)
-      util::printTestInfo(samp, offset);
+      sampling_helpers::printTestInfo(samp, offset);
 #endif
 
+      bindless_helpers::printTestName<NDims>("Running 1D int", dims, localSize);
+      failed |=
+          util::runTest<NDims, int, 1, sycl::image_channel_type::signed_int32,
+                        class int_1d>(dims, localSize, offset, samp, seed);
+      bindless_helpers::printTestName<NDims>("Running 1D int2", dims,
+                                             localSize);
+      failed |=
+          util::runTest<NDims, int, 2, sycl::image_channel_type::signed_int32,
+                        class int2_1d>(dims, localSize, offset, samp, seed);
+      bindless_helpers::printTestName<NDims>("Running 1D int4", dims,
+                                             localSize);
+      failed |=
+          util::runTest<NDims, int, 4, sycl::image_channel_type::signed_int32,
+                        class int4_1d>(dims, localSize, offset, samp, seed);
+
+      bindless_helpers::printTestName<NDims>("Running 1D uint", dims,
+                                             localSize);
+      failed |=
+          util::runTest<NDims, unsigned int, 1,
+                        sycl::image_channel_type::unsigned_int32,
+                        class uint_1d>(dims, localSize, offset, samp, seed);
+      bindless_helpers::printTestName<NDims>("Running 1D uint2", dims,
+                                             localSize);
+      failed |=
+          util::runTest<NDims, unsigned int, 2,
+                        sycl::image_channel_type::unsigned_int32,
+                        class uint2_1d>(dims, localSize, offset, samp, seed);
+      bindless_helpers::printTestName<NDims>("Running 1D uint4", dims,
+                                             localSize);
+      failed |= util::runTest<
+          NDims, unsigned int, 4, sycl::image_channel_type::unsigned_int32,
+          class uint4_1d>(dims, localSize, offset, samp, seed);
+
       bindless_helpers::printTestName<NDims>("Running 1D short", dims,
                                              localSize);
       failed |=
@@ -468,9 +507,42 @@ bool runTests(sycl::range<3> dims, sycl::range<3> localSize, float offset,
       syclexp::bindless_image_sampler samp(addrMode, normMode, filtMode);
 
 #if defined(VERBOSE_LV2) || defined(VERBOSE_LV3)
-      util::printTestInfo(samp, offset);
+      sampling_helpers::printTestInfo(samp, offset);
 #endif
 
+      bindless_helpers::printTestName<NDims>("Running 2D int", dims, localSize);
+      failed |=
+          util::runTest<NDims, int, 1, sycl::image_channel_type::signed_int32,
+                        class int_2d>(dims, localSize, offset, samp, seed);
+      bindless_helpers::printTestName<NDims>("Running 2D int2", dims,
+                                             localSize);
+      failed |=
+          util::runTest<NDims, int, 2, sycl::image_channel_type::signed_int32,
+                        class int2_2d>(dims, localSize, offset, samp, seed);
+      bindless_helpers::printTestName<NDims>("Running 2D int4", dims,
+                                             localSize);
+      failed |=
+          util::runTest<NDims, int, 4, sycl::image_channel_type::signed_int32,
+                        class int4_2d>(dims, localSize, offset, samp, seed);
+
+      bindless_helpers::printTestName<NDims>("Running 2D uint", dims,
+                                             localSize);
+      failed |=
+          util::runTest<NDims, unsigned int, 1,
+                        sycl::image_channel_type::unsigned_int32,
+                        class uint_2d>(dims, localSize, offset, samp, seed);
+      bindless_helpers::printTestName<NDims>("Running 2D uint2", dims,
+                                             localSize);
+      failed |=
+          util::runTest<NDims, unsigned int, 2,
+                        sycl::image_channel_type::unsigned_int32,
+                        class uint2_2d>(dims, localSize, offset, samp, seed);
+      bindless_helpers::printTestName<NDims>("Running 2D uint4", dims,
+                                             localSize);
+      failed |= util::runTest<
+          NDims, unsigned int, 4, sycl::image_channel_type::unsigned_int32,
+          class uint4_2d>(dims, localSize, offset, samp, seed);
+
       bindless_helpers::printTestName<NDims>("Running 2D short", dims,
                                              localSize);
       failed |=
diff --git a/sycl/test-e2e/bindless_images/helpers/sampling.hpp b/sycl/test-e2e/bindless_images/helpers/sampling.hpp
@@ -55,7 +55,7 @@ template <int NChannels, typename DType>
 static sycl::vec<DType, NChannels>
 linearOp(sycl::vec<DType, NChannels> pix1, sycl::vec<DType, NChannels> pix2,
          sycl::vec<DType, NChannels> pix3, sycl::vec<DType, NChannels> pix4,
-         float weight1, float weight2) {
+         float weight1, float weight2, sycl::backend backend) {
 
   sycl::vec<float, NChannels> weightArr1(weight1);
   sycl::vec<float, NChannels> weightArr2(weight2);
@@ -73,14 +73,41 @@ linearOp(sycl::vec<DType, NChannels> pix1, sycl::vec<DType, NChannels> pix2,
              (one - weightArr1) * weightArr2 * Ti0j1 +
              weightArr1 * weightArr2 * Ti1j1));
 
-  // Round to nearest whole number.
-  // There is no option to do this via sycl::rounding_mode.
-  if constexpr (std::is_same_v<DType, short> ||
-                std::is_same_v<DType, unsigned short> ||
-                std::is_same_v<DType, signed char> ||
-                std::is_same_v<DType, unsigned char>) {
-    for (int i = 0; i < NChannels; i++) {
-      result[i] = std::round(result[i]);
+  if (backend == sycl::backend::ext_oneapi_cuda) {
+    // On Nvidia devices, if the image being accessed contains smaller than
+    // 32-bit integer data, then the fractional result of linear interpolation
+    // is rounded to the nearest number.
+    if constexpr (std::is_same_v<DType, short> ||
+                  std::is_same_v<DType, unsigned short> ||
+                  std::is_same_v<DType, signed char> ||
+                  std::is_same_v<DType, unsigned char>) {
+      for (int i = 0; i < NChannels; i++) {
+        result[i] = std::round(result[i]);
+      }
+    }
+
+    // On Nvidia devices, if the image being accessed contains 32-bit integer
+    // data, then the fractional result of linear interpolation is rounded down.
+    if constexpr (std::is_same_v<DType, int> ||
+                  std::is_same_v<DType, unsigned int>) {
+      for (int i = 0; i < NChannels; i++) {
+        result[i] = std::floor(result[i]);
+      }
+    }
+  }
+
+  if (backend == sycl::backend::ext_oneapi_level_zero) {
+    // On Intel devices, if the image being accessed contains integer data, then
+    // the fractional result of linear interpolation is rounded down.
+    if constexpr (std::is_same_v<DType, short> ||
+                  std::is_same_v<DType, unsigned short> ||
+                  std::is_same_v<DType, signed char> ||
+                  std::is_same_v<DType, unsigned char> ||
+                  std::is_same_v<DType, int> ||
+                  std::is_same_v<DType, unsigned int>) {
+      for (int i = 0; i < NChannels; i++) {
+        result[i] = std::floor(result[i]);
+      }
     }
   }
 
@@ -360,7 +387,8 @@ struct InterpolRes {
 template <typename DType, int NChannels>
 static sycl::vec<DType, NChannels>
 clampLinear(sycl::vec<float, 2> coords, sycl::range<2> globalSize,
-            const std::vector<sycl::vec<DType, NChannels>> &inputImage) {
+            const std::vector<sycl::vec<DType, NChannels>> &inputImage,
+            sycl::backend backend) {
   using VecType = sycl::vec<DType, NChannels>;
 
   float coordX = coords[0];
@@ -391,14 +419,16 @@ clampLinear(sycl::vec<float, 2> coords, sycl::range<2> globalSize,
       clampLinearCheckBounds<VecType>(i1, j1, width, height, inputImage);
 
   // Perform linear sampling
-  return linearOp<NChannels, DType>(pix1, pix2, pix3, pix4, weightX, weightY);
+  return linearOp<NChannels, DType>(pix1, pix2, pix3, pix4, weightX, weightY,
+                                    backend);
 }
 
 // Out of range coords are clamped to the extent.
 template <typename DType, int NChannels>
 static sycl::vec<DType, NChannels>
 clampToEdgeLinear(sycl::vec<float, 2> coords, sycl::range<2> globalSize,
-                  const std::vector<sycl::vec<DType, NChannels>> &inputImage) {
+                  const std::vector<sycl::vec<DType, NChannels>> &inputImage,
+                  sycl::backend backend) {
   using VecType = sycl::vec<DType, NChannels>;
 
   float coordX = coords[0];
@@ -428,7 +458,8 @@ clampToEdgeLinear(sycl::vec<float, 2> coords, sycl::range<2> globalSize,
   VecType pix4 = inputImage[i1 + (width * j1)];
 
   // Perform linear sampling
-  return linearOp<NChannels, DType>(pix1, pix2, pix3, pix4, weightX, weightY);
+  return linearOp<NChannels, DType>(pix1, pix2, pix3, pix4, weightX, weightY,
+                                    backend);
 }
 
 // Out of range coords return a border color
@@ -451,7 +482,8 @@ static InterpolRes repeatLinearCoord(float coord, int dimSize) {
 template <typename DType, int NChannels>
 static sycl::vec<DType, NChannels>
 repeatLinear(sycl::vec<float, 2> coords, sycl::range<2> globalSize,
-             const std::vector<sycl::vec<DType, NChannels>> &inputImage) {
+             const std::vector<sycl::vec<DType, NChannels>> &inputImage,
+             sycl::backend backend) {
   using VecType = sycl::vec<DType, NChannels>;
 
   float coordX = coords[0];
@@ -482,7 +514,8 @@ repeatLinear(sycl::vec<float, 2> coords, sycl::range<2> globalSize,
   VecType pix4 = inputImage[i1 + (width * j1)];
 
   // Perform linear sampling
-  return linearOp<NChannels, DType>(pix1, pix2, pix3, pix4, weightX, weightY);
+  return linearOp<NChannels, DType>(pix1, pix2, pix3, pix4, weightX, weightY,
+                                    backend);
 }
 
 // Out of range coordinates are flipped at every integer junction
@@ -517,9 +550,10 @@ static InterpolRes mirroredRepeatLinearCoord(float coord, int dimSize) {
 
 // Out of range coordinates are flipped at every integer junction
 template <typename DType, int NChannels>
-static sycl::vec<DType, NChannels> mirroredRepeatLinear(
-    sycl::vec<float, 2> coords, sycl::range<2> globalSize,
-    const std::vector<sycl::vec<DType, NChannels>> &inputImage) {
+static sycl::vec<DType, NChannels>
+mirroredRepeatLinear(sycl::vec<float, 2> coords, sycl::range<2> globalSize,
+                     const std::vector<sycl::vec<DType, NChannels>> &inputImage,
+                     sycl::backend backend) {
   using VecType = sycl::vec<DType, NChannels>;
 
   float coordX = coords[0];
@@ -551,7 +585,8 @@ static sycl::vec<DType, NChannels> mirroredRepeatLinear(
   VecType pix4 = inputImage[i1 + (width * j1)];
 
   // Perform linear sampling
-  return linearOp<NChannels, DType>(pix1, pix2, pix3, pix4, weightX, weightY);
+  return linearOp<NChannels, DType>(pix1, pix2, pix3, pix4, weightX, weightY,
+                                    backend);
 }
 
 // Some vector sizes here are hardcoded because the sampling functions are
@@ -560,7 +595,8 @@ template <int NDims, typename DType, int NChannels>
 static sycl::vec<DType, NChannels>
 read(sycl::range<2> globalSize, sycl::vec<float, 2> coords, float offset,
      const sycl::ext::oneapi::experimental::bindless_image_sampler &samp,
-     const std::vector<sycl::vec<DType, NChannels>> &inputImage) {
+     const std::vector<sycl::vec<DType, NChannels>> &inputImage,
+     sycl::backend backend) {
   using VecType = sycl::vec<DType, NChannels>;
 
   // Add offset to coords
@@ -624,26 +660,28 @@ read(sycl::range<2> globalSize, sycl::vec<float, 2> coords, float offset,
   } else { // linear
     sycl::addressing_mode SampAddrMode = samp.addressing[0];
     if (SampAddrMode == sycl::addressing_mode::ext_oneapi_clamp_to_border) {
-      return clampLinear<DType, NChannels>(coords, globalSize, inputImage);
+      return clampLinear<DType, NChannels>(coords, globalSize, inputImage,
+                                           backend);
     }
     if (SampAddrMode == sycl::addressing_mode::clamp_to_edge) {
-      return clampToEdgeLinear<DType, NChannels>(coords, globalSize,
-                                                 inputImage);
+      return clampToEdgeLinear<DType, NChannels>(coords, globalSize, inputImage,
+                                                 backend);
     }
     if (SampAddrMode == sycl::addressing_mode::repeat) {
       if (SampNormMode == sycl::coordinate_normalization_mode::unnormalized) {
         assert(false &&
                "Repeat addressing mode must be used with normalized coords");
       }
-      return repeatLinear<DType, NChannels>(coords, globalSize, inputImage);
+      return repeatLinear<DType, NChannels>(coords, globalSize, inputImage,
+                                            backend);
     }
     if (SampAddrMode == sycl::addressing_mode::mirrored_repeat) {
       if (SampNormMode == sycl::coordinate_normalization_mode::unnormalized) {
         assert(false && "Mirrored repeat addressing mode must be used with "
                         "normalized coords");
       }
       return mirroredRepeatLinear<DType, NChannels>(coords, globalSize,
-                                                    inputImage);
+                                                    inputImage, backend);
     }
     if (SampAddrMode == sycl::addressing_mode::none) {
       // Ensure no access out of bounds when addressing_mode is none
diff --git a/sycl/test-e2e/bindless_images/read_sampled.cpp b/sycl/test-e2e/bindless_images/read_sampled.cpp