
Commit 0399b93

Factor out in-line use of submit to init padded vector into separate function

Doing so reduces the binary size of the elementwise operations extension: the inline submit compiled a separate padded-vector kernel at every call site, whereas the shared helper kernel is instantiated once per element type.

Before:

```
(dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ls -l dpctl/tensor/_tensor_elementwise_impl.cpython-312-x86_64-linux-gnu.so
-rw-r--r-- 1 opavlyk opavlyk 38659896 Jan 19 20:58 dpctl/tensor/_tensor_elementwise_impl.cpython-312-x86_64-linux-gnu.so
```

After:

```
(dev_dpctl) opavlyk@mtl-world:~/repos/dpctl$ ls -l dpctl/tensor/_tensor_elementwise_impl.cpython-312-x86_64-linux-gnu.so
-rw-r--r-- 1 opavlyk opavlyk 37176600 Jan 21 06:36 dpctl/tensor/_tensor_elementwise_impl.cpython-312-x86_64-linux-gnu.so
```

Also added static assertions to offset_utils to ensure that indexers are device copyable.
1 parent d9e9bf8 commit 0399b93
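The offset_utils changes are not shown on this page. For reference, a minimal sketch of what such a device-copyability assertion can look like in SYCL 2020, with `MyIndexer` as a hypothetical stand-in for the indexer types in `utils/offset_utils.hpp`:

```cpp
#include <cstddef>

#include <sycl/sycl.hpp>

// Hypothetical stand-in for the indexer types defined in
// utils/offset_utils.hpp (the actual diff is not shown on this page).
struct MyIndexer
{
    std::size_t offset;
    std::size_t operator()(std::size_t i) const { return offset + i; }
};

// SYCL 2020 trait: compilation fails if the type cannot be copied
// into device code, e.g. if it later gains a virtual function or a
// non-trivially-copyable member.
static_assert(sycl::is_device_copyable_v<MyIndexer>,
              "Indexer must be device copyable");
```

Trivially copyable types satisfy the trait automatically, so the assertion costs nothing at run time and catches regressions at compile time.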

4 files changed: +155 −50 lines changed

dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp

Lines changed: 36 additions & 33 deletions
```diff
@@ -26,11 +26,13 @@
 #include <cstddef>
 #include <cstdint>
 #include <stdexcept>
-#include <sycl/sycl.hpp>
 #include <utility>
 
+#include <sycl/sycl.hpp>
+
 #include "kernels/alignment.hpp"
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common_detail.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/sycl_alloc_utils.hpp"
 #include "utils/sycl_utils.hpp"
@@ -324,21 +326,23 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
         {
             constexpr bool enable_sg_loadstore = true;
             using KernelName = BaseKernelName;
+            using Impl = ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                                        enable_sg_loadstore>;
 
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
-                ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
-                               enable_sg_loadstore>(arg_tp, res_tp, nelems));
+                Impl(arg_tp, res_tp, nelems));
         }
         else {
             constexpr bool disable_sg_loadstore = false;
             using KernelName =
                 disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
+            using Impl = ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                                        disable_sg_loadstore>;
 
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
-                ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
-                               disable_sg_loadstore>(arg_tp, res_tp, nelems));
+                Impl(arg_tp, res_tp, nelems));
         }
     });
 
@@ -377,9 +381,10 @@ unary_strided_impl(sycl::queue &exec_q,
         const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
         resTy *res_tp = reinterpret_cast<resTy *>(res_p);
 
+        using Impl = StridedFunctorT<argTy, resTy, IndexerT>;
+
         cgh.parallel_for<kernel_name<argTy, resTy, IndexerT>>(
-            {nelems},
-            StridedFunctorT<argTy, resTy, IndexerT>(arg_tp, res_tp, indexer));
+            {nelems}, Impl(arg_tp, res_tp, indexer));
     });
     return comp_ev;
 }
@@ -814,22 +819,23 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
         {
             constexpr bool enable_sg_loadstore = true;
             using KernelName = BaseKernelName;
+            using Impl = BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz,
+                                              n_vecs, enable_sg_loadstore>;
 
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
-                BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs,
-                                     enable_sg_loadstore>(arg1_tp, arg2_tp,
-                                                          res_tp, nelems));
+                Impl(arg1_tp, arg2_tp, res_tp, nelems));
         }
         else {
             constexpr bool disable_sg_loadstore = false;
             using KernelName =
                 disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
+            using Impl = BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz,
+                                              n_vecs, disable_sg_loadstore>;
+
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
-                BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs,
-                                     disable_sg_loadstore>(arg1_tp, arg2_tp,
-                                                           res_tp, nelems));
+                Impl(arg1_tp, arg2_tp, res_tp, nelems));
         }
     });
     return comp_ev;
@@ -873,9 +879,10 @@ binary_strided_impl(sycl::queue &exec_q,
         const argTy2 *arg2_tp = reinterpret_cast<const argTy2 *>(arg2_p);
         resTy *res_tp = reinterpret_cast<resTy *>(res_p);
 
+        using Impl = BinaryStridedFunctorT<argTy1, argTy2, resTy, IndexerT>;
+
         cgh.parallel_for<kernel_name<argTy1, argTy2, resTy, IndexerT>>(
-            {nelems}, BinaryStridedFunctorT<argTy1, argTy2, resTy, IndexerT>(
-                          arg1_tp, arg2_tp, res_tp, indexer));
+            {nelems}, Impl(arg1_tp, arg2_tp, res_tp, indexer));
     });
     return comp_ev;
 }
@@ -917,13 +924,9 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
             exec_q);
     argT2 *padded_vec = padded_vec_owner.get();
 
-    sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends); // ensure vec contains actual data
-        cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
-            auto i = id[0];
-            padded_vec[i] = vec[i % n1];
-        });
-    });
+    sycl::event make_padded_vec_ev =
+        dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
+            argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends);
 
     // sub-group spans work-items [I, I + sgSize)
     // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
@@ -942,10 +945,12 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
         std::size_t n_groups = (n_elems + lws - 1) / lws;
         auto gwsRange = sycl::range<1>(n_groups * lws);
 
+        using Impl =
+            BinaryContigMatrixContigRowBroadcastFunctorT<argT1, argT2, resT>;
+
         cgh.parallel_for<class kernel_name<argT1, argT2, resT>>(
             sycl::nd_range<1>(gwsRange, lwsRange),
-            BinaryContigMatrixContigRowBroadcastFunctorT<argT1, argT2, resT>(
-                mat, padded_vec, res, n_elems, n1));
+            Impl(mat, padded_vec, res, n_elems, n1));
     });
 
     sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
@@ -993,13 +998,9 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl(
             exec_q);
     argT2 *padded_vec = padded_vec_owner.get();
 
-    sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends); // ensure vec contains actual data
-        cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
-            auto i = id[0];
-            padded_vec[i] = vec[i % n1];
-        });
-    });
+    sycl::event make_padded_vec_ev =
+        dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
+            argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends);
 
     // sub-group spans work-items [I, I + sgSize)
     // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
@@ -1018,10 +1019,12 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl(
         std::size_t n_groups = (n_elems + lws - 1) / lws;
         auto gwsRange = sycl::range<1>(n_groups * lws);
 
+        using Impl =
+            BinaryContigRowContigMatrixBroadcastFunctorT<argT1, argT2, resT>;
+
         cgh.parallel_for<class kernel_name<argT1, argT2, resT>>(
             sycl::nd_range<1>(gwsRange, lwsRange),
-            BinaryContigRowContigMatrixBroadcastFunctorT<argT1, argT2, resT>(
-                padded_vec, mat, res, n_elems, n1));
+            Impl(padded_vec, mat, res, n_elems, n1));
    });
 
     sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
```
dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp (new file)

Lines changed: 70 additions & 0 deletions
```diff
@@ -0,0 +1,70 @@
+//=== common_detail.hpp - - *-C++-*--/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines common code for elementwise tensor operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace elementwise_detail
+{
+
+template <typename T> class populate_padded_vec_krn;
+
+template <typename T>
+sycl::event
+populate_padded_vector(sycl::queue &exec_q,
+                       const T *vec,
+                       std::size_t vec_sz,
+                       T *padded_vec,
+                       size_t padded_vec_sz,
+                       const std::vector<sycl::event> &dependent_events)
+{
+    sycl::event populate_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
+        // ensure vec contains actual data
+        cgh.depends_on(dependent_events);
+
+        sycl::range<1> gRange{padded_vec_sz};
+
+        cgh.parallel_for<class populate_padded_vec_krn<T>>(
+            gRange, [=](sycl::id<1> id) {
+                std::size_t i = id[0];
+                padded_vec[i] = vec[i % vec_sz];
+            });
+    });
+
+    return populate_padded_vec_ev;
+}
+
+} // end of namespace elementwise_detail
+} // end of namespace kernels
+} // end of namespace tensor
+} // end of namespace dpctl
```
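For context, a minimal sketch of calling the new helper directly; the queue setup, USM allocations, and fill value below are illustrative and not part of the commit:

```cpp
#include <cstddef>

#include <sycl/sycl.hpp>

#include "kernels/elementwise_functions/common_detail.hpp"

int main()
{
    sycl::queue q;

    constexpr std::size_t n1 = 3;        // length of the source vector
    constexpr std::size_t n1_padded = 8; // padded length, e.g. rounded up
                                         // to a multiple of the sub-group size

    float *vec = sycl::malloc_device<float>(n1, q);
    float *padded_vec = sycl::malloc_device<float>(n1_padded, q);

    // Stand-in for whatever event produced vec's contents.
    sycl::event fill_ev = q.fill(vec, 1.0f, n1);

    // Cyclically repeats vec into padded_vec: padded_vec[i] = vec[i % n1].
    sycl::event ev =
        dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
            float>(q, vec, n1, padded_vec, n1_padded, {fill_ev});
    ev.wait();

    sycl::free(padded_vec, q);
    sycl::free(vec, q);
    return 0;
}
```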

dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp

Lines changed: 20 additions & 17 deletions
```diff
@@ -27,10 +27,12 @@
 #include <cstddef>
 #include <cstdint>
 #include <stdexcept>
+
 #include <sycl/sycl.hpp>
 
 #include "kernels/alignment.hpp"
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common_detail.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/sycl_alloc_utils.hpp"
 #include "utils/sycl_utils.hpp"
@@ -337,23 +339,26 @@ binary_inplace_contig_impl(sycl::queue &exec_q,
         {
             constexpr bool enable_sg_loadstore = true;
             using KernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
+            using Impl =
+                BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                                            enable_sg_loadstore>;
+
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
-                BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs,
-                                            enable_sg_loadstore>(arg_tp, res_tp,
-                                                                 nelems));
+                Impl(arg_tp, res_tp, nelems));
         }
         else {
             constexpr bool disable_sg_loadstore = true;
             using InnerKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
             using KernelName =
                 disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+            using Impl =
+                BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs,
+                                            disable_sg_loadstore>;
 
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(gws_range, lws_range),
-                BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs,
-                                            disable_sg_loadstore>(
-                    arg_tp, res_tp, nelems));
+                Impl(arg_tp, res_tp, nelems));
         }
     });
     return comp_ev;
@@ -389,9 +394,10 @@ binary_inplace_strided_impl(sycl::queue &exec_q,
         const argTy *arg_tp = reinterpret_cast<const argTy *>(rhs_p);
         resTy *res_tp = reinterpret_cast<resTy *>(lhs_p);
 
+        using Impl = BinaryInplaceStridedFunctorT<argTy, resTy, IndexerT>;
+
         cgh.parallel_for<kernel_name<argTy, resTy, IndexerT>>(
-            {nelems}, BinaryInplaceStridedFunctorT<argTy, resTy, IndexerT>(
-                          arg_tp, res_tp, indexer));
+            {nelems}, Impl(arg_tp, res_tp, indexer));
     });
     return comp_ev;
 }
@@ -428,13 +434,9 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
             exec_q);
     argT *padded_vec = padded_vec_owner.get();
 
-    sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends); // ensure vec contains actual data
-        cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
-            auto i = id[0];
-            padded_vec[i] = vec[i % n1];
-        });
-    });
+    sycl::event make_padded_vec_ev =
+        dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
+            argT>(exec_q, vec, n1, padded_vec, n1_padded, depends);
 
     // sub-group spans work-items [I, I + sgSize)
     // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
@@ -453,10 +455,11 @@ sycl::event binary_inplace_row_matrix_broadcast_impl(
         std::size_t n_groups = (n_elems + lws - 1) / lws;
         auto gwsRange = sycl::range<1>(n_groups * lws);
 
+        using Impl = BinaryInplaceRowMatrixBroadcastFunctorT<argT, resT>;
+
         cgh.parallel_for<class kernel_name<argT, resT>>(
             sycl::nd_range<1>(gwsRange, lwsRange),
-            BinaryInplaceRowMatrixBroadcastFunctorT<argT, resT>(padded_vec, mat,
-                                                                n_elems, n1));
+            Impl(padded_vec, mat, n_elems, n1));
     });
 
     sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
```
