Add cuda.parallel.experimental.iterators._strided with NdArrayIterator

oleksandr-pavlyk · oleksandr-pavlyk · commit 26f5778b2f8c · 2025-04-15T11:40:24.000-05:00
The NdArrayIterator is an input iterator that traverses the elements of a
strided nd-array in the same order as the corresponding flat array, but
without making copies.

This iterator enables two `test_segmented_reduce_api.py` examples:
"segmented-reduce-columnwise-maximum" and "segmented-reduce-multiaxis-sum".
diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_strided.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_strided.py
@@ -0,0 +1,220 @@
+import ctypes
+from functools import lru_cache
+from typing import Tuple
+
+import numba
+import numpy as np
+from numba import types
+from numba.core import cgutils
+from numba.core.extending import (
+    models,
+    register_model,
+)
+from numba.core.typing.templates import AttributeTemplate
+from numba.cuda.cudadecl import registry as cuda_registry
+from numba.cuda.cudaimpl import registry as cuda_lower_registry
+
+from . import _iterators
+
+
+@lru_cache
+def make_iterator_struct_class(ndim):
+    """Build (and cache per `ndim`) the ctypes struct holding iterator state."""
+    extents_ctype = ctypes.c_int64 * ndim  # array type of both shape and strides
+
+    class StridedArrayView(ctypes.Structure):
+        _fields_ = [
+            ("linear_id", ctypes.c_int64),
+            ("ptr", ctypes.c_void_p),
+            ("shape", extents_ctype),
+            ("strides", extents_ctype),
+            ("ndim", ctypes.c_int32),
+        ]
+
+    return StridedArrayView
+
+
+def iterator_struct_ctype(ptr: int, ndim: int, shape: Tuple[int], strides: Tuple[int]):
+    """Return a StridedArrayView for `ptr` with linear_id initialised to 0."""
+    extents_ctype = ctypes.c_int64 * ndim
+    c_shape, c_strides = extents_ctype(*shape), extents_ctype(*strides)
+    view_cls = make_iterator_struct_class(ndim)
+    return view_cls(0, ptr, c_shape, c_strides, ndim)
+
+
+@lru_cache
+def strided_view_iterator_numba_type(value_type: types.Type, ndim: int):
+    """Return the numba type of the record describing a strided view
+    into an nd-array of `value_type` elements with `ndim` dimensions.
+    The record holds a typed element pointer; NdArrayIterator wraps the
+    returned type in ``types.CPointer`` to refer to its state.
+
+    On the host the record is built with the ``StridedArrayView`` ctypes class.
+    """
+    # ------
+    # Typing
+    # ------
+    # View into strided device ndarray. NOTE(review): the name passed below is
+    # the same for every (value_type, ndim) — confirm the types cannot collide
+    class NdArrayViewType(types.Type):
+        def __init__(self):
+            super(NdArrayViewType, self).__init__(name="NdArrayView")
+
+    ndarray_view_type = NdArrayViewType()
+    ptr_type = types.CPointer(ndarray_view_type)
+
+    int64_numba_t = numba.from_dtype(np.int64)
+    shape_arr_numba_t = types.UniTuple(int64_numba_t, ndim)
+    strides_arr_numba_t = types.UniTuple(int64_numba_t, ndim)
+    ndarray_view_members = [
+        ("linear_id", int64_numba_t),
+        ("ptr", types.CPointer(value_type)),
+        ("shape", shape_arr_numba_t),
+        ("strides", strides_arr_numba_t),
+        ("ndim", numba.from_dtype(np.int32)),
+    ]
+
+    # Typing of attribute access: each struct member resolves to its own type
+    class NdArrayViewAttrsTemplate(AttributeTemplate):
+        pass
+
+    def make_attr_resolver(ty):
+        """
+        Return a resolver bound to **ty**, so each loop iteration keeps its own type
+        """
+
+        def resolve_fn(self, pp):
+            return ty
+
+        return resolve_fn
+
+    for name, typ in ndarray_view_members:
+        setattr(NdArrayViewAttrsTemplate, f"resolve_{name}", make_attr_resolver(typ))
+
+    @cuda_registry.register_attr
+    class NdArrayViewAttrs(NdArrayViewAttrsTemplate):
+        key = ndarray_view_type
+
+    @cuda_registry.register_attr
+    class PtrAttrs(AttributeTemplate):
+        key = ptr_type
+
+        def resolve_linear_id(self, pp):
+            return int64_numba_t
+
+    # -----------
+    # Data model: the record is a struct with the members listed above
+    # -----------
+
+    @register_model(NdArrayViewType)
+    class NdArrayViewModel(models.StructModel):
+        def __init__(self, dmm, fe_type):
+            super().__init__(dmm, fe_type, ndarray_view_members)
+
+    # --------
+    # Lowering: member reads on the record; linear_id get/set through a pointer
+    # --------
+
+    @cuda_lower_registry.lower_getattr_generic(ndarray_view_type)
+    def ndarray_view_getattr_lowering_fn(context, builder, sig, arg, attr):
+        struct_values = cgutils.create_struct_proxy(ndarray_view_type)(
+            context, builder, value=arg
+        )
+        attr_ptr = struct_values._get_ptr_by_name(attr)
+        attr_val = builder.load(attr_ptr)
+        return attr_val
+
+    @cuda_lower_registry.lower_setattr(ptr_type, "linear_id")
+    def ndarray_view_pointer_set_linear_id(context, builder, sig, args):
+        data = builder.load(args[0])
+        values = cgutils.create_struct_proxy(ndarray_view_type)(
+            context, builder, value=data
+        )
+        setattr(values, "linear_id", args[1])
+        return builder.store(values._getvalue(), args[0])
+
+    @cuda_lower_registry.lower_getattr(ptr_type, "linear_id")
+    def ndarray_view_pointer_get_linear_id(context, builder, sig, arg):
+        data = builder.load(arg)
+        values = cgutils.create_struct_proxy(ndarray_view_type)(
+            context, builder, value=data
+        )
+        attr_ptr = values._get_ptr_by_name("linear_id")
+        attr_val = builder.load(attr_ptr)
+        return attr_val
+
+    return ndarray_view_type
+
+
+class NdArrayIteratorKind(_iterators.IteratorKind):
+    """Iterator kind that NdArrayIterator declares as its `iterator_kind_type`."""
+
+
+class NdArrayIterator(_iterators.IteratorBase):
+    """Input iterator over a strided nd-array; strides are counted in elements."""
+
+    iterator_kind_type = NdArrayIteratorKind
+
+    def __init__(
+        self, ptr: int, value_type: types.Type, shape: Tuple[int], strides: Tuple[int]
+    ):
+        """Build state; raise ValueError if `shape` and `strides` differ in length."""
+        ndim = len(shape)
+        if len(strides) != ndim:
+            raise ValueError("shape and strides must have the same length")
+        state_numba_type = strided_view_iterator_numba_type(value_type, ndim)
+        super().__init__(
+            cvalue=iterator_struct_ctype(ptr, ndim, shape, strides),
+            numba_type=types.CPointer(state_numba_type),
+            state_type=state_numba_type,
+            value_type=value_type,
+        )
+
+    @staticmethod
+    def advance(state_ref, distance):
+        state_ref.linear_id = state_ref.linear_id + distance
+
+    @staticmethod
+    def dereference(state_ref):
+        state = state_ref[0]
+        id_ = state.linear_id
+        # init offset_ to zero of the same type as id_
+        offset_ = id_ - id_
+        ndim_ = state.ndim
+        if ndim_ > 0:
+            shape_ = state.shape
+            strides_ = state.strides
+            one_i32 = numba.int32(1)
+            # unravel id_ from the last axis down to axis 1; axis 0 takes the quotient
+            for i in range(one_i32, ndim_):
+                bi_ = ndim_ - i
+                sh_i = shape_[bi_]
+                if sh_i > 0:
+                    q_ = id_ // sh_i
+                    r_ = id_ - q_ * sh_i
+                else:
+                    q_ = id_
+                    r_ = id_ - id_  # make zero of the right type
+                offset_ = offset_ + r_ * strides_[bi_]
+                id_ = q_
+            zero_i32 = one_i32 - one_i32
+            offset_ = offset_ + id_ * strides_[zero_i32]
+        val = (state.ptr)[offset_]
+        return val
+
+
+def make_ndarray_iterator(array_like, perm):
+    """Return an NdArrayIterator over `array_like` with its axes ordered by `perm`.
+
+    Raises ValueError when a selected stride is not a multiple of the item size.
+    """
+    itemsize = array_like.itemsize
+    shape_, strides_ = array_like.shape, array_like.strides
+    # raise instead of assert: the check must survive `python -O`
+    if any(strides_[idx] % itemsize for idx in perm):
+        raise ValueError("array strides must be multiples of the item size")
+    # per-axis tuples also work for an empty `perm`, unlike unpacking zip(*...)
+    perm_shape = tuple(shape_[idx] for idx in perm)
+    perm_strides = tuple(strides_[idx] // itemsize for idx in perm)
+    dt = numba.from_dtype(array_like.dtype)
+    return NdArrayIterator(array_like.data.ptr, dt, perm_shape, perm_strides)
diff --git a/python/cuda_parallel/tests/test_segmented_reduce_api.py b/python/cuda_parallel/tests/test_segmented_reduce_api.py
@@ -105,3 +105,133 @@ def scale(row_id):
     expected = cp.sum(mat, axis=-1)
     assert cp.all(d_output == expected)
     # example-end segmented-reduce-columnwise-total
+
+
+def test_segmented_reduce_for_columnwise_max():
+    """Compute column-wise maxima with segmented_reduce over a strided view."""
+    # example-begin segmented-reduce-columnwise-maximum
+    import cupy as cp
+    import numpy as np
+
+    import cuda.parallel.experimental.algorithms as algorithms
+    import cuda.parallel.experimental.iterators as iterators
+    from cuda.parallel.experimental.iterators._strided import make_ndarray_iterator
+
+    def binary_op(a, b):
+        return max(a, b)
+
+    n_rows, n_cols = 123456, 78
+    rng = cp.random.default_rng()
+    mat = rng.integers(low=-31, high=32, dtype=np.int16, size=(n_rows, n_cols))
+
+    def make_scaler(step):
+        def scale(col_id):
+            return col_id * step
+
+        return scale
+
+    zero = np.int32(0)
+    row_offset = make_scaler(np.int32(n_rows))
+    start_offsets = iterators.TransformIterator(
+        iterators.CountingIterator(zero), row_offset
+    )
+    end_offsets = start_offsets + 1
+
+    d_input = cp.asarray(mat)
+    # identity of max operator is the smallest value held by a type
+    h_init = np.asarray(np.iinfo(np.int16).min, dtype=np.int16)
+    d_output = cp.empty(n_cols, dtype=d_input.dtype)
+
+    # traverse the input with axes permuted to (1, 0): each column is one segment
+    input_it = make_ndarray_iterator(d_input, (1, 0))
+
+    alg = algorithms.segmented_reduce(
+        input_it, d_output, start_offsets, end_offsets, binary_op, h_init
+    )
+
+    # query size of temporary storage and allocate
+    temp_nbytes = alg(
+        None, input_it, d_output, n_cols, start_offsets, end_offsets, h_init
+    )
+    temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
+    # launch computation
+    alg(temp_storage, input_it, d_output, n_cols, start_offsets, end_offsets, h_init)
+
+    # Verify correctness
+    expected = cp.max(mat, axis=0)
+    assert cp.all(d_output == expected)
+    # example-end segmented-reduce-columnwise-maximum
+
+
+def test_segmented_reduce_for_multiaxis_sum():
+    """Sum a 4-D array over axes (0, 2) with segmented_reduce over a strided view."""
+    # example-begin segmented-reduce-multiaxis-sum
+    import math
+
+    import cupy as cp
+    import numpy as np
+
+    import cuda.parallel.experimental.algorithms as algorithms
+    import cuda.parallel.experimental.iterators as iterators
+    from cuda.parallel.experimental.iterators._strided import make_ndarray_iterator
+
+    def binary_op(a, b):
+        return a + b
+
+    n0, n1, n2, n3 = 123, 18, 231, 17
+    rng = cp.random.default_rng()
+    arr = rng.integers(low=-31, high=32, dtype=np.int32, size=(n0, n1, n2, n3))
+
+    def make_scaler(step):
+        def scale(segment_id):
+            return segment_id * step
+
+        return scale
+
+    reduce_axis = (0, 2)
+    iterate_axis = (1, 3)
+
+    reduce_nelems = math.prod([arr.shape[i] for i in reduce_axis])
+    iter_nelems = math.prod([arr.shape[i] for i in iterate_axis])
+
+    zero = np.int32(0)
+    scaler_fn = make_scaler(np.int32(reduce_nelems))
+    start_offsets = iterators.TransformIterator(
+        iterators.CountingIterator(zero), scaler_fn
+    )
+    end_offsets = start_offsets + 1
+
+    d_input = arr
+    # identity of plus operator is 0
+    h_init = np.zeros(tuple(), dtype=np.int32)
+    d_output = cp.empty(iter_nelems, dtype=d_input.dtype)
+
+    # order axes as iterate_axis + reduce_axis so each segment is consecutive
+    input_it = make_ndarray_iterator(d_input, iterate_axis + reduce_axis)
+
+    alg = algorithms.segmented_reduce(
+        input_it, d_output, start_offsets, end_offsets, binary_op, h_init
+    )
+
+    # query size of temporary storage and allocate
+    temp_nbytes = alg(
+        None, input_it, d_output, iter_nelems, start_offsets, end_offsets, h_init
+    )
+    temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
+    # launch computation
+    alg(
+        temp_storage,
+        input_it,
+        d_output,
+        iter_nelems,
+        start_offsets,
+        end_offsets,
+        h_init,
+    )
+
+    # Verify correctness
+    actual = cp.reshape(d_output, tuple(arr.shape[i] for i in iterate_axis))
+    expected = cp.sum(arr, axis=reduce_axis)
+
+    assert cp.all(actual == expected)
+    # example-end segmented-reduce-multiaxis-sum