Skip to content

Commit 1d2be44

Browse files
Host incrementable iterator approach 2 (#4697)
* Checkpoint for implementing approach 2 in gh-4148 * Checkpoint 2 * Printing added for debugging * Almost working version * Remove superfluous import, increased num_segments to exceed INT_MAX * Define segment size variable and use it in the test * Restore INT_MAX increment in streaming iterations in segmented_reduce * Remove stdio from indirect_arg, remove printf * Test for large_num_segments is parametrized by three modulus params The test cooks input sequence in the form of F(i+1) - F(i) for 0 <= i < num_segments * segment_size. Fixed-size segmented reduction is expected to to be [ F(segment_size * (k + 1)) - F(segment_size * k) for k in range(num_segments) ] All operations (addition (+), inversion (-)) are assumed to take place in a ring. In this test the ring is a cycling ring of integers modulo 7. F(i) = g(i) * g(i+1) % 7, where g: i -> (i % 5 + i % 3). Note that `my_add` computes addition modulo 7 too. Choice of moduli 3, 5, 7 is arbitrary and may be changed. The test is expected to pass so long as they are changed consistently throughout the test. * Add test_large_num_segments3 This example computes segmented reduce of input constant iterator of int8(1) values. The offsets iterators are crafted to that segment lengths are 1, 2, ..., (p - 2), (p - 1), 1, 2, ..., (p - 2), (p - 1), 1, ... The expected sums would equal these lengths. The example is run for (2**15 + 2**3) * 2**16 number of segments which is INT_MAX and about 2**19 elements. 1. Input iterator changed to have type int16 2. Initial value changed to have type int16 3. Changed range of change of segments from [1, p] to [m0, m0 + p) and set m0, p as they are set in native test. Specifically, m0, p = 265, 163 This means segment sizes vary from 265 elements to 428 elements, which is an improvement over previous situation where it was varying from 1 to 113 elements (and most threads in a 256-string block were unused). * Change indirect_iterator_t::operator+= to throw if host_advance_fn is null This ensures meaningful prescriptive cerr output. * Enable large num_segments test in test_segmented_reduce.cpp * Add docstrings to test_large_num_segments and test_large_num_segments3 * Fix type in the test name in test_bindings * Remove make_advance_iterator. Implement IteratorBase.__add__ to copy state and advance cvalue. We rely on cvalue being a ctype for which __add__ is defined * Add host_advance_ property to iterators For IteratorBase it is defined as None For concrete iterators, it is defined as input_iterators, except for transform iterator, where host_advance does not involve numba-cuda compiled object. * Add stubs for Iterators.host_advance_fn getter/setter * Add function to set host_advance_fn member of bindings Iterator member * segmented_reduce calls to set host_advance_fn on output and start/end offset iterators This is done during build step irrespective of the input size, but it should perhaps be done as needed the first time num_segments > INT_MAX is encountered. TBD * Remove band-aid setting of host_advance_fn. The host_advance function is now generated and set during segmented_reduce build step. * Use concept to narrow types of supported increment in indirect_iterator_t::operator+= * Change validation of large num_segments tests in Python Validation is now performed by constructing an iterator yielding expected result. The allocating fixed size uint8 buffer and running binary transform to compare actual to expected and write result of comparison to the validation buffer. We then use `cp.all(validation.view(cp.bool_))` to validate this chunk, and move to the next. * Mark large num_segments tests as large * Run large tests, but do it sequentially (-n 0)
1 parent 1ea3eeb commit 1d2be44

File tree

12 files changed

+576
-83
lines changed

12 files changed

+576
-83
lines changed

c/parallel/include/cccl/c/types.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#endif // !_WIN32
2222

2323
#include <stddef.h>
24+
#include <stdint.h>
2425

2526
#include <cccl/c/extern_c.h>
2627

@@ -77,6 +78,14 @@ typedef struct cccl_value_t
7778
void* state;
7879
} cccl_value_t;
7980

81+
typedef union
82+
{
83+
int64_t signed_offset;
84+
uint64_t unsigned_offset;
85+
} cccl_increment_t;
86+
87+
typedef void (*cccl_host_op_fn_ptr_t)(void*, cccl_increment_t);
88+
8089
typedef struct cccl_iterator_t
8190
{
8291
size_t size;
@@ -86,6 +95,7 @@ typedef struct cccl_iterator_t
8695
cccl_op_t dereference;
8796
cccl_type_info value_type;
8897
void* state;
98+
cccl_host_op_fn_ptr_t host_advance;
8999
} cccl_iterator_t;
90100

91101
typedef enum cccl_sort_order_t

c/parallel/src/segmented_reduce.cu

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ CUresult cccl_device_segmented_reduce_build(
276276

277277
const std::string input_iterator_src =
278278
make_kernel_input_iterator(offset_t, "input_iterator_t", input_it_value_t, input_it);
279+
279280
const std::string output_iterator_src =
280281
make_kernel_output_iterator(offset_t, "output_iterator_t", accum_cpp, output_it);
281282

@@ -407,9 +408,9 @@ CUresult cccl_device_segmented_reduce(
407408

408409
auto exec_status = cub::DispatchSegmentedReduce<
409410
indirect_arg_t, // InputIteratorT
410-
indirect_arg_t, // OutputIteratorT
411-
indirect_arg_t, // BeginSegmentIteratorT
412-
indirect_arg_t, // EndSegmentIteratorT
411+
indirect_iterator_t, // OutputIteratorT
412+
indirect_iterator_t, // BeginSegmentIteratorT
413+
indirect_iterator_t, // EndSegmentIteratorT
413414
OffsetT, // OffsetT
414415
indirect_arg_t, // ReductionOpT
415416
indirect_arg_t, // InitT
@@ -421,10 +422,10 @@ CUresult cccl_device_segmented_reduce(
421422
d_temp_storage,
422423
*temp_storage_bytes,
423424
d_in,
424-
d_out,
425+
indirect_iterator_t{d_out},
425426
num_segments,
426-
start_offset,
427-
end_offset,
427+
indirect_iterator_t{start_offset},
428+
indirect_iterator_t{end_offset},
428429
op,
429430
init,
430431
stream,

c/parallel/src/util/indirect_arg.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010

1111
#pragma once
1212

13+
#include <cstdint>
14+
#include <stdexcept>
15+
#include <type_traits>
16+
1317
#include <cccl/c/types.h>
1418

1519
struct indirect_arg_t
@@ -33,3 +37,69 @@ struct indirect_arg_t
3337
return ptr;
3438
}
3539
};
40+
41+
template <typename U>
42+
concept Increment64 = std::is_integral_v<U> && sizeof(U) == sizeof(int64_t);
43+
44+
struct indirect_iterator_t
45+
{
46+
void* ptr;
47+
size_t value_size;
48+
cccl_host_op_fn_ptr_t host_advance_fn_p;
49+
50+
indirect_iterator_t(cccl_iterator_t& it)
51+
: ptr{nullptr}
52+
, value_size{0}
53+
, host_advance_fn_p{nullptr}
54+
{
55+
if (it.type == cccl_iterator_kind_t::CCCL_POINTER)
56+
{
57+
value_size = it.value_type.size;
58+
ptr = &it.state;
59+
}
60+
else
61+
{
62+
ptr = it.state;
63+
host_advance_fn_p = it.host_advance;
64+
}
65+
}
66+
67+
void* operator&() const
68+
{
69+
return ptr;
70+
}
71+
72+
template <Increment64 U>
73+
void operator+=(U offset)
74+
{
75+
if (value_size)
76+
{
77+
// CCCL_POINTER case
78+
// ptr is a pointer to pointer we need to increment
79+
// read the iterator pointer value
80+
char*& p = *static_cast<char**>(ptr);
81+
// increment the value
82+
p += (offset * value_size);
83+
}
84+
else
85+
{
86+
if (host_advance_fn_p)
87+
{
88+
if constexpr (std::is_signed_v<U>)
89+
{
90+
cccl_increment_t incr{.signed_offset = offset};
91+
(*host_advance_fn_p)(ptr, incr);
92+
}
93+
else
94+
{
95+
cccl_increment_t incr{.unsigned_offset = offset};
96+
(*host_advance_fn_p)(ptr, incr);
97+
}
98+
}
99+
else
100+
{
101+
throw std::runtime_error("Attempt to increment iterator from host, but host advance function is not defined");
102+
}
103+
}
104+
}
105+
};

c/parallel/test/test_segmented_reduce.cpp

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,22 @@ struct host_check_functor_state
462462
DataT* m_ptr;
463463
};
464464

465+
template <typename StateT>
466+
void host_advance_transform_it_state(void* state, cccl_increment_t offset)
467+
{
468+
auto st = reinterpret_cast<StateT*>(state);
469+
using IndexT = decltype(st->base_it_state.value);
470+
471+
if constexpr (std::is_signed_v<IndexT>)
472+
{
473+
st->base_it_state.value += offset.signed_offset;
474+
}
475+
else
476+
{
477+
st->base_it_state.value += offset.unsigned_offset;
478+
}
479+
}
480+
465481
namespace validate
466482
{
467483

@@ -535,14 +551,12 @@ C2H_TEST("SegmentedReduce works with large num_segments", "[segmented_reduce]")
535551

536552
static constexpr IndexT n_segments_base = (IndexT(1) << 15) + (IndexT(1) << 3);
537553
static constexpr IndexT n_segments_under_int_max = n_segments_base << 10;
554+
static_assert(n_segments_under_int_max < INT_MAX);
538555

539-
#ifdef SUPPORTS_HOST_INCREMENT
540556
static constexpr IndexT n_segments_over_int_max = n_segments_base << 16;
557+
static_assert(n_segments_over_int_max > INT_MAX);
541558

542559
const IndexT n_segments = GENERATE(n_segments_under_int_max, n_segments_over_int_max);
543-
#else
544-
const IndexT n_segments = GENERATE(n_segments_under_int_max, n_segments_under_int_max * 8);
545-
#endif
546560

547561
// first define constant iterator:
548562
// iterators.ConstantIterator(np.int8(1))
@@ -660,6 +674,10 @@ extern "C" __device__ void {0}(const void *x1_p, const void *x2_p, void *out_p)
660674
auto cccl_start_offsets_it = static_cast<cccl_iterator_t>(start_offsets_it);
661675
auto cccl_end_offsets_it = static_cast<cccl_iterator_t>(end_offsets_it);
662676

677+
// set host_advance functions
678+
cccl_start_offsets_it.host_advance = &host_advance_transform_it_state<HostTransformStateT>;
679+
cccl_end_offsets_it.host_advance = &host_advance_transform_it_state<HostTransformStateT>;
680+
663681
value_t<DataT> h_init{DataT{0}};
664682

665683
auto& build_cache = get_cache<SegmentedReduce_LargeNumSegments_Fixture_Tag>();

ci/test_cuda_parallel_python.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ python -m pip install "${CUDA_PARALLEL_WHEEL_PATH}[test]"
2121
# Run tests
2222
cd "/home/coder/cccl/python/cuda_parallel/tests/"
2323
python -m pytest -n 6 -v -m "not large"
24+
python -m pytest -n 0 -v -m "large"

python/cuda_parallel/cuda/parallel/experimental/_bindings.pyi

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ class Iterator:
132132
dereference_fn: Op,
133133
value_type: TypeInfo,
134134
state=None,
135+
host_advance_fn=None,
135136
):
136137
pass
137138

@@ -148,6 +149,10 @@ class Iterator:
148149
def as_bytes(self) -> bytes: ...
149150
def is_kind_pointer(self) -> bool: ...
150151
def is_kind_iterator(self) -> bool: ...
152+
@property
153+
def host_advance_fn(self): ...
154+
@host_advance_fn.setter
155+
def host_advance_fn(self, value) -> None: ...
151156

152157
class CommonData:
153158
def __init__(

0 commit comments

Comments
 (0)