Skip to content

Commit 41e1124

Browse files
Implement indirect_host_incrementable_iterator_t as outlined as approach 1 of NVIDIAgh-4148
1 parent 6b55826 commit 41e1124

File tree

4 files changed

+275
-8
lines changed

4 files changed

+275
-8
lines changed

c/parallel/src/kernels/iterators.cpp

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,53 @@ struct __align__(OP_ALIGNMENT) {0} {{
7373
return std::format(format_template, diff_t, alignment, size, value_t, deref, advance, iter_def);
7474
};
7575

76+
std::string make_kernel_input_iterator_with_host_increment(
77+
std::string_view diff_t,
78+
size_t alignment,
79+
size_t size,
80+
std::string_view iterator_name,
81+
std::string_view value_t,
82+
std::string_view deref,
83+
std::string_view advance)
84+
{
85+
const std::string iter_def = std::format(R"XXX(
86+
extern "C" __device__ VALUE_T DEREF(const void *self_ptr);
87+
extern "C" __device__ void ADVANCE(void *self_ptr, DIFF_T offset);
88+
struct __align__(OP_ALIGNMENT) {0} {{
89+
using iterator_category = cuda::std::random_access_iterator_tag;
90+
using value_type = VALUE_T;
91+
using difference_type = DIFF_T;
92+
using pointer = VALUE_T*;
93+
using reference = VALUE_T&;
94+
95+
static_assert(sizeof(difference_type) == sizeof(long long int));
96+
97+
__device__ inline value_type operator*() const {{
98+
const {0} &it = (*this + host_offset);
99+
return DEREF(it.data);
100+
}}
101+
__device__ inline {0}& operator+=(difference_type diff) {{
102+
ADVANCE(data, diff);
103+
return *this;
104+
}}
105+
__device__ inline value_type operator[](difference_type diff) const {{
106+
const {0} &it = (*this + (diff + host_offset));
107+
return DEREF(it.data);
108+
}}
109+
__device__ inline {0} operator+(difference_type diff) const {{
110+
{0} result = *this;
111+
result += diff;
112+
return result;
113+
}}
114+
char data[OP_SIZE];
115+
difference_type host_offset;
116+
}};
117+
)XXX",
118+
iterator_name);
119+
120+
return std::format(format_template, diff_t, alignment, size, value_t, deref, advance, iter_def);
121+
};
122+
76123
std::string make_kernel_input_iterator(
77124
std::string_view offset_t, std::string_view iterator_name, std::string_view input_value_t, cccl_iterator_t iter)
78125
{
@@ -85,6 +132,18 @@ std::string make_kernel_input_iterator(
85132
offset_t, iter.alignment, iter.size, iterator_name, input_value_t, iter.dereference.name, iter.advance.name);
86133
}
87134

135+
std::string make_kernel_input_iterator_with_host_increment(
136+
std::string_view offset_t, std::string_view iterator_name, std::string_view input_value_t, cccl_iterator_t iter)
137+
{
138+
if (iter.type == cccl_iterator_kind_t::CCCL_POINTER)
139+
{
140+
return {};
141+
}
142+
143+
return make_kernel_input_iterator_with_host_increment(
144+
offset_t, iter.alignment, iter.size, iterator_name, input_value_t, iter.dereference.name, iter.advance.name);
145+
}
146+
88147
std::string make_kernel_output_iterator(
89148
std::string_view diff_t,
90149
size_t alignment,
@@ -148,6 +207,73 @@ std::string make_kernel_output_iterator(
148207
offset_t, iter.alignment, iter.size, iterator_name, input_value_t, iter.dereference.name, iter.advance.name);
149208
}
150209

210+
std::string make_kernel_output_iterator_with_host_increment(
211+
std::string_view diff_t,
212+
size_t alignment,
213+
size_t size,
214+
std::string_view iterator_name,
215+
std::string_view value_t,
216+
std::string_view deref,
217+
std::string_view advance)
218+
{
219+
const std::string iter_def = std::format(R"XXX(
220+
extern "C" __device__ void DEREF(const void *self_ptr, VALUE_T x);
221+
extern "C" __device__ void ADVANCE(void *self_ptr, DIFF_T offset);
222+
struct __align__(OP_ALIGNMENT) {0}_state_t {{
223+
char data[OP_SIZE];
224+
}};
225+
struct {0}_proxy_t {{
226+
__device__ {0}_proxy_t operator=(VALUE_T x) {{
227+
DEREF(&state, x);
228+
return *this;
229+
}}
230+
{0}_state_t state;
231+
}};
232+
struct {0} {{
233+
using iterator_category = cuda::std::random_access_iterator_tag;
234+
using difference_type = DIFF_T;
235+
using value_type = void;
236+
using pointer = {0}_proxy_t*;
237+
using reference = {0}_proxy_t;
238+
__device__ {0}_proxy_t operator*() const {{
239+
const {0} &it = (*this + host_offset);
240+
return {{it.state}};
241+
}}
242+
__device__ {0}& operator+=(difference_type diff) {{
243+
ADVANCE(&state, diff);
244+
return *this;
245+
}}
246+
__device__ {0}_proxy_t operator[](difference_type diff) const {{
247+
{0} result = *this;
248+
result += (diff + host_offset);
249+
return {{ result.state }};
250+
}}
251+
__device__ {0} operator+(difference_type diff) const {{
252+
{0} result = *this;
253+
result += diff;
254+
return result;
255+
}}
256+
{0}_state_t state;
257+
difference_type host_offset;
258+
}};
259+
)XXX",
260+
iterator_name);
261+
262+
return std::format(format_template, diff_t, alignment, size, value_t, deref, advance, iter_def);
263+
};
264+
265+
std::string make_kernel_output_iterator_with_host_increment(
266+
std::string_view offset_t, std::string_view iterator_name, std::string_view input_value_t, cccl_iterator_t iter)
267+
{
268+
if (iter.type == cccl_iterator_kind_t::CCCL_POINTER)
269+
{
270+
return {};
271+
}
272+
273+
return make_kernel_output_iterator_with_host_increment(
274+
offset_t, iter.alignment, iter.size, iterator_name, input_value_t, iter.dereference.name, iter.advance.name);
275+
}
276+
151277
std::string make_kernel_inout_iterator(
152278
std::string_view diff_t,
153279
size_t alignment,

c/parallel/src/kernels/iterators.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@
1818
std::string make_kernel_input_iterator(
1919
std::string_view offset_t, std::string_view iterator_name, std::string_view input_value_t, cccl_iterator_t iter);
2020

21+
std::string make_kernel_input_iterator_with_host_increment(
22+
std::string_view offset_t, std::string_view iterator_name, std::string_view input_value_t, cccl_iterator_t iter);
23+
2124
std::string make_kernel_output_iterator(
2225
std::string_view offset_t, std::string_view iterator_name, std::string_view input_value_t, cccl_iterator_t iter);
2326

27+
std::string make_kernel_output_iterator_with_host_increment(
28+
std::string_view offset_t, std::string_view iterator_name, std::string_view input_value_t, cccl_iterator_t iter);
29+
2430
std::string make_kernel_inout_iterator(std::string_view offset_t, std::string_view input_value_t, cccl_iterator_t iter);

c/parallel/src/segmented_reduce.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -276,12 +276,12 @@ CUresult cccl_device_segmented_reduce_build(
276276
const std::string input_iterator_src =
277277
make_kernel_input_iterator(offset_t, "input_iterator_t", input_it_value_t, input_it);
278278
const std::string output_iterator_src =
279-
make_kernel_output_iterator(offset_t, "output_iterator_t", accum_cpp, output_it);
279+
make_kernel_output_iterator_with_host_increment(offset_t, "output_iterator_t", accum_cpp, output_it);
280280

281-
const std::string start_offset_iterator_src =
282-
make_kernel_input_iterator(offset_t, "start_offset_iterator_t", start_offset_it_value_t, start_offset_it);
283-
const std::string end_offset_iterator_src =
284-
make_kernel_input_iterator(offset_t, "end_offset_iterator_t", end_offset_it_value_t, end_offset_it);
281+
const std::string start_offset_iterator_src = make_kernel_input_iterator_with_host_increment(
282+
offset_t, "start_offset_iterator_t", start_offset_it_value_t, start_offset_it);
283+
const std::string end_offset_iterator_src = make_kernel_input_iterator_with_host_increment(
284+
offset_t, "end_offset_iterator_t", end_offset_it_value_t, end_offset_it);
285285

286286
const std::string op_src = make_kernel_user_binary_operator(accum_cpp, accum_cpp, accum_cpp, op);
287287

@@ -406,9 +406,9 @@ CUresult cccl_device_segmented_reduce(
406406

407407
cub::DispatchSegmentedReduce<
408408
indirect_arg_t, // InputIteratorT
409-
indirect_arg_t, // OutputIteratorT
410-
indirect_arg_t, // BeginSegmentIteratorT
411-
indirect_arg_t, // EndSegmentIteratorT
409+
indirect_host_incrementable_iterator_t<OffsetT>, // OutputIteratorT
410+
indirect_host_incrementable_iterator_t<OffsetT>, // BeginSegmentIteratorT
411+
indirect_host_incrementable_iterator_t<OffsetT>, // EndSegmentIteratorT
412412
OffsetT, // OffsetT
413413
indirect_arg_t, // ReductionOpT
414414
indirect_arg_t, // InitT

c/parallel/src/util/indirect_arg.h

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,14 @@
1010

1111
#pragma once
1212

13+
#include <cstdlib> // aligned_alloc
14+
#include <cstring> // for memcpy
15+
#include <iterator>
16+
#include <memory> // for make_unique, unique_ptr
17+
1318
#include <cccl/c/types.h>
19+
#include <stddef.h> // size_t
20+
#include <stdint.h> // uint64_t
1421

1522
struct indirect_arg_t
1623
{
@@ -33,3 +40,131 @@ struct indirect_arg_t
3340
return ptr;
3441
}
3542
};
43+
44+
class FreeDeleter
45+
{
46+
public:
47+
FreeDeleter() = default;
48+
~FreeDeleter() = default;
49+
50+
template <typename U>
51+
void operator()(U* ptr) const
52+
{
53+
std::free(ptr);
54+
}
55+
};
56+
57+
template <typename OffsetT>
58+
struct indirect_host_incrementable_iterator_t
59+
{
60+
static_assert(std::is_integral_v<OffsetT> && sizeof(OffsetT) == sizeof(uint64_t));
61+
62+
using iterator_category = ::std::random_access_iterator_tag;
63+
using value_type = void;
64+
using difference_type = OffsetT;
65+
using pointer = void*;
66+
using reference = void*;
67+
68+
void* state_ptr;
69+
OffsetT* host_offset_ptr;
70+
std::unique_ptr<void, FreeDeleter> owner;
71+
size_t value_size;
72+
size_t allocation_nbytes;
73+
74+
indirect_host_incrementable_iterator_t(cccl_iterator_t& it)
75+
: state_ptr{}
76+
, host_offset_ptr{}
77+
, owner{}
78+
, value_size{}
79+
, allocation_nbytes{}
80+
{
81+
if (it.type == cccl_iterator_kind_t::CCCL_ITERATOR)
82+
{
83+
// we allocate memory to hold state and host_offset value of type uint64_t
84+
// the content of this allocation is to be copied by CUDA driver to the device
85+
const size_t offset_ptr_offset = align_up(it.size, sizeof(OffsetT));
86+
allocation_nbytes = offset_ptr_offset + sizeof(OffsetT);
87+
88+
owner = std::unique_ptr<void, FreeDeleter>(std::aligned_alloc(it.alignment, allocation_nbytes), FreeDeleter{});
89+
state_ptr = owner.get();
90+
// initialized host_offset variable to zero
91+
std::memset(state_ptr, 0, allocation_nbytes);
92+
93+
host_offset_ptr = reinterpret_cast<OffsetT*>(reinterpret_cast<char*>(state_ptr) + offset_ptr_offset);
94+
std::memcpy(state_ptr, it.state, it.size);
95+
}
96+
else
97+
{
98+
state_ptr = &it.state;
99+
value_size = it.value_type.size;
100+
}
101+
}
102+
103+
indirect_host_incrementable_iterator_t(const indirect_host_incrementable_iterator_t& other)
104+
: state_ptr{}
105+
, host_offset_ptr{}
106+
, owner{}
107+
, value_size{}
108+
, allocation_nbytes{}
109+
{
110+
if (other.owner)
111+
{
112+
size_t alignment = reinterpret_cast<uintptr_t>(other.state_ptr) & 63;
113+
allocation_nbytes = other.allocation_nbytes;
114+
owner = std::unique_ptr<void, FreeDeleter>(std::aligned_alloc(alignment, allocation_nbytes), FreeDeleter{});
115+
state_ptr = owner.get();
116+
size_t relative_offset =
117+
(reinterpret_cast<char*>(other.host_offset_ptr) - reinterpret_cast<char*>(other.state_ptr));
118+
host_offset_ptr = reinterpret_cast<OffsetT*>(reinterpret_cast<char*>(state_ptr) + relative_offset);
119+
std::memcpy(state_ptr, other.owner.get(), allocation_nbytes);
120+
}
121+
else
122+
{
123+
state_ptr = other.state_ptr;
124+
value_size = other.value_size;
125+
}
126+
}
127+
128+
template <typename DiffT, std::enable_if_t<std::is_integral_v<DiffT> && sizeof(DiffT) == sizeof(OffsetT), bool> = true>
129+
indirect_host_incrementable_iterator_t& operator+=(DiffT offset)
130+
{
131+
if (host_offset_ptr)
132+
{
133+
// iterator kind: CCCL_ITERATOR
134+
DiffT* p = reinterpret_cast<DiffT*>(host_offset_ptr);
135+
*p += offset;
136+
return *this;
137+
}
138+
else
139+
{
140+
// iterator kind: CCCL_POINTER
141+
char** c_ptr = reinterpret_cast<char**>(state_ptr);
142+
*c_ptr += (offset * value_size);
143+
return *this;
144+
}
145+
}
146+
147+
template <typename DiffT, std::enable_if_t<std::is_integral_v<DiffT> && sizeof(DiffT) == sizeof(OffsetT), bool> = true>
148+
indirect_host_incrementable_iterator_t operator+(DiffT offset) const
149+
{
150+
indirect_host_incrementable_iterator_t temp = *this;
151+
return temp += offset;
152+
}
153+
154+
void* operator&() const
155+
{
156+
return state_ptr;
157+
}
158+
159+
OffsetT get_offset() const
160+
{
161+
return *host_offset_ptr;
162+
}
163+
164+
private:
165+
template <typename IndexT>
166+
IndexT align_up(IndexT n, IndexT m)
167+
{
168+
return ((n + m - 1) / m) * m;
169+
}
170+
};

0 commit comments

Comments
 (0)