Skip to content

Commit 848b524

Browse files
andralex, caugonnet, oleksandr-pavlyk, davebayer, gevtushenko
authored
[STF] Refactor CUDASTF allocators (NVIDIA#4306)
* clang-format * remove unnecessary headers * Remove the few remaining `_CCCL_NODISCARD` qualifiers. This fixes the build after the macro definition was removed in NVIDIA#4265 * fix headers * Cleanup libcu++ `force_include.h` test file (NVIDIA#4262) * there are no more tests in this header * Simplify and reduce data copying in algorithm.cuh * Much less tupling and untupling * A few additional simplifications and a few eliminations of copies * Fix ratio plot (NVIDIA#4099) * Fix ratio plot * [pre-commit.ci] auto code formatting --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com> * Drop `_CCCL_NORETURN` (NVIDIA#4268) Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com> * fix clang portability issue in `__rcvr_with_env_t` and remove dead code (NVIDIA#4277) * change version check in `type_list.h` so that *NO* clang-19.X compilers try to use pack indexing (NVIDIA#4278) * fix shfl check (NVIDIA#4282) * Add necessary headers * tweak the cccl compiler version check macros to better agree with intuition (NVIDIA#4279) * tweak the cccl compiler version check macros to better agree with intuition prior to this commit, a compiler check such as: ```c++ ``` would fail if the compiler was actually v19.1. That is because 19.1 is greater than 19. What the author of this code probably intended was to check only the compiler's major version number, in which case the check would have succeeded.
this commit changes the behavior of the following macros when only a major version number is specified: * `_CCCL_COMPILER` * `_CCCL_CUDA_COMPILER` * `_CCCL_CUDACC_BELOW` * `_CCCL_CUDACC_AT_LEAST` * guard `_CCCL_COMPILER(FOO)` with an extra set of parens * Implement `ranges::single_view` (NVIDIA#4255) * Implement fp overflow handlers (NVIDIA#4261) * Implement fp overflow handlers * I hate nvfp types * use `[[nodiscard]]` --------- Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com> * Fix CI failure * Drop `_LIBCUDACXX_HAS_NO_UNICODE_CHARS` (NVIDIA#4295) * [Version] Update main to v3.1.0 (NVIDIA#4175) * Bump main to 3.1.0. * Update ci/update_version.sh to edit the docs VERSION.md file * Rerun ci/version_update.sh --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Wesley Maxey <wesley.maxey@gmail.com> * WIP: Refactor allocators * Make metadata direct member instead of pointer * Add cached_block_allocator_fifo from PR NVIDIA#2674 * use _CCCL_ASSERT instead of assert * add missing ctors --------- Co-authored-by: Cedric Augonnet <caugonnet@nvidia.com> Co-authored-by: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Co-authored-by: David Bayer <48736217+davebayer@users.noreply.github.com> Co-authored-by: Georgii Evtushenko <evtushenko.georgy@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com> Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com> Co-authored-by: Eric Niebler <eniebler@nvidia.com> Co-authored-by: Federico Busato <50413820+fbusato@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Wesley Maxey <wesley.maxey@gmail.com> Co-authored-by: Cédric Augonnet <158148890+caugonnet@users.noreply.github.com>
1 parent 8326a95 commit 848b524

File tree

4 files changed

+237
-48
lines changed

4 files changed

+237
-48
lines changed

cudax/include/cuda/experimental/__stf/allocators/block_allocator.cuh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -174,7 +174,7 @@ class block_allocator : public block_allocator_untyped
174174
{
175175
public:
176176
template <typename ctx_t, typename... Args>
177-
block_allocator(ctx_t& ctx, Args... args)
177+
block_allocator(ctx_t& ctx, Args&&... args)
178178
: block_allocator_untyped(ctx, ::std::make_shared<T>(::std::forward<Args>(args)...))
179179
{}
180180

cudax/include/cuda/experimental/__stf/allocators/buddy_allocator.cuh

Lines changed: 42 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -63,26 +63,21 @@ private:
6363

6464
public:
6565
buddy_allocator_metadata(size_t size, event_list init_prereqs)
66+
: free_lists_(int_log2(next_power_of_two(size)) + 1)
6667
{
67-
total_size_ = next_power_of_two(size);
68-
assert(total_size_ == size);
69-
70-
max_level_ = int_log2(total_size_);
71-
72-
free_lists_.resize(max_level_ + 1);
73-
68+
_CCCL_ASSERT(size && (size & (size - 1)) == 0,
69+
"Allocation requests for this allocator must pass a size that is a power of two.");
7470
// Initially, the whole memory is free, but depends on init_prereqs
75-
free_lists_[max_level_].emplace_back(0, init_prereqs);
71+
free_lists_.back().emplace_back(0, mv(init_prereqs));
7672
}
7773

7874
::std::ptrdiff_t allocate(size_t size, event_list& prereqs)
7975
{
80-
size = next_power_of_two(size);
81-
size_t level = int_log2(size);
82-
83-
if (level > max_level_)
76+
size = next_power_of_two(size);
77+
const size_t level = int_log2(size);
78+
if (level >= free_lists_.size())
8479
{
85-
fprintf(stderr, "Level %zu > max level %zu\n", level, max_level_);
80+
fprintf(stderr, "Level %zu > max level %zu\n", level, free_lists_.size() - 1);
8681
return -1;
8782
}
8883

@@ -104,11 +99,11 @@ public:
10499
// Deallocated blocks will depend on these, and we will merge the
105100
// previous dependencies when merging buddies
106101
event_list block_prereqs(prereqs);
107-
108-
while (level < max_level_)
102+
const size_t max_level = free_lists_.size() - 1;
103+
while (level < max_level)
109104
{
110-
size_t buddy_index = get_buddy_index(index, level);
111-
auto& buddy_list = free_lists_[level];
105+
const size_t buddy_index = get_buddy_index(index, level);
106+
auto& buddy_list = free_lists_[level];
112107
auto it = ::std::find_if(buddy_list.begin(), buddy_list.end(), [buddy_index](const avail_block& block) {
113108
return block.index == buddy_index;
114109
});
@@ -144,7 +139,7 @@ public:
144139
void debug_print() const
145140
{
146141
size_t power = 1;
147-
for (size_t i = 0; i <= max_level_; ++i, power *= 2)
142+
for (size_t i = 0; i < free_lists_.size(); ++i, power *= 2)
148143
{
149144
if (!free_lists_[i].empty())
150145
{
@@ -161,16 +156,22 @@ public:
161156
private:
162157
static size_t next_power_of_two(size_t size)
163158
{
159+
static_assert(sizeof(size_t) <= 8, "You must be from the future. Review and adjust this code.");
164160
if (size == 0)
165161
{
166162
return 1;
167163
}
168-
size_t power = 1;
169-
while (power < size)
164+
--size;
165+
size |= size >> 1;
166+
size |= size >> 2;
167+
size |= size >> 4;
168+
size |= size >> 8;
169+
size |= size >> 16;
170+
if constexpr (sizeof(size_t) == 8)
170171
{
171-
power *= 2;
172+
size |= size >> 32;
172173
}
173-
return power;
174+
return size + 1;
174175
}
175176

176177
static size_t int_log2(size_t n)
@@ -187,7 +188,7 @@ private:
187188

188189
::std::ptrdiff_t find_free_block(size_t level, event_list& prereqs)
189190
{
190-
for (size_t current_level : each(level, max_level_ + 1))
191+
for (size_t current_level : each(level, free_lists_.size()))
191192
{
192193
if (free_lists_[current_level].empty())
193194
{
@@ -222,8 +223,6 @@ private:
222223
}
223224

224225
::std::vector<::std::vector<avail_block>> free_lists_;
225-
size_t total_size_ = 0;
226-
size_t max_level_ = 0;
227226
};
228227

229228
} // end namespace reserved
@@ -244,16 +243,20 @@ private:
244243
// Per data place buffer and its corresponding metadata
245244
struct per_place
246245
{
247-
per_place(void* base_, size_t size, event_list& prereqs)
246+
per_place(void* base_, size_t size, event_list prereqs)
248247
: base(base_)
249248
, buffer_size(size)
250-
{
251-
metadata = ::std::make_shared<reserved::buddy_allocator_metadata>(buffer_size, prereqs);
252-
}
249+
, metadata(buffer_size, mv(prereqs))
250+
{}
251+
252+
per_place& operator=(const per_place&) = delete;
253+
per_place& operator=(per_place&&) = default;
254+
per_place(const per_place&) = delete;
255+
per_place(per_place&&) = default;
253256

254257
void* base = nullptr;
255258
size_t buffer_size = 0;
256-
::std::shared_ptr<reserved::buddy_allocator_metadata> metadata;
259+
reserved::buddy_allocator_metadata metadata;
257260
};
258261

259262
public:
@@ -270,28 +273,28 @@ public:
270273
void* base = a.allocate(ctx, memory_node, sz, prereqs);
271274

272275
// 2. creates meta data for that buffer, and 3. associate it to the data place
273-
it = map.emplace(memory_node, ::std::make_shared<per_place>(base, sz, prereqs)).first;
276+
it = map.emplace(memory_node, per_place(base, sz, prereqs)).first;
274277
}
275278

276279
// There should be exactly one entry in the map
277280
assert(map.count(memory_node) == 1);
278281
auto& m = it->second;
279282

280-
::std::ptrdiff_t offset = m->metadata->allocate(s, prereqs);
283+
::std::ptrdiff_t offset = m.metadata.allocate(s, prereqs);
281284
assert(offset != -1);
282-
return static_cast<char*>(m->base) + offset;
285+
return static_cast<char*>(m.base) + offset;
283286
}
284287

285288
void
286289
deallocate(backend_ctx_untyped&, const data_place& memory_node, event_list& prereqs, void* ptr, size_t sz) override
287290
{
288291
// There should be exactly one entry in the map
289292
assert(map.count(memory_node) == 1);
290-
auto& m = map[memory_node];
293+
auto& m = map.find(memory_node)->second;
291294

292-
size_t offset = static_cast<char*>(ptr) - static_cast<char*>(m->base);
295+
size_t offset = static_cast<char*>(ptr) - static_cast<char*>(m.base);
293296

294-
m->metadata->deallocate(offset, sz, prereqs);
297+
m.metadata.deallocate(offset, sz, prereqs);
295298
}
296299

297300
event_list deinit(backend_ctx_untyped& ctx) override
@@ -303,11 +306,11 @@ public:
303306
event_list local_prereqs;
304307

305308
// Deinitialize the metadata of the buddy allocator for this place
306-
pp->metadata->deinit(local_prereqs);
309+
pp.metadata.deinit(local_prereqs);
307310

308311
// Deallocate the underlying buffer for this buddy allocator
309312
auto& a = root_allocator ? root_allocator : ctx.get_uncached_allocator();
310-
a.deallocate(ctx, memory_node, local_prereqs, pp->base, pp->buffer_size);
313+
a.deallocate(ctx, memory_node, local_prereqs, pp.base, pp.buffer_size);
311314

312315
result.merge(local_prereqs);
313316
}
@@ -320,7 +323,7 @@ public:
320323
}
321324

322325
private:
323-
::std::unordered_map<data_place, ::std::shared_ptr<per_place>, hash<data_place>> map;
326+
::std::unordered_map<data_place, per_place, hash<data_place>> map;
324327

325328
block_allocator_untyped root_allocator;
326329
};

0 commit comments

Comments
 (0)