Skip to content

Commit 94bb1da

Browse files
authored
Implement double-buffer for ellpack. (#11465)
This helps us keep the external memory cache in two different places.
- Make the accessor a variant type.
- Most of the changes are for calling a visit function rather than actual code changes.
1 parent bcf0b20 commit 94bb1da

20 files changed

+1003
-503
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,3 +163,4 @@ Rplots.pdf
163163

164164
# nsys
165165
*.nsys-rep
166+
rmm_log.dev*

include/xgboost/base.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2015-2024, XGBoost Contributors
2+
* Copyright 2015-2025, XGBoost Contributors
33
* \file base.h
44
* \brief Defines configuration macros and basic types for xgboost.
55
*/
@@ -72,6 +72,14 @@
7272
#define XGBOOST_DEV_INLINE
7373
#endif // defined(__CUDA__) || defined(__CUDACC__)
7474

75+
76+
// Portable spelling of the non-standard `restrict` qualifier: promises the compiler
// that the annotated pointer does not alias, enabling better optimization. MSVC
// spells it `__restrict`; GCC/Clang spell it `__restrict__`.
#if defined(_MSC_VER)
#define XGBOOST_RESTRICT __restrict
#else
#define XGBOOST_RESTRICT __restrict__
#endif
82+
7583
// These check are for Makefile.
7684
#if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
7785
/* default logic for software pre-fetching */
@@ -122,7 +130,7 @@ using bst_target_t = std::uint32_t; // NOLINT
122130
*/
123131
using bst_layer_t = std::int32_t; // NOLINT
124132
/**
125-
* \brief Type for indexing trees.
133+
* @brief Type for indexing trees.
126134
*/
127135
using bst_tree_t = std::int32_t; // NOLINT
128136
/**

src/common/compressed_iterator.h

Lines changed: 115 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
/**
2-
* Copyright 2017-2024, XGBoost Contributors
2+
* Copyright 2017-2025, XGBoost Contributors
33
* \file compressed_iterator.h
44
*/
55
#pragma once
6-
#include <xgboost/base.h>
6+
#include <xgboost/base.h> // for XGBOOST_RESTRICT
77

8-
#include <cmath> // for ceil, log2
9-
#include <cstddef> // for size_t
8+
#include <algorithm> // for max
9+
#include <cmath> // for ceil, log2
10+
#include <cstddef> // for size_t
11+
#include <cstdint> // for uint32_t
1012

1113
#include "common.h"
1214

@@ -79,7 +81,8 @@ class CompressedBufferWriter {
7981
size_t ret = std::ceil(static_cast<double>(compressed_size + detail::kPadding) /
8082
static_cast<double>(sizeof(std::uint32_t))) *
8183
sizeof(std::uint32_t);
82-
return ret;
84+
// Need at least 5 bytes for the reader
85+
return std::max(ret, static_cast<std::size_t>(detail::kPadding + 1));
8386
}
8487

8588
template <typename T>
@@ -212,4 +215,111 @@ class CompressedIterator {
212215
return *offset;
213216
}
214217
};
218+
219+
/**
220+
* @brief A compressed iterator with two buffers for the underlying storage.
221+
*
222+
* This accessor is significantly slower than the single buffer one due to pipeline
223+
* stalling and should not be used as default. Pre-calculating the buffer selection
224+
* indicator can help mitigate it. But we only use this iterator for external memory with
225+
* direct memory access, which is slow anyway.
226+
*
227+
* Use the single buffer one as a reference for how it works.
228+
*/
229+
template <typename OutT>
230+
class DoubleCompressedIter {
231+
public:
232+
// Type definitions for thrust
233+
using self_type = DoubleCompressedIter<OutT>; // NOLINT
234+
using difference_type = ptrdiff_t; // NOLINT
235+
using value_type = OutT; // NOLINT
236+
using pointer = value_type *; // NOLINT
237+
using reference = value_type; // NOLINT
238+
239+
private:
240+
using BufT = CompressedByteT const *;
241+
BufT XGBOOST_RESTRICT buf0_{nullptr};
242+
BufT XGBOOST_RESTRICT buf1_{nullptr};
243+
bst_idx_t const n0_{0}; // Size of the first buffer in bytes.
244+
bst_idx_t const symbol_bits_{0};
245+
std::size_t offset_{0};
246+
247+
public:
248+
DoubleCompressedIter() = default;
249+
DoubleCompressedIter(CompressedByteT const *XGBOOST_RESTRICT buf0, std::size_t n0_bytes,
250+
CompressedByteT const *XGBOOST_RESTRICT buf1, bst_idx_t n_symbols)
251+
: buf0_{buf0}, buf1_{buf1}, n0_{n0_bytes}, symbol_bits_{detail::SymbolBits(n_symbols)} {}
252+
253+
XGBOOST_HOST_DEV_INLINE reference operator*() const {
254+
constexpr std::int32_t kBitsPerByte = 8;
255+
std::size_t start_bit_idx = ((offset_ + 1) * symbol_bits_ - 1);
256+
std::size_t start_byte_idx = start_bit_idx >> 3;
257+
start_byte_idx += detail::kPadding;
258+
259+
std::uint64_t tmp;
260+
261+
if (start_byte_idx >= this->n0_ && (start_byte_idx - 4) < this->n0_) {
262+
// Access between two buffers.
263+
auto getv = [&](auto shift) {
264+
auto shifted = start_byte_idx - shift;
265+
bool ind = (shifted >= n0_); // indicator for which buffer to read
266+
// Pick the buffer to read
267+
auto const *XGBOOST_RESTRICT buf = ind ? buf1_ : buf0_;
268+
shifted -= ind * n0_;
269+
return static_cast<std::uint64_t>(buf[shifted]);
270+
};
271+
// Read 5 bytes - the maximum we will need
272+
tmp = static_cast<std::uint64_t>(buf0_[start_byte_idx - 4]) << 32 | getv(3) << 24 |
273+
getv(2) << 16 | getv(1) << 8 | static_cast<std::uint64_t>(buf1_[start_byte_idx - n0_]);
274+
} else {
275+
// Access one of the buffers
276+
bool ind = start_byte_idx >= n0_;
277+
// Pick the buffer to read
278+
auto const *XGBOOST_RESTRICT buf = reinterpret_cast<CompressedByteT const *>(
279+
(!ind) * reinterpret_cast<std::uintptr_t>(buf0_) +
280+
ind * reinterpret_cast<std::uintptr_t>(buf1_));
281+
auto shifted = start_byte_idx - n0_ * ind;
282+
283+
/**
284+
* Alternatively, we can use vector loads, but it requires aligned memory allocation
285+
* by the backing storage.
286+
*
287+
* // Align the pointer for vector load
288+
* auto beg_ptr = buf + shifted - 4;
289+
* // base ptr in bytes
290+
* auto aligned_beg_ptr = rmm::align_down(reinterpret_cast<std::uintptr_t>(beg_ptr),
291+
* std::alignment_of_v<std::uint32_t>);
292+
* // base ptr in uint32
293+
* auto aligned_beg_u32_ptr = reinterpret_cast<std::uint32_t const *>(aligned_beg_ptr);
294+
* // 2 vector loads for 8 bytes, we will need 5 of them
295+
* std::uint64_t v;
296+
* auto *XGBOOST_RESTRICT v_ptr = reinterpret_cast<std::uint32_t *>(&v);
297+
* v_ptr[0] = aligned_beg_u32_ptr[0];
298+
* v_ptr[1] = aligned_beg_u32_ptr[1];
299+
* // Difference between the original ptr and the aligned ptr.
300+
* auto diff = reinterpret_cast<std::uintptr_t>(beg_ptr) - aligned_beg_ptr;
301+
* // Beginning ptr that points to the first loaded values
302+
* auto loaded_beg_ptr = reinterpret_cast<CompressedByteT const *>(&v) + diff;
303+
*/
304+
305+
// Read 5 bytes - the maximum we will need
306+
tmp = static_cast<std::uint64_t>(buf[shifted - 4]) << 32 |
307+
static_cast<std::uint64_t>(buf[shifted - 3]) << 24 |
308+
static_cast<std::uint64_t>(buf[shifted - 2]) << 16 |
309+
static_cast<std::uint64_t>(buf[shifted - 1]) << 8 | buf[shifted];
310+
}
311+
312+
std::int32_t bit_shift = (kBitsPerByte - ((offset_ + 1) * symbol_bits_)) % kBitsPerByte;
313+
tmp >>= bit_shift;
314+
// Mask off unneeded bits
315+
std::uint64_t mask = (static_cast<std::uint64_t>(1) << symbol_bits_) - 1;
316+
return static_cast<OutT>(tmp & mask);
317+
}
318+
319+
XGBOOST_DEVICE reference operator[](std::size_t idx) const {
320+
self_type offset = (*this);
321+
offset.offset_ += idx;
322+
return *offset;
323+
}
324+
};
215325
} // namespace xgboost::common

0 commit comments

Comments
 (0)