|
1 | 1 | /**
|
2 |
| - * Copyright 2017-2024, XGBoost Contributors |
| 2 | + * Copyright 2017-2025, XGBoost Contributors |
3 | 3 | * \file compressed_iterator.h
|
4 | 4 | */
|
5 | 5 | #pragma once
|
6 |
| -#include <xgboost/base.h> |
| 6 | +#include <xgboost/base.h> // for XGBOOST_RESTRICT |
7 | 7 |
|
8 |
| -#include <cmath> // for ceil, log2 |
9 |
| -#include <cstddef> // for size_t |
| 8 | +#include <algorithm> // for max |
| 9 | +#include <cmath> // for ceil, log2 |
| 10 | +#include <cstddef> // for size_t |
| 11 | +#include <cstdint> // for uint32_t |
10 | 12 |
|
11 | 13 | #include "common.h"
|
12 | 14 |
|
@@ -79,7 +81,8 @@ class CompressedBufferWriter {
|
79 | 81 | size_t ret = std::ceil(static_cast<double>(compressed_size + detail::kPadding) /
|
80 | 82 | static_cast<double>(sizeof(std::uint32_t))) *
|
81 | 83 | sizeof(std::uint32_t);
|
82 |
| - return ret; |
| 84 | + // Need at least 5 bytes for the reader |
| 85 | + return std::max(ret, static_cast<std::size_t>(detail::kPadding + 1)); |
83 | 86 | }
|
84 | 87 |
|
85 | 88 | template <typename T>
|
@@ -212,4 +215,111 @@ class CompressedIterator {
|
212 | 215 | return *offset;
|
213 | 216 | }
|
214 | 217 | };
|
| 218 | + |
| 219 | +/** |
| 220 | + * @brief A compressed iterator with two buffers for the underlying storage. |
| 221 | + * |
| 222 | + * Byte indices below `n0_bytes` are read from `buf0`; the remaining indices are read from |
| 223 | + * `buf1` after subtracting `n0_bytes`. This accessor is significantly slower than the |
| 224 | + * single buffer one due to pipeline stalling and should not be used by default. |
| 225 | + * Pre-calculating the buffer selection indicator can help mitigate it. But we only use |
| 226 | + * this iterator for external memory with direct memory access, which is slow anyway. |
| 227 | + * Use the single buffer one (`CompressedIterator`) as a reference for how it works. |
| 228 | + */ |
| 229 | +template <typename OutT> |
| 230 | +class DoubleCompressedIter { |
| 231 | + public: |
| 232 | + // Type definitions for thrust |
| 233 | + using self_type = DoubleCompressedIter<OutT>; // NOLINT |
| 234 | + using difference_type = ptrdiff_t; // NOLINT |
| 235 | + using value_type = OutT; // NOLINT |
| 236 | + using pointer = value_type *; // NOLINT |
| 237 | + using reference = value_type; // NOLINT |
| 238 | + |
| 239 | + private: |
| 240 | + using BufT = CompressedByteT const *; |
| 241 | + BufT XGBOOST_RESTRICT buf0_{nullptr}; |
| 242 | + BufT XGBOOST_RESTRICT buf1_{nullptr}; |
| 243 | + bst_idx_t const n0_{0}; // Size of the first buffer in bytes; byte indices >= n0_ are read from buf1_. |
| 244 | + bst_idx_t const symbol_bits_{0}; |
| 245 | + std::size_t offset_{0}; |
| 246 | + |
| 247 | + public: |
| 248 | + DoubleCompressedIter() = default; |
| 249 | + DoubleCompressedIter(CompressedByteT const *XGBOOST_RESTRICT buf0, std::size_t n0_bytes, |
| 250 | + CompressedByteT const *XGBOOST_RESTRICT buf1, bst_idx_t n_symbols) |
| 251 | + : buf0_{buf0}, buf1_{buf1}, n0_{n0_bytes}, symbol_bits_{detail::SymbolBits(n_symbols)} {} |
| 252 | + |
| 253 | + XGBOOST_HOST_DEV_INLINE reference operator*() const { |
| 254 | + constexpr std::int32_t kBitsPerByte = 8; |
| 255 | + std::size_t start_bit_idx = ((offset_ + 1) * symbol_bits_ - 1); |
| 256 | + std::size_t start_byte_idx = start_bit_idx >> 3; |
| 257 | + start_byte_idx += detail::kPadding; |
| 258 | + |
| 259 | + std::uint64_t tmp; |
| 260 | + |
| 261 | + if (start_byte_idx >= this->n0_ && (start_byte_idx - 4) < this->n0_) { |
| 262 | + // The 5-byte window straddles the boundary between the two buffers. |
| 263 | + auto getv = [&](auto shift) { |
| 264 | + auto shifted = start_byte_idx - shift; |
| 265 | + bool ind = (shifted >= n0_); // indicator for which buffer to read: true selects buf1_ |
| 266 | + // Pick the buffer to read |
| 267 | + auto const *XGBOOST_RESTRICT buf = ind ? buf1_ : buf0_; |
| 268 | + shifted -= ind * n0_; |
| 269 | + return static_cast<std::uint64_t>(buf[shifted]); |
| 270 | + }; |
| 271 | + // Read 5 bytes - the maximum we will need. Here the first byte is always in buf0_ and the last in buf1_. |
| 272 | + tmp = static_cast<std::uint64_t>(buf0_[start_byte_idx - 4]) << 32 | getv(3) << 24 | |
| 273 | + getv(2) << 16 | getv(1) << 8 | static_cast<std::uint64_t>(buf1_[start_byte_idx - n0_]); |
| 274 | + } else { |
| 275 | + // Access one of the buffers: the whole window is read from the same buffer. |
| 276 | + bool ind = start_byte_idx >= n0_; |
| 277 | + // Pick the buffer to read using integer arithmetic on the pointers instead of a branch |
| 278 | + auto const *XGBOOST_RESTRICT buf = reinterpret_cast<CompressedByteT const *>( |
| 279 | + (!ind) * reinterpret_cast<std::uintptr_t>(buf0_) + |
| 280 | + ind * reinterpret_cast<std::uintptr_t>(buf1_)); |
| 281 | + auto shifted = start_byte_idx - n0_ * ind; |
| 282 | + |
| 283 | + /** |
| 284 | + * Alternatively, we can use vector loads, but it requires aligned memory allocation |
| 285 | + * by the backing storage. |
| 286 | + * |
| 287 | + * // Align the pointer for vector load |
| 288 | + * auto beg_ptr = buf + shifted - 4; |
| 289 | + * // base ptr in bytes |
| 290 | + * auto aligned_beg_ptr = rmm::align_down(reinterpret_cast<std::uintptr_t>(beg_ptr), |
| 291 | + * std::alignment_of_v<std::uint32_t>); |
| 292 | + * // base ptr in uint32 |
| 293 | + * auto aligned_beg_u32_ptr = reinterpret_cast<std::uint32_t const *>(aligned_beg_ptr); |
| 294 | + * // 2 vector loads for 8 bytes, we will need 5 of them |
| 295 | + * std::uint64_t v; |
| 296 | + * auto *XGBOOST_RESTRICT v_ptr = reinterpret_cast<std::uint32_t *>(&v); |
| 297 | + * v_ptr[0] = aligned_beg_u32_ptr[0]; |
| 298 | + * v_ptr[1] = aligned_beg_u32_ptr[1]; |
| 299 | + * // Difference between the original ptr and the aligned ptr. |
| 300 | + * auto diff = reinterpret_cast<std::uintptr_t>(beg_ptr) - aligned_beg_ptr; |
| 301 | + * // Beginning ptr that points to the first loaded values |
| 302 | + * auto loaded_beg_ptr = reinterpret_cast<CompressedByteT const *>(&v) + diff; |
| 303 | + */ |
| 304 | + |
| 305 | + // Read 5 bytes - the maximum we will need; buf[shifted - 4] is the most significant byte |
| 306 | + tmp = static_cast<std::uint64_t>(buf[shifted - 4]) << 32 | |
| 307 | + static_cast<std::uint64_t>(buf[shifted - 3]) << 24 | |
| 308 | + static_cast<std::uint64_t>(buf[shifted - 2]) << 16 | |
| 309 | + static_cast<std::uint64_t>(buf[shifted - 1]) << 8 | buf[shifted]; |
| 310 | + } |
| 311 | + |
| 312 | + std::int32_t bit_shift = (kBitsPerByte - ((offset_ + 1) * symbol_bits_)) % kBitsPerByte; |
| 313 | + tmp >>= bit_shift; |
| 314 | + // Mask off unneeded bits, keeping the low symbol_bits_ bits |
| 315 | + std::uint64_t mask = (static_cast<std::uint64_t>(1) << symbol_bits_) - 1; |
| 316 | + return static_cast<OutT>(tmp & mask); |
| 317 | + } |
| 318 | + |
| 319 | + XGBOOST_DEVICE reference operator[](std::size_t idx) const { |
| 320 | + self_type offset = (*this); |
| 321 | + offset.offset_ += idx; |
| 322 | + return *offset; |
| 323 | + } |
| 324 | +}; |
215 | 325 | } // namespace xgboost::common
|
0 commit comments