Skip to content

Commit 2fad970

Browse files
authored
Specialize for GPU dense histogram. (#11443)
1 parent d114196 commit 2fad970

File tree

5 files changed

+124
-60
lines changed

5 files changed

+124
-60
lines changed

src/data/ellpack_page.cu

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -685,15 +685,10 @@ std::size_t EllpackPageImpl::MemCostBytes() const {
685685

686686
EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
687687
Context const* ctx, common::Span<FeatureType const> feature_types) const {
688-
auto null = this->IsDense() ? this->NumSymbols() : this->NumSymbols() - 1;
689-
return {ctx,
690-
this->cuts_,
691-
this->info.row_stride,
692-
this->base_rowid,
693-
this->n_rows,
694-
common::CompressedIterator<uint32_t>{gidx_buffer.data(), this->NumSymbols()},
695-
null,
696-
feature_types};
688+
auto null = this->NullValue();
689+
auto iter = common::CompressedIterator<uint32_t>{gidx_buffer.data(), this->NumSymbols()};
690+
return {ctx, this->cuts_, this->info.row_stride, this->base_rowid, this->n_rows,
691+
iter, null, this->IsDense(), feature_types};
697692
}
698693

699694
EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
@@ -705,15 +700,11 @@ EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
705700
dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(), gidx_buffer.size_bytes(),
706701
cudaMemcpyDefault, ctx->CUDACtx()->Stream()));
707702
Context cpu_ctx;
708-
auto null = this->IsDense() ? this->NumSymbols() : this->NumSymbols() - 1;
709-
return {ctx->IsCPU() ? ctx : &cpu_ctx,
710-
this->cuts_,
711-
this->info.row_stride,
712-
this->base_rowid,
713-
this->n_rows,
714-
common::CompressedIterator<uint32_t>{h_gidx_buffer->data(), this->NumSymbols()},
715-
null,
716-
feature_types};
703+
auto null = this->NullValue();
704+
auto iter = common::CompressedIterator<uint32_t>{h_gidx_buffer->data(), this->NumSymbols()};
705+
auto sctx = ctx->IsCPU() ? ctx : &cpu_ctx;
706+
return {sctx, this->cuts_, this->info.row_stride, this->base_rowid, this->n_rows,
707+
iter, null, this->IsDense(), feature_types};
717708
}
718709

719710
[[nodiscard]] bst_idx_t EllpackPageImpl::NumNonMissing(

src/data/ellpack_page.cuh

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,20 @@ namespace xgboost {
2020
/**
2121
* @brief Struct for accessing and manipulating an ELLPACK matrix on the device.
2222
*
23-
* Does not own underlying memory and may be trivially copied into kernels.
23+
* Does not own the underlying memory and may be trivially copied into kernels.
2424
*/
2525
struct EllpackDeviceAccessor {
26-
/** @brief Whether or not if the matrix is dense. */
27-
bst_idx_t null_value;
26+
private:
27+
/**
28+
* @brief Stores the null value and whether the matrix is dense. The `IsDense` is stored in the
29+
* first bit of this value.
30+
*/
31+
bst_idx_t null_value_;
32+
33+
constexpr static auto Ind() { return static_cast<bst_idx_t>(1); }
34+
constexpr static std::size_t NullShift() { return sizeof(null_value_) * 8 - Ind(); }
35+
36+
public:
2837
/** @brief Row length for ELLPACK, equal to number of features when the data is dense. */
2938
bst_idx_t row_stride;
3039
/** @brief Starting index of the rows. Used for external memory. */
@@ -45,9 +54,9 @@ struct EllpackDeviceAccessor {
4554
EllpackDeviceAccessor() = delete;
4655
EllpackDeviceAccessor(Context const* ctx, std::shared_ptr<const common::HistogramCuts> cuts,
4756
bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows,
48-
common::CompressedIterator<uint32_t> gidx_iter, bst_idx_t null_value,
49-
common::Span<FeatureType const> feature_types)
50-
: null_value{null_value},
57+
common::CompressedIterator<std::uint32_t> gidx_iter, bst_idx_t null_value,
58+
bool is_dense, common::Span<FeatureType const> feature_types)
59+
: null_value_{null_value},
5160
row_stride{row_stride},
5261
base_rowid{base_rowid},
5362
n_rows{n_rows},
@@ -65,8 +74,17 @@ struct EllpackDeviceAccessor {
6574
feature_segments = cuts->cut_ptrs_.ConstHostPointer();
6675
min_fvalue = cuts->min_vals_.ConstHostSpan();
6776
}
77+
78+
if (is_dense) {
79+
static_assert(NullShift() == 63);
80+
CHECK(!IsDense());
81+
this->null_value_ |= (Ind() << NullShift());
82+
}
6883
}
6984

85+
[[nodiscard]] XGBOOST_HOST_DEV_INLINE bool IsDense() const {
86+
return (this->null_value_ >> NullShift()) != 0;
87+
}
7088
[[nodiscard]] XGBOOST_HOST_DEV_INLINE bool IsDenseCompressed() const {
7189
return this->row_stride == this->NumFeatures();
7290
}
@@ -133,7 +151,9 @@ struct EllpackDeviceAccessor {
133151
}
134152
return gidx_fvalue_map[gidx];
135153
}
136-
[[nodiscard]] XGBOOST_HOST_DEV_INLINE bst_idx_t NullValue() const { return this->null_value; }
154+
[[nodiscard]] XGBOOST_HOST_DEV_INLINE bst_idx_t NullValue() const {
155+
return this->null_value_ & ((Ind() << NullShift()) - Ind());
156+
}
137157
[[nodiscard]] XGBOOST_HOST_DEV_INLINE bst_idx_t NumBins() const { return gidx_fvalue_map.size(); }
138158
[[nodiscard]] XGBOOST_HOST_DEV_INLINE size_t NumFeatures() const { return min_fvalue.size(); }
139159
};
@@ -224,9 +244,7 @@ class EllpackPageImpl {
224244
[[nodiscard]] bst_idx_t Size() const;
225245

226246
/** @brief Set the base row id for this page. */
227-
void SetBaseRowId(std::size_t row_id) {
228-
base_rowid = row_id;
229-
}
247+
void SetBaseRowId(std::size_t row_id) { base_rowid = row_id; }
230248

231249
[[nodiscard]] common::HistogramCuts const& Cuts() const { return *cuts_; }
232250
[[nodiscard]] std::shared_ptr<common::HistogramCuts const> CutsShared() const { return cuts_; }
@@ -251,6 +269,12 @@ class EllpackPageImpl {
251269
*/
252270
[[nodiscard]] auto NumSymbols() const { return this->info.n_symbols; }
253271
void SetNumSymbols(bst_idx_t n_symbols) { this->info.n_symbols = n_symbols; }
272+
/**
273+
* @brief Get the value used to represent missing.
274+
*/
275+
[[nodiscard]] bst_idx_t NullValue() const {
276+
return this->IsDense() ? this->NumSymbols() : this->NumSymbols() - 1;
277+
}
254278
/**
255279
* @brief Copy basic shape from another page.
256280
*/

src/data/ellpack_page_source.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ struct EllpackCacheInfo {
2828
std::int64_t max_num_device_pages{0}; // Maximum number of pages cached in device.
2929
float missing{std::numeric_limits<float>::quiet_NaN()};
3030
std::vector<bst_idx_t> cache_mapping;
31-
std::vector<bst_idx_t> buffer_bytes;
31+
std::vector<bst_idx_t> buffer_bytes; // N bytes of the concatenated pages.
3232
std::vector<bst_idx_t> buffer_rows;
3333

3434
EllpackCacheInfo() = default;

src/tree/gpu_hist/histogram.cu

Lines changed: 52 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -132,13 +132,11 @@ XGBOOST_DEV_INLINE void AtomicAddGpairGlobal(xgboost::GradientPairInt64* dest,
132132
auto g = gpair.GetQuantisedGrad();
133133
auto h = gpair.GetQuantisedHess();
134134

135-
atomicAdd(dst_ptr,
136-
*reinterpret_cast<uint64_t*>(&g));
137-
atomicAdd(dst_ptr + 1,
138-
*reinterpret_cast<uint64_t*>(&h));
135+
atomicAdd(dst_ptr, *reinterpret_cast<uint64_t*>(&g));
136+
atomicAdd(dst_ptr + 1, *reinterpret_cast<uint64_t*>(&h));
139137
}
140138

141-
template <bool kCompressed, int kBlockThreads, int kItemsPerThread>
139+
template <bool kCompressed, bool kDense, int kBlockThreads, int kItemsPerThread>
142140
class HistogramAgent {
143141
int constexpr static kItemsPerTile = kBlockThreads * kItemsPerThread;
144142

@@ -154,6 +152,8 @@ class HistogramAgent {
154152
const bst_idx_t n_elements_;
155153
const GradientQuantiser& rounding_;
156154

155+
static_assert(kCompressed >= kDense);
156+
157157
public:
158158
__device__ HistogramAgent(GradientPairInt64* smem_arr,
159159
GradientPairInt64* __restrict__ d_node_hist, const FeatureGroup& group,
@@ -176,7 +176,7 @@ class HistogramAgent {
176176
Idx ridx = d_ridx_[idx / feature_stride_];
177177
auto fidx = FeatIdx(group_, idx, feature_stride_);
178178
bst_bin_t compressed_bin = matrix_.gidx_iter[IterIdx(matrix_, ridx, fidx)];
179-
if (compressed_bin != matrix_.NullValue()) {
179+
if (kDense || compressed_bin != matrix_.NullValue()) {
180180
// The matrix is compressed with feature-local bins.
181181
if (kCompressed) {
182182
compressed_bin += this->matrix_.feature_segments[fidx];
@@ -211,18 +211,20 @@ class HistogramAgent {
211211
gpair[i] = d_gpair_[ridx[i]];
212212
auto fidx = FeatIdx(group_, idx[i], feature_stride_);
213213
gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)];
214-
if (gidx[i] != matrix_.NullValue()) {
215-
if (kCompressed) {
214+
if (kDense || gidx[i] != matrix_.NullValue()) {
215+
if constexpr (kCompressed) {
216216
gidx[i] += matrix_.feature_segments[fidx];
217217
}
218218
} else {
219-
gidx[i] = -1; // missing
219+
// Use -1 to denote missing. Since we need to add the beginning bin to gidx, the
220+
// result might equal to the `NullValue`.
221+
gidx[i] = -1;
220222
}
221223
}
222224
#pragma unroll
223225
for (int i = 0; i < kItemsPerThread; i++) {
224226
// Avoid atomic add if it's a null value.
225-
if (gidx[i] != -1) {
227+
if (kDense || gidx[i] != -1) {
226228
auto adjusted = rounding_.ToFixedPoint(gpair[i]);
227229
AtomicAddGpairShared(smem_arr_ + gidx[i] - group_.start_bin, adjusted);
228230
}
@@ -262,7 +264,8 @@ class HistogramAgent {
262264
}
263265
};
264266

265-
template <bool kIsDense, bool use_shared_memory_histograms, int kBlockThreads, int kItemsPerThread>
267+
template <bool kCompressed, bool kDense, bool use_shared_memory_histograms, int kBlockThreads,
268+
int kItemsPerThread>
266269
__global__ void __launch_bounds__(kBlockThreads)
267270
SharedMemHistKernel(const EllpackDeviceAccessor matrix,
268271
const FeatureGroupsAccessor feature_groups,
@@ -273,7 +276,7 @@ __global__ void __launch_bounds__(kBlockThreads)
273276
extern __shared__ char smem[];
274277
const FeatureGroup group = feature_groups[blockIdx.y];
275278
auto smem_arr = reinterpret_cast<GradientPairInt64*>(smem);
276-
auto agent = HistogramAgent<kIsDense, kBlockThreads, kItemsPerThread>(
279+
auto agent = HistogramAgent<kCompressed, kDense, kBlockThreads, kItemsPerThread>(
277280
smem_arr, d_node_hist, group, matrix, d_ridx, rounding, d_gpair);
278281
if (use_shared_memory_histograms) {
279282
agent.BuildHistogramWithShared();
@@ -289,30 +292,41 @@ constexpr std::int32_t ItemsPerTile() { return kBlockThreads * kItemsPerThread;
289292
} // namespace
290293

291294
// Use auto deduction guide to workaround compiler error.
292-
template <auto GlobalDense = SharedMemHistKernel<true, false, kBlockThreads, kItemsPerThread>,
293-
auto Global = SharedMemHistKernel<false, false, kBlockThreads, kItemsPerThread>,
294-
auto SharedDense = SharedMemHistKernel<true, true, kBlockThreads, kItemsPerThread>,
295-
auto Shared = SharedMemHistKernel<false, true, kBlockThreads, kItemsPerThread>>
295+
template <auto GlobalCompr =
296+
SharedMemHistKernel<true, false, false, kBlockThreads, kItemsPerThread>,
297+
auto Global = SharedMemHistKernel<false, false, false, kBlockThreads, kItemsPerThread>,
298+
auto SharedCompr = SharedMemHistKernel<true, false, true, kBlockThreads, kItemsPerThread>,
299+
auto Shared = SharedMemHistKernel<false, false, true, kBlockThreads, kItemsPerThread>,
300+
auto GlobalDense = SharedMemHistKernel<true, true, false, kBlockThreads, kItemsPerThread>,
301+
auto SharedDense = SharedMemHistKernel<true, true, true, kBlockThreads, kItemsPerThread>>
296302
struct HistogramKernel {
297303
enum KernelType : std::size_t {
298-
kGlobalDense = 0,
304+
kGlobalCompr = 0,
299305
kGlobal = 1,
300-
kSharedDense = 2,
306+
kSharedCompr = 2,
301307
kShared = 3,
308+
kGlobalDense = 4,
309+
kSharedDense = 5,
302310
};
303311
// Kernel for working with dense Ellpack using the global memory.
304-
decltype(GlobalDense) global_dense_kernel{
305-
SharedMemHistKernel<true, false, kBlockThreads, kItemsPerThread>};
312+
decltype(GlobalCompr) global_compr_kernel{
313+
SharedMemHistKernel<true, false, false, kBlockThreads, kItemsPerThread>};
306314
// Kernel for working with sparse Ellpack using the global memory.
307-
decltype(Global) global_kernel{SharedMemHistKernel<false, false, kBlockThreads, kItemsPerThread>};
315+
decltype(Global) global_kernel{
316+
SharedMemHistKernel<false, false, false, kBlockThreads, kItemsPerThread>};
308317
// Kernel for working with dense Ellpack using the shared memory.
309-
decltype(SharedDense) shared_dense_kernel{
310-
SharedMemHistKernel<true, true, kBlockThreads, kItemsPerThread>};
318+
decltype(SharedCompr) shared_compr_kernel{
319+
SharedMemHistKernel<true, false, true, kBlockThreads, kItemsPerThread>};
311320
// Kernel for working with sparse Ellpack using the shared memory.
312-
decltype(Shared) shared_kernel{SharedMemHistKernel<false, true, kBlockThreads, kItemsPerThread>};
321+
decltype(Shared) shared_kernel{
322+
SharedMemHistKernel<false, false, true, kBlockThreads, kItemsPerThread>};
323+
decltype(GlobalDense) global_dense_kernel{
324+
SharedMemHistKernel<true, true, false, kBlockThreads, kItemsPerThread>};
325+
decltype(SharedDense) shared_dense_kernel{
326+
SharedMemHistKernel<true, true, true, kBlockThreads, kItemsPerThread>};
313327

314328
bool shared{false};
315-
std::array<std::uint32_t, 4> grid_sizes{0, 0, 0, 0};
329+
std::array<std::uint32_t, 6> grid_sizes{0, 0, 0, 0, 0, 0};
316330
std::size_t smem_size{0};
317331
bool const force_global;
318332

@@ -347,9 +361,11 @@ struct HistogramKernel {
347361
this->grid_sizes[static_cast<std::size_t>(k)] = n_blocks_per_mp * n_mps;
348362
};
349363
// Initialize all kernel instantiations
350-
std::array kernel_types{kGlobalDense, kGlobal, kSharedDense, kShared};
364+
std::array kernel_types{kGlobalCompr, kGlobal, kSharedCompr,
365+
kShared, kGlobalDense, kSharedDense};
351366
std::int32_t k = 0;
352-
for (auto& kernel : {global_dense_kernel, global_kernel, shared_dense_kernel, shared_kernel}) {
367+
for (auto& kernel : {global_compr_kernel, global_kernel, shared_compr_kernel, shared_kernel,
368+
global_dense_kernel, shared_dense_kernel}) {
353369
init(kernel, kernel_types[k]);
354370
++k;
355371
}
@@ -397,19 +413,24 @@ class DeviceHistogramBuilderImpl {
397413
using K = HistogramKernel<>::KernelType;
398414
if (!this->kernel_->shared) { // Use global memory
399415
CHECK_EQ(this->kernel_->smem_size, 0);
400-
if (matrix.IsDenseCompressed()) {
401-
// Dense must use shared memory except for testing.
416+
if (matrix.IsDense()) {
402417
CHECK(this->kernel_->force_global);
403418
launcher(this->kernel_->global_dense_kernel, this->kernel_->grid_sizes[K::kGlobalDense]);
419+
} else if (matrix.IsDenseCompressed()) {
420+
// Dense must use shared memory except for testing.
421+
CHECK(this->kernel_->force_global);
422+
launcher(this->kernel_->global_compr_kernel, this->kernel_->grid_sizes[K::kGlobalCompr]);
404423
} else {
405424
// Sparse
406425
launcher(this->kernel_->global_kernel, this->kernel_->grid_sizes[K::kGlobal]);
407426
}
408427
} else { // Use shared memory
409428
CHECK_NE(this->kernel_->smem_size, 0);
410-
if (matrix.IsDenseCompressed()) {
411-
// Dense
429+
if (matrix.IsDense()) {
412430
launcher(this->kernel_->shared_dense_kernel, this->kernel_->grid_sizes[K::kSharedDense]);
431+
} else if (matrix.IsDenseCompressed()) {
432+
// Dense
433+
launcher(this->kernel_->shared_compr_kernel, this->kernel_->grid_sizes[K::kSharedCompr]);
413434
} else {
414435
// Sparse
415436
launcher(this->kernel_->shared_kernel, this->kernel_->grid_sizes[K::kShared]);

tests/cpp/data/test_ellpack_page.cu

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -466,4 +466,32 @@ TEST_P(SparseEllpack, FromGHistIndex) { this->TestFromGHistIndex(GetParam()); }
466466
TEST_P(SparseEllpack, NumNonMissing) { this->TestNumNonMissing(this->GetParam()); }
467467

468468
INSTANTIATE_TEST_SUITE_P(EllpackPage, SparseEllpack, ::testing::Values(.0f, .2f, .4f, .8f));
469+
470+
TEST(EllpackPage, IsDense) {
471+
auto test = [](float sparsity) {
472+
auto p_fmat = RandomDataGenerator{64, 16, sparsity}.GenerateDMatrix();
473+
auto p = BatchParam{16, tree::TrainParam::DftSparseThreshold()};
474+
auto ctx = MakeCUDACtx(0);
475+
for (auto const& page : p_fmat->GetBatches<EllpackPage>(&ctx, p)) {
476+
auto d_acc = page.Impl()->GetDeviceAccessor(&ctx);
477+
if (sparsity == 0.0) {
478+
ASSERT_EQ(d_acc.IsDense(), page.Impl()->IsDense());
479+
ASSERT_TRUE(d_acc.IsDense());
480+
ASSERT_EQ(p.max_bin, d_acc.NullValue());
481+
} else {
482+
ASSERT_FALSE(d_acc.IsDense());
483+
ASSERT_EQ(p.max_bin * p_fmat->Info().num_col_, d_acc.NullValue());
484+
}
485+
std::vector<common::CompressedByteT> h_storage;
486+
auto h_acc = page.Impl()->GetHostAccessor(&ctx, &h_storage);
487+
if (sparsity == 0.0) {
488+
ASSERT_TRUE(h_acc.IsDense());
489+
} else {
490+
ASSERT_FALSE(h_acc.IsDense());
491+
}
492+
}
493+
};
494+
test(0.0);
495+
test(0.5);
496+
}
469497
} // namespace xgboost

0 commit comments

Comments
 (0)