@@ -132,13 +132,11 @@ XGBOOST_DEV_INLINE void AtomicAddGpairGlobal(xgboost::GradientPairInt64* dest,
   auto g = gpair.GetQuantisedGrad();
   auto h = gpair.GetQuantisedHess();
 
-  atomicAdd(dst_ptr,
-            *reinterpret_cast<uint64_t*>(&g));
-  atomicAdd(dst_ptr + 1,
-            *reinterpret_cast<uint64_t*>(&h));
+  atomicAdd(dst_ptr, *reinterpret_cast<uint64_t*>(&g));
+  atomicAdd(dst_ptr + 1, *reinterpret_cast<uint64_t*>(&h));
 }
 
-template <bool kCompressed, int kBlockThreads, int kItemsPerThread>
+template <bool kCompressed, bool kDense, int kBlockThreads, int kItemsPerThread>
 class HistogramAgent {
   int constexpr static kItemsPerTile = kBlockThreads * kItemsPerThread;
 
@@ -154,6 +152,8 @@ class HistogramAgent {
   const bst_idx_t n_elements_;
   const GradientQuantiser& rounding_;
 
+  static_assert(kCompressed >= kDense);
+
  public:
   __device__ HistogramAgent(GradientPairInt64* smem_arr,
                             GradientPairInt64* __restrict__ d_node_hist, const FeatureGroup& group,
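
The added `static_assert(kCompressed >= kDense)` leans on bool-to-int promotion to express "dense implies compressed": every kernel instantiation below that sets `kDense` also sets `kCompressed`. A minimal standalone sketch of what the comparison accepts and rejects (illustrative only, not part of the patch):

    // bool promotes to int (true -> 1, false -> 0), so `kCompressed >= kDense`
    // rejects exactly one combination: dense without compression.
    static_assert(int(true) >= int(true));      // compressed, dense Ellpack
    static_assert(int(true) >= int(false));     // compressed, not dense
    static_assert(int(false) >= int(false));    // neither
    static_assert(!(int(false) >= int(true)));  // dense but uncompressed: rejected
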
@@ -176,7 +176,7 @@ class HistogramAgent {
     Idx ridx = d_ridx_[idx / feature_stride_];
     auto fidx = FeatIdx(group_, idx, feature_stride_);
     bst_bin_t compressed_bin = matrix_.gidx_iter[IterIdx(matrix_, ridx, fidx)];
-    if (compressed_bin != matrix_.NullValue()) {
+    if (kDense || compressed_bin != matrix_.NullValue()) {
       // The matrix is compressed with feature-local bins.
       if (kCompressed) {
         compressed_bin += this->matrix_.feature_segments[fidx];
@@ -211,18 +211,20 @@ class HistogramAgent {
       gpair[i] = d_gpair_[ridx[i]];
       auto fidx = FeatIdx(group_, idx[i], feature_stride_);
       gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)];
-      if (gidx[i] != matrix_.NullValue()) {
-        if (kCompressed) {
+      if (kDense || gidx[i] != matrix_.NullValue()) {
+        if constexpr (kCompressed) {
           gidx[i] += matrix_.feature_segments[fidx];
         }
       } else {
-        gidx[i] = -1;  // missing
+        // Use -1 to denote missing. Since we need to add the beginning bin to gidx, the
+        // result might be equal to the `NullValue`.
+        gidx[i] = -1;
       }
     }
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
       // Avoid atomic add if it's a null value.
-      if (gidx[i] != -1) {
+      if (kDense || gidx[i] != -1) {
         auto adjusted = rounding_.ToFixedPoint(gpair[i]);
         AtomicAddGpairShared(smem_arr_ + gidx[i] - group_.start_bin, adjusted);
       }
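
The new comment explains why `-1` replaces `NullValue()` as the missing-value sentinel once the feature-local bin has been shifted by `feature_segments[fidx]`: after the shift, a perfectly valid global bin can coincide with the raw null marker, whereas `-1` can never be produced by `local_bin + segment_start`. A tiny standalone illustration with made-up bin counts (the concrete numbers are assumptions, not values from the source):

    #include <cassert>
    #include <cstdint>

    int main() {
      const std::int32_t kNull = 4;                    // assumed feature-local null marker
      const std::int32_t feature_segments[] = {0, 4};  // two features, 4 bins each

      std::int32_t local_bin = 0;  // a real, non-missing value in feature 1
      std::int32_t global_bin = local_bin + feature_segments[1];

      assert(global_bin == kNull);  // collides with the raw null marker
      assert(global_bin != -1);     // the -1 sentinel stays unambiguous
      return 0;
    }
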
@@ -262,7 +264,8 @@ class HistogramAgent {
   }
 };
 
-template <bool kIsDense, bool use_shared_memory_histograms, int kBlockThreads, int kItemsPerThread>
+template <bool kCompressed, bool kDense, bool use_shared_memory_histograms, int kBlockThreads,
+          int kItemsPerThread>
 __global__ void __launch_bounds__(kBlockThreads)
     SharedMemHistKernel(const EllpackDeviceAccessor matrix,
                         const FeatureGroupsAccessor feature_groups,
@@ -273,7 +276,7 @@ __global__ void __launch_bounds__(kBlockThreads)
   extern __shared__ char smem[];
   const FeatureGroup group = feature_groups[blockIdx.y];
   auto smem_arr = reinterpret_cast<GradientPairInt64*>(smem);
-  auto agent = HistogramAgent<kIsDense, kBlockThreads, kItemsPerThread>(
+  auto agent = HistogramAgent<kCompressed, kDense, kBlockThreads, kItemsPerThread>(
       smem_arr, d_node_hist, group, matrix, d_ridx, rounding, d_gpair);
   if (use_shared_memory_histograms) {
     agent.BuildHistogramWithShared();
@@ -289,30 +292,41 @@ constexpr std::int32_t ItemsPerTile() { return kBlockThreads * kItemsPerThread;
 }  // namespace
 
 // Use auto deduction guide to workaround compiler error.
-template <auto GlobalDense = SharedMemHistKernel<true, false, kBlockThreads, kItemsPerThread>,
-          auto Global = SharedMemHistKernel<false, false, kBlockThreads, kItemsPerThread>,
-          auto SharedDense = SharedMemHistKernel<true, true, kBlockThreads, kItemsPerThread>,
-          auto Shared = SharedMemHistKernel<false, true, kBlockThreads, kItemsPerThread>>
+template <auto GlobalCompr =
+              SharedMemHistKernel<true, false, false, kBlockThreads, kItemsPerThread>,
+          auto Global = SharedMemHistKernel<false, false, false, kBlockThreads, kItemsPerThread>,
+          auto SharedCompr = SharedMemHistKernel<true, false, true, kBlockThreads, kItemsPerThread>,
+          auto Shared = SharedMemHistKernel<false, false, true, kBlockThreads, kItemsPerThread>,
+          auto GlobalDense = SharedMemHistKernel<true, true, false, kBlockThreads, kItemsPerThread>,
+          auto SharedDense = SharedMemHistKernel<true, true, true, kBlockThreads, kItemsPerThread>>
 struct HistogramKernel {
   enum KernelType : std::size_t {
-    kGlobalDense = 0,
+    kGlobalCompr = 0,
     kGlobal = 1,
-    kSharedDense = 2,
+    kSharedCompr = 2,
     kShared = 3,
+    kGlobalDense = 4,
+    kSharedDense = 5,
   };
   // Kernel for working with dense Ellpack using the global memory.
-  decltype(GlobalDense) global_dense_kernel{
-      SharedMemHistKernel<true, false, kBlockThreads, kItemsPerThread>};
+  decltype(GlobalCompr) global_compr_kernel{
+      SharedMemHistKernel<true, false, false, kBlockThreads, kItemsPerThread>};
   // Kernel for working with sparse Ellpack using the global memory.
-  decltype(Global) global_kernel{SharedMemHistKernel<false, false, kBlockThreads, kItemsPerThread>};
+  decltype(Global) global_kernel{
+      SharedMemHistKernel<false, false, false, kBlockThreads, kItemsPerThread>};
   // Kernel for working with dense Ellpack using the shared memory.
-  decltype(SharedDense) shared_dense_kernel{
-      SharedMemHistKernel<true, true, kBlockThreads, kItemsPerThread>};
+  decltype(SharedCompr) shared_compr_kernel{
+      SharedMemHistKernel<true, false, true, kBlockThreads, kItemsPerThread>};
   // Kernel for working with sparse Ellpack using the shared memory.
-  decltype(Shared) shared_kernel{SharedMemHistKernel<false, true, kBlockThreads, kItemsPerThread>};
+  decltype(Shared) shared_kernel{
+      SharedMemHistKernel<false, false, true, kBlockThreads, kItemsPerThread>};
+  decltype(GlobalDense) global_dense_kernel{
+      SharedMemHistKernel<true, true, false, kBlockThreads, kItemsPerThread>};
+  decltype(SharedDense) shared_dense_kernel{
+      SharedMemHistKernel<true, true, true, kBlockThreads, kItemsPerThread>};
 
   bool shared{false};
-  std::array<std::uint32_t, 4> grid_sizes{0, 0, 0, 0};
+  std::array<std::uint32_t, 6> grid_sizes{0, 0, 0, 0, 0, 0};
   std::size_t smem_size{0};
   bool const force_global;
 
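
With the extra `kDense` flag the kernel table grows from four to six members, which is why `grid_sizes` is widened to six entries. A recap of the `<kCompressed, kDense, use_shared_memory_histograms>` arguments behind each member, written as a standalone sketch rather than code from the patch:

    struct KernelFlags {
      bool compressed;
      bool dense;
      bool shared;
    };

    // Order matches the KernelType enum above.
    constexpr KernelFlags kKernelFlags[] = {
        /* kGlobalCompr */ {true, false, false},
        /* kGlobal      */ {false, false, false},
        /* kSharedCompr */ {true, false, true},
        /* kShared      */ {false, false, true},
        /* kGlobalDense */ {true, true, false},
        /* kSharedDense */ {true, true, true},
    };
    static_assert(sizeof(kKernelFlags) / sizeof(kKernelFlags[0]) == 6,
                  "one grid size per kernel: std::array<std::uint32_t, 6>");
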
@@ -347,9 +361,11 @@ struct HistogramKernel {
       this->grid_sizes[static_cast<std::size_t>(k)] = n_blocks_per_mp * n_mps;
     };
     // Initialize all kernel instantiations
-    std::array kernel_types{kGlobalDense, kGlobal, kSharedDense, kShared};
+    std::array kernel_types{kGlobalCompr, kGlobal,      kSharedCompr,
+                            kShared,      kGlobalDense, kSharedDense};
     std::int32_t k = 0;
-    for (auto& kernel : {global_dense_kernel, global_kernel, shared_dense_kernel, shared_kernel}) {
+    for (auto& kernel : {global_compr_kernel, global_kernel, shared_compr_kernel, shared_kernel,
+                         global_dense_kernel, shared_dense_kernel}) {
       init(kernel, kernel_types[k]);
       ++k;
     }
@@ -397,19 +413,24 @@ class DeviceHistogramBuilderImpl {
     using K = HistogramKernel<>::KernelType;
     if (!this->kernel_->shared) {  // Use global memory
       CHECK_EQ(this->kernel_->smem_size, 0);
-      if (matrix.IsDenseCompressed()) {
-        // Dense must use shared memory except for testing.
+      if (matrix.IsDense()) {
         CHECK(this->kernel_->force_global);
         launcher(this->kernel_->global_dense_kernel, this->kernel_->grid_sizes[K::kGlobalDense]);
+      } else if (matrix.IsDenseCompressed()) {
+        // Dense must use shared memory except for testing.
+        CHECK(this->kernel_->force_global);
+        launcher(this->kernel_->global_compr_kernel, this->kernel_->grid_sizes[K::kGlobalCompr]);
       } else {
         // Sparse
         launcher(this->kernel_->global_kernel, this->kernel_->grid_sizes[K::kGlobal]);
       }
     } else {  // Use shared memory
       CHECK_NE(this->kernel_->smem_size, 0);
-      if (matrix.IsDenseCompressed()) {
-        // Dense
+      if (matrix.IsDense()) {
        launcher(this->kernel_->shared_dense_kernel, this->kernel_->grid_sizes[K::kSharedDense]);
+      } else if (matrix.IsDenseCompressed()) {
+        // Dense
+        launcher(this->kernel_->shared_compr_kernel, this->kernel_->grid_sizes[K::kSharedCompr]);
       } else {
         // Sparse
         launcher(this->kernel_->shared_kernel, this->kernel_->grid_sizes[K::kShared]);
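
Putting the two branches together, the dispatch now distinguishes three Ellpack layouts (fully dense, dense-compressed, sparse) times two memory strategies. A hypothetical standalone helper, not present in the source, that mirrors the selection order used above:

    #include <cstddef>

    enum KernelType : std::size_t {
      kGlobalCompr = 0,
      kGlobal = 1,
      kSharedCompr = 2,
      kShared = 3,
      kGlobalDense = 4,
      kSharedDense = 5,
    };

    // Mirrors DeviceHistogramBuilderImpl: fully dense wins over dense-compressed,
    // everything else falls back to the sparse kernels.
    constexpr KernelType SelectKernel(bool is_dense, bool is_dense_compressed, bool use_shared) {
      if (is_dense) {
        return use_shared ? kSharedDense : kGlobalDense;
      }
      if (is_dense_compressed) {
        return use_shared ? kSharedCompr : kGlobalCompr;
      }
      return use_shared ? kShared : kGlobal;
    }

    static_assert(SelectKernel(true, true, true) == kSharedDense);
    static_assert(SelectKernel(false, true, false) == kGlobalCompr);
    static_assert(SelectKernel(false, false, true) == kShared);
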