Add wrapper for CUDA host pinned memory pool. (#11451)

trivialfis · web-flow · commit bc19741748ce · 2025-05-14T01:46:16.000+08:00
diff --git a/src/common/cuda_pinned_allocator.cu b/src/common/cuda_pinned_allocator.cu
@@ -0,0 +1,73 @@
+/**
+ * Copyright 2025, XGBoost Contributors
+ */
+#include "cuda_pinned_allocator.h"
+
+#if defined(XGBOOST_USE_CUDA)
+
+#include <cuda_runtime_api.h>  // for cudaMemPoolCreate, cudaMemPoolDestroy
+
+#include <array>    // for array
+#include <cstring>  // for memset
+#include <memory>   // for unique_ptr
+
+#endif  // defined(XGBOOST_USE_CUDA)
+
+#include "common.h"
+#include "cuda_rt_utils.h"  // for CurrentDevice
+
+#if CUDART_VERSION >= 12080
+#define CUDA_HW_DECOM_AVAILABLE 1
+#endif
+
+namespace xgboost::common::cuda_impl {
+/**
+ * @brief Create a CUDA memory pool for allocating host pinned memory.
+ *
+ * The pool is placed on the host NUMA node returned by curt::GetNumaId(), and read/write
+ * access is granted to both that node and the current device.
+ *
+ * @note The hardware decompression usage flag is set only when CUDART_VERSION >= 12080.
+ *
+ * @return An owning handle. Its deleter destroys the pool and frees the handle storage.
+ */
+[[nodiscard]] MemPoolHdl CreateHostMemPool() {
+  auto create = [] {
+    cudaMemPoolProps props;
+    std::memset(&props, '\0', sizeof(props));
+    props.location.id = curt::GetNumaId();
+    props.location.type = cudaMemLocationTypeHostNuma;
+    props.allocType = cudaMemAllocationTypePinned;
+#if defined(CUDA_HW_DECOM_AVAILABLE)
+    props.usage = cudaMemPoolCreateUsageHwDecompress;
+#endif  // defined(CUDA_HW_DECOM_AVAILABLE)
+    props.handleTypes = cudaMemHandleTypeNone;
+
+    // Owned by a unique_ptr so that it isn't leaked if dh::safe_cuda below raises.
+    auto pool = std::make_unique<cudaMemPool_t>();
+    // cudaMemPoolCreate takes a single property struct, not an array.
+    dh::safe_cuda(cudaMemPoolCreate(pool.get(), &props));
+
+    cudaMemAccessDesc h_desc{};
+    h_desc.location = props.location;
+    h_desc.flags = cudaMemAccessFlagsProtReadWrite;
+
+    cudaMemAccessDesc d_desc{};
+    d_desc.location.id = curt::CurrentDevice();
+    d_desc.location.type = cudaMemLocationTypeDevice;
+    d_desc.flags = cudaMemAccessFlagsProtReadWrite;
+
+    std::array<cudaMemAccessDesc, 2> descs{h_desc, d_desc};
+    dh::safe_cuda(cudaMemPoolSetAccess(*pool, descs.data(), descs.size()));
+    return pool.release();
+  };
+  auto destroy = [](cudaMemPool_t* mem_pool) {
+    // Take ownership so that the heap-allocated handle is freed along with the pool.
+    std::unique_ptr<cudaMemPool_t> guard{mem_pool};
+    if (guard) {
+      dh::safe_cuda(cudaMemPoolDestroy(*guard));
+    }
+  };
+  return MemPoolHdl{create(), destroy};
+}
+}  // namespace xgboost::common::cuda_impl
diff --git a/src/common/cuda_pinned_allocator.h b/src/common/cuda_pinned_allocator.h
@@ -1,15 +1,15 @@
 /**
- * Copyright 2022-2024, XGBoost Contributors
+ * Copyright 2022-2025, XGBoost Contributors
  *
  * @brief cuda pinned allocator for usage with thrust containers
  */
-
 #pragma once
 
 #include <cuda_runtime.h>
 
 #include <cstddef>  // for size_t
 #include <limits>   // for numeric_limits
+#include <memory>   // for unique_ptr
 #include <new>      // for bad_array_new_length
 
 #include "common.h"
@@ -103,6 +103,34 @@ struct SamAllocPolicy {
   }
 };
 
+/**
+ * @brief A RAII handle type to the CUDA memory pool.
+ */
+using MemPoolHdl = std::unique_ptr<cudaMemPool_t, void (*)(cudaMemPool_t*)>;
+
+/**
+ * @brief Create a CUDA memory pool for allocating host pinned memory.
+ */
+[[nodiscard]] MemPoolHdl CreateHostMemPool();
+
+/** @brief C++ wrapper over the CUDA memory pool produced by @ref CreateHostMemPool. */
+class HostPinnedMemPool {
+ public:
+  HostPinnedMemPool() : pool_{CreateHostMemPool()} {}
+  // Allocate n_bytes from the pool on the given stream.
+  void* AllocateAsync(std::size_t n_bytes, cudaStream_t stream) {
+    void* result = nullptr;
+    dh::safe_cuda(cudaMallocFromPoolAsync(&result, n_bytes, *pool_, stream));
+    return result;
+  }
+  // Free memory on the given stream.
+  void DeallocateAsync(void* ptr, cudaStream_t stream) {
+    dh::safe_cuda(cudaFreeAsync(ptr, stream));
+  }
+
+ private:
+  MemPoolHdl pool_;
+};
+
 template <typename T, template <typename> typename Policy>
 class CudaHostAllocatorImpl : public Policy<T> {
  public:
diff --git a/src/common/cuda_rt_utils.cc b/src/common/cuda_rt_utils.cc
@@ -5,7 +5,9 @@
 
 #if defined(XGBOOST_USE_CUDA)
 #include <cuda_runtime_api.h>
-#endif  // defined(XGBOOST_USE_CUDA)
+
+#include <algorithm>  // for max
+#endif                // defined(XGBOOST_USE_CUDA)
 
 #include <cstddef>  // for size_t
 #include <cstdint>  // for int32_t
@@ -102,6 +104,13 @@ void DrVersion(std::int32_t* major, std::int32_t* minor) {
   GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaDriverGetVersion(ver)); }, major, minor);
 }
 
+[[nodiscard]] std::int32_t GetNumaId() {
+  // Query the NUMA ID attribute of the current device; negative values are clamped to 0.
+  std::int32_t id = -1;
+  dh::safe_cuda(cudaDeviceGetAttribute(&id, cudaDevAttrNumaId, curt::CurrentDevice()));
+  return std::max(id, 0);
+}
+
 #else
 std::int32_t AllVisibleGPUs() { return 0; }
 
@@ -125,5 +134,11 @@ void SetDevice(std::int32_t device) {
     common::AssertGPUSupport();
   }
 }
+
+[[nodiscard]] std::int32_t GetNumaId() {
+  common::AssertGPUSupport();  // This branch is compiled when XGBOOST_USE_CUDA is not defined.
+  return 0;
+}
+
 #endif  // !defined(XGBOOST_USE_CUDA)
 }  // namespace xgboost::curt
diff --git a/src/common/cuda_rt_utils.h b/src/common/cuda_rt_utils.h
@@ -34,4 +34,7 @@ void RtVersion(std::int32_t* major, std::int32_t* minor);
 
 // Returns the latest version of CUDA supported by the driver.
 void DrVersion(std::int32_t* major, std::int32_t* minor);
+
+// Get the current device's NUMA ID.
+[[nodiscard]] std::int32_t GetNumaId();
 }  // namespace xgboost::curt
diff --git a/src/common/io.h b/src/common/io.h
@@ -282,12 +282,13 @@ class ResourceHandler {
  public:
   // RTTI
   enum Kind : std::uint8_t {
-    kMalloc = 0,         // System memory.
-    kMmap = 1,           // Memory mapp.
-    kCudaMalloc = 2,     // CUDA device memory.
-    kCudaMmap = 3,       // CUDA with mmap.
-    kCudaHostCache = 4,  // CUDA pinned host memory.
-    kCudaGrowOnly = 5,   // CUDA virtual memory allocator.
+    kMalloc = 0,             // System memory.
+    kMmap = 1,               // Memory map.
+    kCudaMalloc = 2,         // CUDA device memory.
+    kCudaMmap = 3,           // CUDA with mmap.
+    kCudaHostCache = 4,      // CUDA pinned host memory.
+    kCudaGrowOnly = 5,       // CUDA virtual memory allocator.
+    kCudaPinnedMemPool = 6,  // CUDA memory pool for pinned host memory.
   };
 
  private:
@@ -316,6 +317,8 @@ class ResourceHandler {
         return "CudaHostCache";
       case kCudaGrowOnly:
         return "CudaGrowOnly";
+      case kCudaPinnedMemPool:
+        return "CudaPinnedMemPool";
     }
     LOG(FATAL) << "Unreachable.";
     return {};
diff --git a/src/common/ref_resource_view.cuh b/src/common/ref_resource_view.cuh
@@ -1,5 +1,5 @@
 /**
- * Copyright 2024, XGBoost Contributors
+ * Copyright 2024-2025, XGBoost Contributors
  */
 #pragma once
 
@@ -43,4 +43,14 @@ template <typename T>
   auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
   return ref;
 }
+
+/** @brief Make a fixed-size view over memory allocated from the host pinned memory pool. */
+template <typename T>
+[[nodiscard]] RefResourceView<T> MakeFixedVecWithPinnedMemPool(
+    std::shared_ptr<cuda_impl::HostPinnedMemPool> pool, std::size_t n_elements,
+    dh::CUDAStreamView stream) {
+  std::size_t const n_bytes = n_elements * sizeof(T);
+  auto res = std::make_shared<common::HostPinnedMemPoolResource>(std::move(pool), n_bytes, stream);
+  return RefResourceView{res->DataAs<T>(), n_elements, res};
+}
 }  // namespace xgboost::common
diff --git a/src/common/resource.cuh b/src/common/resource.cuh
@@ -1,11 +1,13 @@
 /**
- * Copyright 2024, XGBoost Contributors
+ * Copyright 2024-2025, XGBoost Contributors
  */
 #pragma once
 #include <cstddef>     // for size_t
 #include <functional>  // for function
+#include <utility>     // for move
 
-#include "cuda_pinned_allocator.h"  // for SamAllocator
+#include "cuda_pinned_allocator.h"  // for SamAllocator, HostPinnedMemPool
+#include "device_helpers.cuh"       // for CUDAStreamView
 #include "device_vector.cuh"        // for DeviceUVector, GrowOnlyVirtualMemVec
 #include "io.h"                     // for ResourceHandler, MMAPFile
 #include "xgboost/string_view.h"    // for StringView
@@ -75,6 +77,30 @@ class CudaPinnedResource : public ResourceHandler {
   void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
 };
 
+/**
+ * @brief Fixed-size resource whose memory comes from a @ref cuda_impl::HostPinnedMemPool.
+ *
+ * The pool is shared between resources, while each resource owns its own allocation.
+ */
+class HostPinnedMemPoolResource : public ResourceHandler {
+  using Pool = cuda_impl::HostPinnedMemPool;
+
+  std::shared_ptr<Pool> pool_;
+  std::size_t n_bytes_;
+  dh::CUDAStreamView stream_;
+  void* ptr_;  // Declared last: its initializer allocates through pool_.
+
+ public:
+  explicit HostPinnedMemPoolResource(std::shared_ptr<Pool> pool, std::size_t n_bytes,
+                                     dh::CUDAStreamView stream)
+      : ResourceHandler{kCudaPinnedMemPool}, pool_{std::move(pool)}, n_bytes_{n_bytes},
+        stream_{stream}, ptr_{pool_->AllocateAsync(n_bytes, stream)} {}
+  // Release the allocation on the same stream that was passed to the constructor.
+  ~HostPinnedMemPoolResource() override { pool_->DeallocateAsync(ptr_, stream_); }
+  [[nodiscard]] std::size_t Size() const override { return n_bytes_; }
+  [[nodiscard]] void* Data() override { return ptr_; }
+};
+
 class CudaMmapResource : public ResourceHandler {
   std::unique_ptr<MMAPFile, std::function<void(MMAPFile*)>> handle_;
   std::size_t n_;
diff --git a/tests/cpp/common/test_ref_resource_view.cu b/tests/cpp/common/test_ref_resource_view.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2024, XGBoost Contributors
+ * Copyright 2024-2025, XGBoost Contributors
  */
 #if defined(__linux__)
 
@@ -10,7 +10,8 @@
 #include <thrust/sequence.h>                    // for sequence
 
 #include "../../../src/common/ref_resource_view.cuh"
-#include "../helpers.h"  // for MakeCUDACtx
+#include "../../../src/common/threadpool.h"  // for ThreadPool
+#include "../helpers.h"                      // for MakeCUDACtx
 
 namespace xgboost::common {
 class TestCudaGrowOnly : public ::testing::TestWithParam<std::size_t> {
@@ -44,6 +45,48 @@ class TestCudaGrowOnly : public ::testing::TestWithParam<std::size_t> {
 TEST_P(TestCudaGrowOnly, Resize) { this->Run(this->GetParam()); }
 
 INSTANTIATE_TEST_SUITE_P(RefResourceView, TestCudaGrowOnly, ::testing::Values(1 << 20, 1 << 21));
+
+TEST(HostPinnedMemPool, Alloc) {
+  std::vector<RefResourceView<double>> refs;
+
+  {
+    // The local pool handle goes out of scope before refs does. Test memory safety.
+    auto pool = std::make_shared<cuda_impl::HostPinnedMemPool>();
+    for (std::size_t i = 0; i < 4; ++i) {
+      auto ref = MakeFixedVecWithPinnedMemPool<double>(pool, 128 + i, dh::DefaultStream());
+      refs.emplace_back(std::move(ref));
+    }
+    for (std::size_t i = 0; i < refs.size(); ++i) {
+      ASSERT_EQ(refs[i].size(), 128 + i);
+      ASSERT_EQ(refs[i].size_bytes(), refs[i].size() * sizeof(double));
+    }
+
+    // Thread safety. hardware_concurrency() may return 0, make sure there's a worker.
+    auto hw = static_cast<std::int32_t>(std::thread::hardware_concurrency());
+    std::int32_t n_threads = hw > 0 ? hw : 1;
+    common::ThreadPool workers{"tmempool", n_threads, [] {}};
+    auto n_tasks = static_cast<std::size_t>(n_threads) * 4;
+
+    std::vector<std::future<RefResourceView<double>>> alloc_futs;
+    for (std::size_t i = 0; i < n_tasks; ++i) {
+      alloc_futs.emplace_back(workers.Submit([i, pool] {
+        return MakeFixedVecWithPinnedMemPool<double>(pool, 128 + i, dh::DefaultStream());
+      }));
+    }
+    // Consume the views on the workers as well. Each task takes one view out of its future
+    // and drops it when the task returns.
+    std::vector<std::future<void>> free_futs;
+    for (std::size_t i = 0; i < n_tasks; ++i) {
+      free_futs.emplace_back(workers.Submit([i, &alloc_futs] {
+        auto ref = alloc_futs[i].get();
+        ASSERT_EQ(ref.size(), 128 + i);
+      }));
+    }
+    for (auto& fut : free_futs) {
+      fut.get();
+    }
+  }
+}
 }  // namespace xgboost::common
 
 #endif  // defined(__linux__)