Skip to content

Commit f786d37

Browse files
authored
[EM] Initial support for splitting up the host cache. (#11453)
- Add a new parameter to specify the portion of the cache. - Split the host cache into host + device cache. Limitations: - Direct access to the cache page has not yet been implemented. - More work is needed to find an optimal split policy. For now, auto means host only.
1 parent bc19741 commit f786d37

27 files changed

+424
-125
lines changed

doc/tutorials/external_memory.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ the GPU. Following is a snippet from :ref:`sphx_glr_python_examples_external_mem
162162
# It's important to use RMM for GPU-based external memory to improve performance.
163163
# If XGBoost is not built with RMM support, a warning will be raised.
164164
# We use the pool memory resource here for simplicity, you can also try the
165-
`ArenaMemoryResource` for # improved memory fragmentation handling.
165+
# `ArenaMemoryResource` for improved memory fragmentation handling.
166166
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
167167
rmm.mr.set_current_device_resource(mr)
168168
# Set the allocator for cupy as well.

include/xgboost/c_api.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,10 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand
588588
* help bound the memory usage. By default, XGBoost grows new sub-streams
589589
* exponentially until batches are exhausted. Only used for the training dataset and
590590
* the default is None (unbounded).
591+
* - cache_host_ratio (optional): For GPU-based inputs, XGBoost can split the cache into
592+
* host and device portions to reduce the data transfer overhead. This parameter
593+
* specifies the size of host cache compared to the size of the entire cache:
594+
* `host / (host + device)`.
591595
* @param out The created Quantile DMatrix.
592596
*
593597
* @return 0 when success, -1 when failure happens

include/xgboost/data.h

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -532,20 +532,23 @@ struct ExtMemConfig {
532532
// Cache prefix, not used if the cache is in the host memory. (on_host is true)
533533
std::string cache;
534534
// Whether the ellpack page is stored in the host memory.
535-
bool on_host{true};
535+
bool on_host;
536+
// Host cache/Total cache for the GPU impl.
537+
float cache_host_ratio;
536538
// Minimum number of bytes for each ellpack page in cache. Only used for in-host
537539
// ExtMemQdm.
538-
std::int64_t min_cache_page_bytes{0};
540+
std::int64_t min_cache_page_bytes;
539541
// Missing value.
540-
float missing{std::numeric_limits<float>::quiet_NaN()};
542+
float missing;
541543
// The number of CPU threads.
542544
std::int32_t n_threads{0};
543545

544-
ExtMemConfig() = default;
545-
ExtMemConfig(std::string cache, bool on_host, std::int64_t min_cache, float missing,
546-
std::int32_t n_threads)
546+
ExtMemConfig() = delete;
547+
ExtMemConfig(std::string cache, bool on_host, float h_ratio, std::int64_t min_cache,
548+
float missing, std::int32_t n_threads)
547549
: cache{std::move(cache)},
548550
on_host{on_host},
551+
cache_host_ratio{h_ratio},
549552
min_cache_page_bytes{min_cache},
550553
missing{missing},
551554
n_threads{n_threads} {}

python-package/xgboost/core.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1821,6 +1821,7 @@ def __init__( # pylint: disable=super-init-not-called
18211821
ref: Optional[DMatrix] = None,
18221822
enable_categorical: bool = False,
18231823
max_quantile_batches: Optional[int] = None,
1824+
cache_host_ratio: Optional[float] = None,
18241825
) -> None:
18251826
"""
18261827
Parameters
@@ -1831,6 +1832,15 @@ def __init__( # pylint: disable=super-init-not-called
18311832
max_quantile_batches :
18321833
See :py:class:`QuantileDMatrix`.
18331834
1835+
cache_host_ratio :
1836+
1837+
.. versionadded:: 3.1.0
1838+
1839+
Used by the GPU implementation. For GPU-based inputs, XGBoost can split the
1840+
cache into host and device caches to reduce the data transfer overhead. This
1841+
parameter specifies the size of host cache compared to the size of the
1842+
entire cache: :math:`host / (host + device)`.
1843+
18341844
"""
18351845
self.max_bin = max_bin
18361846
self.missing = missing if missing is not None else np.nan
@@ -1841,6 +1851,9 @@ def __init__( # pylint: disable=super-init-not-called
18411851
ref,
18421852
enable_categorical=enable_categorical,
18431853
max_quantile_blocks=max_quantile_batches,
1854+
cache_host_ratio=(
1855+
None if cache_host_ratio is None else float(cache_host_ratio)
1856+
),
18441857
)
18451858
assert self.handle is not None
18461859

@@ -1851,6 +1864,7 @@ def _init(
18511864
*,
18521865
enable_categorical: bool,
18531866
max_quantile_blocks: Optional[int] = None,
1867+
cache_host_ratio: Optional[float] = None,
18541868
) -> None:
18551869
args = make_jcargs(
18561870
missing=self.missing,
@@ -1861,6 +1875,7 @@ def _init(
18611875
min_cache_page_bytes=it.min_cache_page_bytes,
18621876
# It's called blocks internally due to block-based quantile sketching.
18631877
max_quantile_blocks=max_quantile_blocks,
1878+
cache_host_ratio=cache_host_ratio,
18641879
)
18651880
handle = ctypes.c_void_p()
18661881
reset_callback, next_callback = it.get_callbacks(enable_categorical)

src/c_api/c_api.cc

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -325,12 +325,15 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy
325325
cuda_impl::MatchingPageBytes());
326326
CHECK_EQ(min_cache_page_bytes, cuda_impl::MatchingPageBytes())
327327
<< "Page concatenation is not supported by the DMatrix yet.";
328+
auto cache_host_ratio =
329+
OptionalArg<Number, float>(jconfig, "cache_host_ratio", cuda_impl::AutoHostRatio());
328330

329331
xgboost_CHECK_C_ARG_PTR(next);
330332
xgboost_CHECK_C_ARG_PTR(reset);
331333
xgboost_CHECK_C_ARG_PTR(out);
332334

333-
auto config = ExtMemConfig{cache, on_host, min_cache_page_bytes, missing, n_threads};
335+
auto config =
336+
ExtMemConfig{cache, on_host, cache_host_ratio, min_cache_page_bytes, missing, n_threads};
334337
*out = new std::shared_ptr<xgboost::DMatrix>{
335338
xgboost::DMatrix::Create(iter, proxy, reset, next, config)};
336339
API_END();
@@ -393,12 +396,15 @@ XGB_DLL int XGExtMemQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr
393396
cuda_impl::AutoCachePageBytes());
394397
auto max_quantile_blocks = OptionalArg<Integer, std::int64_t>(
395398
jconfig, "max_quantile_blocks", std::numeric_limits<std::int64_t>::max());
399+
auto cache_host_ratio =
400+
OptionalArg<Number, float>(jconfig, "cache_host_ratio", cuda_impl::AutoHostRatio());
396401

397402
xgboost_CHECK_C_ARG_PTR(next);
398403
xgboost_CHECK_C_ARG_PTR(reset);
399404
xgboost_CHECK_C_ARG_PTR(out);
400405

401-
auto config = ExtMemConfig{cache, on_host, min_cache_page_bytes, missing, n_threads};
406+
auto config =
407+
ExtMemConfig{cache, on_host, cache_host_ratio, min_cache_page_bytes, missing, n_threads};
402408
*out = new std::shared_ptr<xgboost::DMatrix>{xgboost::DMatrix::Create(
403409
iter, proxy, p_ref, reset, next, max_bin, max_quantile_blocks, config)};
404410
API_END();

src/common/error_msg.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2023-2024, XGBoost contributors
2+
* Copyright 2023-2025, XGBoost contributors
33
*
44
* \brief Common error message for various checks.
55
*/
@@ -135,5 +135,12 @@ constexpr StringView NoFloatCat() {
135135
return "Category index from DataFrame has floating point dtype, consider using strings or "
136136
"integers instead.";
137137
}
138+
139+
constexpr StringView CacheHostRatioNotImpl() {
140+
return "`cache_host_ratio` is only used by the GPU `ExtMemQuantileDMatrix`.";
141+
}
142+
constexpr StringView CacheHostRatioInvalid() {
143+
return "`cache_host_ratio` must be in range [0, 1].";
144+
}
138145
} // namespace xgboost::error
139146
#endif // XGBOOST_COMMON_ERROR_MSG_H_

src/common/nvtx_utils.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,18 @@ inline auto MakeScopedRange(StringView, Rgb) { return ScopedRange{}; }
4747
} // namespace xgboost::nvtx
4848

4949
#if defined(XGBOOST_USE_NVTX)
50+
51+
// Macro for making NVTX function range.
5052
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::nvtx::Domain)
53+
54+
// Macro for making colored NVTX function range.
55+
#define xgboost_NVTX_FN_RANGE_C(r, g, b) \
56+
auto __nvtx_scoped__ = ::xgboost::nvtx::MakeScopedRange(__func__, (nvtx::Rgb((r), (g), (b))))
57+
5158
#else
59+
5260
#define xgboost_NVTX_FN_RANGE()
61+
62+
#define xgboost_NVTX_FN_RANGE_C(r, g, b)
63+
5364
#endif // defined(XGBOOST_USE_NVTX)

src/common/ref_resource_view.cuh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ template <typename T>
2929
return ref;
3030
}
3131

32+
/**
33+
* @brief Initialize the data in addition to allocation.
34+
*/
3235
template <typename T>
3336
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
3437
std::size_t n_elements, T const& init) {

src/common/ref_resource_view.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class RefResourceView {
6868

6969
[[nodiscard]] size_type size() const { return size_; } // NOLINT
7070
[[nodiscard]] size_type size_bytes() const { // NOLINT
71-
return Span<const value_type>{data(), static_cast<size_t>(size())}.size_bytes();
71+
return Span<const value_type>{data(), static_cast<std::size_t>(size())}.size_bytes();
7272
}
7373
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
7474
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT

src/data/batch_utils.cc

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2023-2024, XGBoost Contributors
2+
* Copyright 2023-2025, XGBoost Contributors
33
*/
44
#include "batch_utils.h"
55

@@ -11,4 +11,20 @@ void CheckParam(BatchParam const& init, BatchParam const& param) {
1111
CHECK(!param.regen && param.hess.empty())
1212
<< "Only the `hist` tree method can use the `QuantileDMatrix`.";
1313
}
14+
15+
[[nodiscard]] float DftHostRatio(float cache_host_ratio, bool is_validation) {
16+
if (is_validation) {
17+
// Don't split the cache if this is a validation dataset.
18+
return 1.0;
19+
}
20+
if (HostRatioIsAuto(cache_host_ratio)) {
21+
// Only NVML has the API to detect the topology. We will leave it as-is for now.
22+
cache_host_ratio = 1.0;
23+
return cache_host_ratio;
24+
}
25+
// Use user config.
26+
CHECK_GE(cache_host_ratio, 0.0f) << error::CacheHostRatioInvalid();
27+
CHECK_LE(cache_host_ratio, 1.0f) << error::CacheHostRatioInvalid();
28+
return cache_host_ratio;
29+
}
1430
} // namespace xgboost::data::detail

0 commit comments

Comments
 (0)