Skip to content

Commit f786d37

Browse files
authored
[EM] Initial support for splitting up the host cache. (#11453)
- Add a new parameter to specify the portion of the cache. - Split the host cache into host + device cache. Limitations: - Direct access to the cache page has not yet been implemented. - More work is needed to find an optimal split policy. For now, auto means host only.
1 parent bc19741 commit f786d37

27 files changed

+424
-125
lines changed

doc/tutorials/external_memory.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ the GPU. Following is a snippet from :ref:`sphx_glr_python_examples_external_mem
162162
# It's important to use RMM for GPU-based external memory to improve performance.
163163
# If XGBoost is not built with RMM support, a warning will be raised.
164164
# We use the pool memory resource here for simplicity, you can also try the
165-
`ArenaMemoryResource` for # improved memory fragmentation handling.
165+
# `ArenaMemoryResource` for improved memory fragmentation handling.
166166
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
167167
rmm.mr.set_current_device_resource(mr)
168168
# Set the allocator for cupy as well.

include/xgboost/c_api.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,10 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand
588588
* help bound the memory usage. By default, XGBoost grows new sub-streams
589589
* exponentially until batches are exhausted. Only used for the training dataset and
590590
* the default is None (unbounded).
591+
* - cache_host_ratio (optional): For GPU-based inputs, XGBoost can split the cache into
592+
* host and device portions to reduce the data transfer overhead. This parameter
593+
* specifies the size of host cache compared to the size of the entire cache:
594+
* `host / (host + device)`.
591595
* @param out The created Quantile DMatrix.
592596
*
593597
* @return 0 when success, -1 when failure happens

include/xgboost/data.h

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -532,20 +532,23 @@ struct ExtMemConfig {
532532
// Cache prefix, not used if the cache is in the host memory. (on_host is true)
533533
std::string cache;
534534
// Whether the ellpack page is stored in the host memory.
535-
bool on_host{true};
535+
bool on_host;
536+
// Host cache/Total cache for the GPU impl.
537+
float cache_host_ratio;
536538
// Minimum number of bytes for each ellpack page in cache. Only used for in-host
537539
// ExtMemQdm.
538-
std::int64_t min_cache_page_bytes{0};
540+
std::int64_t min_cache_page_bytes;
539541
// Missing value.
540-
float missing{std::numeric_limits<float>::quiet_NaN()};
542+
float missing;
541543
// The number of CPU threads.
542544
std::int32_t n_threads{0};
543545

544-
ExtMemConfig() = default;
545-
ExtMemConfig(std::string cache, bool on_host, std::int64_t min_cache, float missing,
546-
std::int32_t n_threads)
546+
ExtMemConfig() = delete;
547+
ExtMemConfig(std::string cache, bool on_host, float h_ratio, std::int64_t min_cache,
548+
float missing, std::int32_t n_threads)
547549
: cache{std::move(cache)},
548550
on_host{on_host},
551+
cache_host_ratio{h_ratio},
549552
min_cache_page_bytes{min_cache},
550553
missing{missing},
551554
n_threads{n_threads} {}

python-package/xgboost/core.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1821,6 +1821,7 @@ def __init__( # pylint: disable=super-init-not-called
18211821
ref: Optional[DMatrix] = None,
18221822
enable_categorical: bool = False,
18231823
max_quantile_batches: Optional[int] = None,
1824+
cache_host_ratio: Optional[float] = None,
18241825
) -> None:
18251826
"""
18261827
Parameters
@@ -1831,6 +1832,15 @@ def __init__( # pylint: disable=super-init-not-called
18311832
max_quantile_batches :
18321833
See :py:class:`QuantileDMatrix`.
18331834
1835+
cache_host_ratio :
1836+
1837+
.. versionadded:: 3.1.0
1838+
1839+
Used by the GPU implementation. For GPU-based inputs, XGBoost can split the
1840+
cache into host and device caches to reduce the data transfer overhead. This
1841+
parameter specifies the size of host cache compared to the size of the
1842+
entire cache: :math:`host / (host + device)`.
1843+
18341844
"""
18351845
self.max_bin = max_bin
18361846
self.missing = missing if missing is not None else np.nan
@@ -1841,6 +1851,9 @@ def __init__( # pylint: disable=super-init-not-called
18411851
ref,
18421852
enable_categorical=enable_categorical,
18431853
max_quantile_blocks=max_quantile_batches,
1854+
cache_host_ratio=(
1855+
None if cache_host_ratio is None else float(cache_host_ratio)
1856+
),
18441857
)
18451858
assert self.handle is not None
18461859

@@ -1851,6 +1864,7 @@ def _init(
18511864
*,
18521865
enable_categorical: bool,
18531866
max_quantile_blocks: Optional[int] = None,
1867+
cache_host_ratio: Optional[float] = None,
18541868
) -> None:
18551869
args = make_jcargs(
18561870
missing=self.missing,
@@ -1861,6 +1875,7 @@ def _init(
18611875
min_cache_page_bytes=it.min_cache_page_bytes,
18621876
# It's called blocks internally due to block-based quantile sketching.
18631877
max_quantile_blocks=max_quantile_blocks,
1878+
cache_host_ratio=cache_host_ratio,
18641879
)
18651880
handle = ctypes.c_void_p()
18661881
reset_callback, next_callback = it.get_callbacks(enable_categorical)

src/c_api/c_api.cc

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -325,12 +325,15 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy
325325
cuda_impl::MatchingPageBytes());
326326
CHECK_EQ(min_cache_page_bytes, cuda_impl::MatchingPageBytes())
327327
<< "Page concatenation is not supported by the DMatrix yet.";
328+
auto cache_host_ratio =
329+
OptionalArg<Number, float>(jconfig, "cache_host_ratio", cuda_impl::AutoHostRatio());
328330

329331
xgboost_CHECK_C_ARG_PTR(next);
330332
xgboost_CHECK_C_ARG_PTR(reset);
331333
xgboost_CHECK_C_ARG_PTR(out);
332334

333-
auto config = ExtMemConfig{cache, on_host, min_cache_page_bytes, missing, n_threads};
335+
auto config =
336+
ExtMemConfig{cache, on_host, cache_host_ratio, min_cache_page_bytes, missing, n_threads};
334337
*out = new std::shared_ptr<xgboost::DMatrix>{
335338
xgboost::DMatrix::Create(iter, proxy, reset, next, config)};
336339
API_END();
@@ -393,12 +396,15 @@ XGB_DLL int XGExtMemQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr
393396
cuda_impl::AutoCachePageBytes());
394397
auto max_quantile_blocks = OptionalArg<Integer, std::int64_t>(
395398
jconfig, "max_quantile_blocks", std::numeric_limits<std::int64_t>::max());
399+
auto cache_host_ratio =
400+
OptionalArg<Number, float>(jconfig, "cache_host_ratio", cuda_impl::AutoHostRatio());
396401

397402
xgboost_CHECK_C_ARG_PTR(next);
398403
xgboost_CHECK_C_ARG_PTR(reset);
399404
xgboost_CHECK_C_ARG_PTR(out);
400405

401-
auto config = ExtMemConfig{cache, on_host, min_cache_page_bytes, missing, n_threads};
406+
auto config =
407+
ExtMemConfig{cache, on_host, cache_host_ratio, min_cache_page_bytes, missing, n_threads};
402408
*out = new std::shared_ptr<xgboost::DMatrix>{xgboost::DMatrix::Create(
403409
iter, proxy, p_ref, reset, next, max_bin, max_quantile_blocks, config)};
404410
API_END();

src/common/error_msg.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2023-2024, XGBoost contributors
2+
* Copyright 2023-2025, XGBoost contributors
33
*
44
* \brief Common error message for various checks.
55
*/
@@ -135,5 +135,12 @@ constexpr StringView NoFloatCat() {
135135
return "Category index from DataFrame has floating point dtype, consider using strings or "
136136
"integers instead.";
137137
}
138+
139+
constexpr StringView CacheHostRatioNotImpl() {
140+
return "`cache_host_ratio` is only used by the GPU `ExtMemQuantileDMatrix`.";
141+
}
142+
constexpr StringView CacheHostRatioInvalid() {
143+
return "`cache_host_ratio` must be in range [0, 1].";
144+
}
138145
} // namespace xgboost::error
139146
#endif // XGBOOST_COMMON_ERROR_MSG_H_

src/common/nvtx_utils.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,18 @@ inline auto MakeScopedRange(StringView, Rgb) { return ScopedRange{}; }
4747
} // namespace xgboost::nvtx
4848

4949
#if defined(XGBOOST_USE_NVTX)
50+
51+
// Macro for making NVTX function range.
5052
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::nvtx::Domain)
53+
54+
// Macro for making colored NVTX function range.
55+
#define xgboost_NVTX_FN_RANGE_C(r, g, b) \
56+
auto __nvtx_scoped__ = ::xgboost::nvtx::MakeScopedRange(__func__, (nvtx::Rgb((r), (g), (b))))
57+
5158
#else
59+
5260
#define xgboost_NVTX_FN_RANGE()
61+
62+
#define xgboost_NVTX_FN_RANGE_C(r, g, b)
63+
5364
#endif // defined(XGBOOST_USE_NVTX)

src/common/ref_resource_view.cuh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ template <typename T>
2929
return ref;
3030
}
3131

32+
/**
33+
* @brief Initialize the data in addition to allocation.
34+
*/
3235
template <typename T>
3336
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
3437
std::size_t n_elements, T const& init) {

src/common/ref_resource_view.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class RefResourceView {
6868

6969
[[nodiscard]] size_type size() const { return size_; } // NOLINT
7070
[[nodiscard]] size_type size_bytes() const { // NOLINT
71-
return Span<const value_type>{data(), static_cast<size_t>(size())}.size_bytes();
71+
return Span<const value_type>{data(), static_cast<std::size_t>(size())}.size_bytes();
7272
}
7373
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
7474
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT

src/data/batch_utils.cc

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2023-2024, XGBoost Contributors
2+
* Copyright 2023-2025, XGBoost Contributors
33
*/
44
#include "batch_utils.h"
55

@@ -11,4 +11,20 @@ void CheckParam(BatchParam const& init, BatchParam const& param) {
1111
CHECK(!param.regen && param.hess.empty())
1212
<< "Only the `hist` tree method can use the `QuantileDMatrix`.";
1313
}
14+
15+
[[nodiscard]] float DftHostRatio(float cache_host_ratio, bool is_validation) {
16+
if (is_validation) {
17+
// Don't split the cache if this is a validation dataset.
18+
return 1.0;
19+
}
20+
if (HostRatioIsAuto(cache_host_ratio)) {
21+
// Only NVML has the API to detect the topology. We will leave it as-is for now.
22+
cache_host_ratio = 1.0;
23+
return cache_host_ratio;
24+
}
25+
// Use user config.
26+
CHECK_GE(cache_host_ratio, 0.0f) << error::CacheHostRatioInvalid();
27+
CHECK_LE(cache_host_ratio, 1.0f) << error::CacheHostRatioInvalid();
28+
return cache_host_ratio;
29+
}
1430
} // namespace xgboost::data::detail

0 commit comments

Comments
 (0)