dmlc
diff --git a/‎CMakeLists.txt
Lines changed: 12 additions & 0 deletions b/‎CMakeLists.txt
Lines changed: 12 additions & 0 deletions
diff --git a/‎cmake/Utils.cmake
Lines changed: 8 additions & 0 deletions b/‎cmake/Utils.cmake
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/common/common.h
Lines changed: 8 additions & 2 deletions b/‎src/common/common.h
Lines changed: 8 additions & 2 deletions
diff --git a/‎src/common/cuda_dr_utils.cc
Lines changed: 3 additions & 1 deletion b/‎src/common/cuda_dr_utils.cc
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/common/cuda_dr_utils.h
Lines changed: 26 additions & 11 deletions b/‎src/common/cuda_dr_utils.h
Lines changed: 26 additions & 11 deletions
diff --git a/‎src/common/cuda_pinned_allocator.cu
Lines changed: 1 addition & 4 deletions b/‎src/common/cuda_pinned_allocator.cu
Lines changed: 1 addition & 4 deletions
@@ -72,6 +72,7 @@ option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binar
 ## CUDA
 option(USE_CUDA  "Build with GPU acceleration" OFF)
 option(USE_NCCL  "Build with NCCL to enable distributed GPU support." OFF)
+option(USE_NVCOMP "Build with nvcomp to enable sparse data compression. (experimental)" OFF)
 # This is specifically designed for PyPI binary release and should be disabled for most of the cases.
 option(USE_DLOPEN_NCCL "Whether to load nccl dynamically." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
@@ -124,6 +125,9 @@ endif()
 if(USE_NCCL AND (NOT USE_CUDA))
   message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.")
 endif()
+if(USE_NVCOMP AND (NOT USE_CUDA))
+  message(SEND_ERROR "`USE_NVCOMP` must be enabled with `USE_CUDA` flag.")
+endif()
 if(USE_DEVICE_DEBUG AND (NOT USE_CUDA))
   message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_CUDA` flag.")
 endif()
@@ -234,6 +238,14 @@ if(USE_CUDA)
   find_package(CUDAToolkit REQUIRED)
 endif()
 
+if(USE_NVCOMP)
+  find_package(nvcomp REQUIRED)
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)  # bare name: no parse error when unset
+    message(SEND_ERROR "NVComp support requires CUDA >= 12.8")
+  endif()
+endif()
+
+
 if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
     ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR
       (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
 
@@ -236,6 +236,10 @@ macro(xgboost_target_defs target)
   if(PLUGIN_RMM)
     target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1)
   endif()
+
+  if(USE_NVCOMP)
+    target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_NVCOMP=1)
+  endif()
 endmacro()
 
 # handles dependencies
@@ -262,6 +266,10 @@ macro(xgboost_target_link_libraries target)
     target_link_libraries(${target} PRIVATE rmm::rmm)
   endif()
 
+  if(USE_NVCOMP)
+    target_link_libraries(${target} PRIVATE nvcomp::nvcomp)
+  endif()
+
   if(USE_NCCL)
     xgboost_link_nccl(${target})
   endif()
 
@@ -178,13 +178,19 @@ class Range {
 
 inline void AssertGPUSupport() {
 #ifndef XGBOOST_USE_CUDA
-    LOG(FATAL) << "XGBoost version not compiled with GPU support.";
+  LOG(FATAL) << "XGBoost version not compiled with GPU support.";
+#endif  // XGBOOST_USE_CUDA
+}
+
+inline void AssertNvCompSupport() {
+#ifndef XGBOOST_USE_NVCOMP
+  LOG(FATAL) << "XGBoost is not compiled with NVCOMP support.";
 #endif  // XGBOOST_USE_NVCOMP
 }
 
 inline void AssertNCCLSupport() {
 #if !defined(XGBOOST_USE_NCCL)
-    LOG(FATAL) << "XGBoost version not compiled with NCCL support.";
+  LOG(FATAL) << "XGBoost version not compiled with NCCL support.";
 #endif  // !defined(XGBOOST_USE_NCCL)
 }
 
 
@@ -40,7 +40,9 @@ CuDriverApi::CuDriverApi() {
   safe_load("cuGetErrorName", &this->cuGetErrorName);
   safe_load("cuDeviceGetAttribute", &this->cuDeviceGetAttribute);
   safe_load("cuDeviceGet", &this->cuDeviceGet);
-
+#if defined(CUDA_HW_DECOM_AVAILABLE)
+  safe_load("cuMemBatchDecompressAsync", &this->cuMemBatchDecompressAsync);
+#endif  // defined(CUDA_HW_DECOM_AVAILABLE)
   CHECK(this->cuMemGetAllocationGranularity);
 }
 
 
@@ -15,6 +15,10 @@
 
 #include "xgboost/string_view.h"  // for StringView
 
+#if CUDART_VERSION >= 12080
+#define CUDA_HW_DECOM_AVAILABLE 1
+#endif
+
 namespace xgboost::cudr {
 /**
  * @brief A struct for retrieving CUDA driver API from the runtime API.
@@ -44,28 +48,39 @@ struct CuDriverApi {
   using DeviceGetAttribute = CUresult(int *pi, CUdevice_attribute attrib, CUdevice dev);
   using DeviceGet = CUresult(CUdevice *device, int ordinal);
 
+#if defined(CUDA_HW_DECOM_AVAILABLE)
+  using BatchDecompressAsync = CUresult(CUmemDecompressParams *paramsArray, size_t count,
+                                        unsigned int flags, size_t *errorIndex, CUstream stream);
+#endif  // defined(CUDA_HW_DECOM_AVAILABLE)
+
   MemGetAllocationGranularityFn *cuMemGetAllocationGranularity{nullptr};  // NOLINT
   MemCreateFn *cuMemCreate{nullptr};                                      // NOLINT
   /**
    * @param[in] offset - Must be zero.
    */
-  MemMapFn *cuMemMap{nullptr};                                            // NOLINT
+  MemMapFn *cuMemMap{nullptr};  // NOLINT
   /**
    * @param[out] ptr       - Resulting pointer to start of virtual address range allocated
    * @param[in]  size      - Size of the reserved virtual address range requested
    * @param[in]  alignment - Alignment of the reserved virtual address range requested
    * @param[in]  addr      - Fixed starting address range requested
    * @param[in]  flags     - Currently unused, must be zero
    */
-  MemAddressReserveFn *cuMemAddressReserve{nullptr};  // NOLINT
-  MemSetAccessFn *cuMemSetAccess{nullptr};            // NOLINT
-  MemUnmapFn *cuMemUnmap{nullptr};                    // NOLINT
-  MemReleaseFn *cuMemRelease{nullptr};                // NOLINT
-  MemAddressFreeFn *cuMemAddressFree{nullptr};        // NOLINT
-  GetErrorString *cuGetErrorString{nullptr};          // NOLINT
-  GetErrorName *cuGetErrorName{nullptr};              // NOLINT
-  DeviceGetAttribute *cuDeviceGetAttribute{nullptr};  // NOLINT
-  DeviceGet *cuDeviceGet{nullptr};                    // NOLINT
+  MemAddressReserveFn *cuMemAddressReserve{nullptr};      // NOLINT
+  MemSetAccessFn *cuMemSetAccess{nullptr};                // NOLINT
+  MemUnmapFn *cuMemUnmap{nullptr};                        // NOLINT
+  MemReleaseFn *cuMemRelease{nullptr};                    // NOLINT
+  MemAddressFreeFn *cuMemAddressFree{nullptr};            // NOLINT
+  GetErrorString *cuGetErrorString{nullptr};              // NOLINT
+  GetErrorName *cuGetErrorName{nullptr};                  // NOLINT
+  DeviceGetAttribute *cuDeviceGetAttribute{nullptr};      // NOLINT
+  DeviceGet *cuDeviceGet{nullptr};                        // NOLINT
+
+#if defined(CUDA_HW_DECOM_AVAILABLE)
+
+  BatchDecompressAsync *cuMemBatchDecompressAsync{nullptr};  // NOLINT
+
+#endif  // defined(CUDA_HW_DECOM_AVAILABLE)
 
   CuDriverApi();
 
@@ -96,7 +111,7 @@ inline auto GetAllocGranularity(CUmemAllocationProp const *prop) {
 /**
  * @brief Obtain appropriate device ordinal for `CUmemLocation`.
  */
-void MakeCuMemLocation(CUmemLocationType type, CUmemLocation* loc);
+void MakeCuMemLocation(CUmemLocationType type, CUmemLocation *loc);
 
 /**
  * @brief Construct a `CUmemAllocationProp`.
 
@@ -14,12 +14,9 @@
 #endif  // defined(XGBOOST_USE_CUDA)
 
 #include "common.h"
+#include "cuda_dr_utils.h"  // for CUDA_HW_DECOM_AVAILABLE
 #include "cuda_rt_utils.h"  // for CurrentDevice
 
-#if CUDART_VERSION >= 12080
-#define CUDA_HW_DECOM_AVAILABLE 1
-#endif
-
 namespace xgboost::common::cuda_impl {
 [[nodiscard]] MemPoolHdl CreateHostMemPool() {
   auto mem_pool = std::unique_ptr<cudaMemPool_t, void (*)(cudaMemPool_t*)>{