
Commit 8e19009

Convert part of RMM to a precompiled library (#1896)
This PR is a starting point for #1779. It converts RMM to a precompiled shared library, and moves some implementations into `src/` files.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: #1896
1 parent dbd8cc7 commit 8e19009
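At a high level, the change repeats one pattern across the headers: a function that used to be defined in a header (often `inline` or `constexpr`) becomes a plain declaration, and its body moves into a `src/*.cpp` file compiled into the new `librmm.so`. A minimal sketch of that split, using `is_pow2` from the `aligned.hpp` diff below; the `src/` side here is reconstructed from the deleted header code, not copied from the PR:

```cpp
#include <cstddef>

// Header (aligned.hpp): declaration only. Note that constexpr is dropped;
// a function whose body lives in the shared library can no longer be
// evaluated at compile time by callers that only see this declaration.
namespace rmm {
[[nodiscard]] bool is_pow2(std::size_t value) noexcept;
}  // namespace rmm

// Library (src/aligned.cpp): the definition, compiled once into librmm.so.
namespace rmm {
[[nodiscard]] bool is_pow2(std::size_t value) noexcept
{
  return (value != 0U) && ((value & (value - 1)) == 0U);
}
}  // namespace rmm
```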

File tree: 15 files changed, +470 −199 lines changed


ci/build_wheel_python.sh

Lines changed: 5 additions & 1 deletion
@@ -34,8 +34,12 @@ rapids-telemetry-record sccache-stats.txt sccache --show-adv-stats
 
 EXCLUDE_ARGS=(
   --exclude "librapids_logger.so"
+  --exclude "librmm.so"
 )
-python -m auditwheel repair "${EXCLUDE_ARGS[@]}" -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" dist/*
+python -m auditwheel repair \
+  "${EXCLUDE_ARGS[@]}" \
+  -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \
+  dist/*
 
 ../../ci/validate_wheel.sh "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}"
 

conda/recipes/librmm/recipe.yaml

Lines changed: 12 additions & 0 deletions
@@ -69,19 +69,31 @@ outputs:
       - cmake --install cpp/build
     dynamic_linking:
       overlinking_behavior: "error"
+    prefix_detection:
+      ignore:
+        # See https://github.com/rapidsai/build-planning/issues/160
+        - lib/librmm.so
     requirements:
       build:
         - cmake ${{ cmake_version }}
+        - ${{ stdlib("c") }}
       host:
         - cuda-version =${{ cuda_version }}
+        - if: cuda_major == "11"
+          then: cudatoolkit
+          else: cuda-cudart-dev
       run:
         - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
         - if: cuda_major == "11"
           then: cudatoolkit
+          else: cuda-cudart
         - rapids-logger =0.1
       run_exports:
         - ${{ pin_subpackage("librmm", upper_bound="x.x") }}
       ignore_run_exports:
+        from_package:
+          - if: cuda_major != "11"
+            then: cuda-cudart-dev
         by_name:
           - cuda-version
           - if: cuda_major == "11"

cpp/CMakeLists.txt

Lines changed: 21 additions & 12 deletions
@@ -83,31 +83,40 @@ include(cmake/thirdparty/get_nvtx.cmake)
 # ##################################################################################################
 # * library targets --------------------------------------------------------------------------------
 
-add_library(rmm INTERFACE)
+add_library(rmm src/aligned.cpp src/cuda_device.cpp src/cuda_stream_pool.cpp
+            src/cuda_stream_view.cpp src/cuda_stream.cpp)
 add_library(rmm::rmm ALIAS rmm)
 
 target_include_directories(
   rmm
-  INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-            "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>" "$<INSTALL_INTERFACE:include>")
+  PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+         "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>"
+  INTERFACE "$<INSTALL_INTERFACE:include>")
 
 if(CUDA_STATIC_RUNTIME)
   message(STATUS "RMM: Enabling static linking of cudart")
-  target_link_libraries(rmm INTERFACE CUDA::cudart_static)
+  target_link_libraries(rmm PUBLIC CUDA::cudart_static)
 else()
-  target_link_libraries(rmm INTERFACE CUDA::cudart)
+  target_link_libraries(rmm PUBLIC CUDA::cudart)
 endif()
 
-target_link_libraries(rmm INTERFACE CCCL::CCCL)
-target_link_libraries(rmm INTERFACE dl)
-target_link_libraries(rmm INTERFACE nvtx3::nvtx3-cpp)
-target_link_libraries(rmm INTERFACE rapids_logger::rapids_logger)
-target_compile_features(rmm INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
-target_compile_definitions(rmm INTERFACE LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
+target_link_libraries(rmm PUBLIC CCCL::CCCL ${CMAKE_DL_LIBS} nvtx3::nvtx3-cpp
+                      rapids_logger::rapids_logger)
+
+set_target_properties(
+  rmm
+  PROPERTIES BUILD_RPATH "\$ORIGIN"
+             INSTALL_RPATH "\$ORIGIN"
+             CXX_STANDARD 17
+             CXX_STANDARD_REQUIRED ON
+             CXX_VISIBILITY_PRESET hidden
+             POSITION_INDEPENDENT_CODE ON
+             INTERFACE_POSITION_INDEPENDENT_CODE ON)
+target_compile_definitions(rmm PUBLIC LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
 
 # Enable NVTX if necessary
 if(RMM_NVTX)
-  target_compile_definitions(rmm INTERFACE RMM_NVTX)
+  target_compile_definitions(rmm PUBLIC RMM_NVTX)
 endif()
 
 # ##################################################################################################
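Two of the new target properties work together: `CXX_VISIBILITY_PRESET hidden` hides every symbol of the library by default, so the public API has to be re-exported explicitly, which the headers do by attaching an export attribute to the `rmm` namespace (the `RMM_EXPORT` spelling visible in the header diffs below). A hedged sketch of how such a macro can be defined for GCC/Clang; RMM's actual `rmm/detail/export.hpp` may spell it differently:

```cpp
#include <cstddef>

// Hypothetical export macro; the real rmm/detail/export.hpp may differ.
#if defined(__GNUC__) || defined(__clang__)
#define RMM_EXPORT [[gnu::visibility("default")]]
#else
#define RMM_EXPORT
#endif

// Attached to the namespace (allowed since C++17), the attribute gives
// every declaration inside default visibility even though the library is
// compiled with hidden visibility by default.
namespace RMM_EXPORT rmm {
[[nodiscard]] bool is_pow2(std::size_t value) noexcept;  // exported from librmm.so
}  // namespace RMM_EXPORT rmm
```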

cpp/include/rmm/aligned.hpp

Lines changed: 10 additions & 32 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@
 #include <cstddef>
 #include <cstdint>
 
-namespace RMM_NAMESPACE {
+namespace RMM_EXPORT rmm {
 
 /**
  * @addtogroup utilities
@@ -49,10 +49,7 @@ static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT{256};
  *
  * @return True if the input is a power of two with non-negative integer exponent, false otherwise.
  */
-[[nodiscard]] constexpr bool is_pow2(std::size_t value) noexcept
-{
-  return (value != 0U) && ((value & (value - 1)) == 0U);
-}
+[[nodiscard]] bool is_pow2(std::size_t value) noexcept;
 
 /**
  * @brief Returns whether or not `alignment` is a valid memory alignment.
@@ -61,10 +58,7 @@ static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT{256};
  *
  * @return True if the alignment is valid, false otherwise.
  */
-[[nodiscard]] constexpr bool is_supported_alignment(std::size_t alignment) noexcept
-{
-  return is_pow2(alignment);
-}
+[[nodiscard]] bool is_supported_alignment(std::size_t alignment) noexcept;
 
 /**
  * @brief Align up to nearest multiple of specified power of 2
@@ -74,11 +68,7 @@ static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT{256};
  *
  * @return the aligned value
 */
-[[nodiscard]] constexpr std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
-{
-  assert(is_supported_alignment(alignment));
-  return (value + (alignment - 1)) & ~(alignment - 1);
-}
+[[nodiscard]] std::size_t align_up(std::size_t value, std::size_t alignment) noexcept;
 
 /**
  * @brief Align down to the nearest multiple of specified power of 2
@@ -88,11 +78,7 @@ static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT{256};
  *
  * @return the aligned value
 */
-[[nodiscard]] constexpr std::size_t align_down(std::size_t value, std::size_t alignment) noexcept
-{
-  assert(is_supported_alignment(alignment));
-  return value & ~(alignment - 1);
-}
+[[nodiscard]] std::size_t align_down(std::size_t value, std::size_t alignment) noexcept;
 
 /**
  * @brief Checks whether a value is aligned to a multiple of a specified power of 2
@@ -102,11 +88,7 @@ static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT{256};
  *
  * @return true if aligned
 */
-[[nodiscard]] constexpr bool is_aligned(std::size_t value, std::size_t alignment) noexcept
-{
-  assert(is_supported_alignment(alignment));
-  return value == align_down(value, alignment);
-}
+[[nodiscard]] bool is_aligned(std::size_t value, std::size_t alignment) noexcept;
 
 /**
  * @brief Checks whether the provided pointer is aligned to a specified @p alignment
@@ -116,13 +98,9 @@ static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT{256};
  *
  * @return true if the pointer is aligned
 */
-[[nodiscard]] inline bool is_pointer_aligned(
-  void* ptr, std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT) noexcept
-{
-  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
-  return is_aligned(reinterpret_cast<std::uintptr_t>(ptr), alignment);
-}
+[[nodiscard]] bool is_pointer_aligned(void* ptr,
+                                      std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT) noexcept;
 
 /** @} */  // end of group
 
-}  // namespace RMM_NAMESPACE
+}  // namespace RMM_EXPORT rmm
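The deleted inline bodies presumably land in the new `src/aligned.cpp` with little change. A sketch of what that file plausibly contains, reconstructed from the definitions removed above rather than copied from the PR (note the default argument of `is_pointer_aligned` stays in the header; the out-of-line definition must omit it):

```cpp
// Plausible src/aligned.cpp, reconstructed from the inline bodies
// deleted from aligned.hpp above.
#include <rmm/aligned.hpp>

#include <cassert>
#include <cstddef>
#include <cstdint>

namespace rmm {

[[nodiscard]] bool is_pow2(std::size_t value) noexcept
{
  return (value != 0U) && ((value & (value - 1)) == 0U);
}

[[nodiscard]] bool is_supported_alignment(std::size_t alignment) noexcept
{
  return is_pow2(alignment);
}

[[nodiscard]] std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
{
  assert(is_supported_alignment(alignment));
  return (value + (alignment - 1)) & ~(alignment - 1);
}

[[nodiscard]] std::size_t align_down(std::size_t value, std::size_t alignment) noexcept
{
  assert(is_supported_alignment(alignment));
  return value & ~(alignment - 1);
}

[[nodiscard]] bool is_aligned(std::size_t value, std::size_t alignment) noexcept
{
  assert(is_supported_alignment(alignment));
  return value == align_down(value, alignment);
}

[[nodiscard]] bool is_pointer_aligned(void* ptr, std::size_t alignment) noexcept
{
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
  return is_aligned(reinterpret_cast<std::uintptr_t>(ptr), alignment);
}

}  // namespace rmm
```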

cpp/include/rmm/cuda_device.hpp

Lines changed: 9 additions & 42 deletions
@@ -16,18 +16,15 @@
 #pragma once
 
 #include <rmm/aligned.hpp>
-#include <rmm/detail/error.hpp>
 #include <rmm/detail/export.hpp>
 
-#include <cuda_runtime_api.h>
-
 #include <cstddef>
 #include <utility>
 
-namespace RMM_NAMESPACE {
+namespace RMM_EXPORT rmm {
 
 struct cuda_device_id;
-inline cuda_device_id get_current_cuda_device();
+cuda_device_id get_current_cuda_device();
 
 /**
  * @addtogroup cuda_device_management
@@ -96,37 +93,21 @@ struct cuda_device_id {
  *
  * @return `cuda_device_id` for the current device
 */
-inline cuda_device_id get_current_cuda_device()
-{
-  cuda_device_id::value_type dev_id{-1};
-  RMM_ASSERT_CUDA_SUCCESS(cudaGetDevice(&dev_id));
-  return cuda_device_id{dev_id};
-}
+cuda_device_id get_current_cuda_device();
 
 /**
  * @brief Returns the number of CUDA devices in the system
  *
  * @return Number of CUDA devices in the system
 */
-inline int get_num_cuda_devices()
-{
-  cuda_device_id::value_type num_dev{-1};
-  RMM_ASSERT_CUDA_SUCCESS(cudaGetDeviceCount(&num_dev));
-  return num_dev;
-}
+int get_num_cuda_devices();
 
 /**
  * @brief Returns the available and total device memory in bytes for the current device
  *
  * @return The available and total device memory in bytes for the current device as a std::pair.
 */
-inline std::pair<std::size_t, std::size_t> available_device_memory()
-{
-  std::size_t free{};
-  std::size_t total{};
-  RMM_CUDA_TRY(cudaMemGetInfo(&free, &total));
-  return {free, total};
-}
+std::pair<std::size_t, std::size_t> available_device_memory();
 
 /**
  * @brief Returns the approximate specified percent of available device memory on the current CUDA
@@ -136,13 +117,7 @@ inline std::pair<std::size_t, std::size_t> available_device_memory()
  *
  * @return The recommended initial device memory pool size in bytes.
 */
-inline std::size_t percent_of_free_device_memory(int percent)
-{
-  [[maybe_unused]] auto const [free, total] = rmm::available_device_memory();
-  auto fraction = static_cast<double>(percent) / 100.0;
-  return rmm::align_down(static_cast<std::size_t>(static_cast<double>(free) * fraction),
-                         rmm::CUDA_ALLOCATION_ALIGNMENT);
-}
+std::size_t percent_of_free_device_memory(int percent);
 
 /**
  * @brief RAII class that sets the current CUDA device to the specified device on construction
@@ -154,19 +129,11 @@ struct cuda_set_device_raii {
  *
  * @param dev_id The device to set as the current CUDA device
 */
-explicit cuda_set_device_raii(cuda_device_id dev_id)
-  : old_device_{get_current_cuda_device()},
-    needs_reset_{dev_id.value() >= 0 && old_device_ != dev_id}
-{
-  if (needs_reset_) { RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value())); }
-}
+explicit cuda_set_device_raii(cuda_device_id dev_id);
 /**
  * @brief Reactivates the previous CUDA device
 */
-~cuda_set_device_raii() noexcept
-{
-  if (needs_reset_) { RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value())); }
-}
+~cuda_set_device_raii() noexcept;
 
 cuda_set_device_raii(cuda_set_device_raii const&) = delete;
 cuda_set_device_raii& operator=(cuda_set_device_raii const&) = delete;
@@ -179,4 +146,4 @@
 };
 
 /** @} */  // end of group
-}  // namespace RMM_NAMESPACE
+}  // namespace RMM_EXPORT rmm
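Similarly, the device-query bodies presumably move into `src/cuda_device.cpp`, which is also why `<rmm/detail/error.hpp>` and `<cuda_runtime_api.h>` disappear from the public header: only the translation unit that defines these functions still needs the CUDA runtime API and RMM's error-checking macros. A sketch of part of that file, reconstructed from the definitions removed above, not copied from the PR:

```cpp
// Plausible excerpt of src/cuda_device.cpp, reconstructed from the inline
// bodies deleted from cuda_device.hpp above. The CUDA runtime header and
// the error-checking macros are now private to this translation unit.
#include <rmm/cuda_device.hpp>
#include <rmm/detail/error.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <utility>

namespace rmm {

cuda_device_id get_current_cuda_device()
{
  cuda_device_id::value_type dev_id{-1};
  RMM_ASSERT_CUDA_SUCCESS(cudaGetDevice(&dev_id));
  return cuda_device_id{dev_id};
}

int get_num_cuda_devices()
{
  cuda_device_id::value_type num_dev{-1};
  RMM_ASSERT_CUDA_SUCCESS(cudaGetDeviceCount(&num_dev));
  return num_dev;
}

std::pair<std::size_t, std::size_t> available_device_memory()
{
  std::size_t free{};
  std::size_t total{};
  RMM_CUDA_TRY(cudaMemGetInfo(&free, &total));
  return {free, total};
}

}  // namespace rmm
```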
