Use CUDA events instead of CUDA device/stream synchronization #225

Merged: 56 commits, May 5, 2025
Changes from 40 commits

Commits
2c0dac4
Use CUDA events to track `Buffer` copy progress
pentschev Apr 23, 2025
1202c50
Test for copy completion
pentschev Apr 23, 2025
bc40a83
Throw exception if buffer copy didn't complete before `send()`
pentschev Apr 23, 2025
6954a34
Only send GPU data if copy is complete
pentschev Apr 23, 2025
c3b04e3
Prevent receiving on incomplete `Buffer` allocation
pentschev Apr 24, 2025
ecb9e2d
Use `BufferWithEvent`
pentschev Apr 24, 2025
fa236cb
Log destruction/release if allocation is not complete
pentschev Apr 24, 2025
e393aa0
Merge branch 'branch-25.06' into buffer-cuda-event
pentschev Apr 24, 2025
c23e13f
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 24, 2025
33015af
Disable event tracking instead of logger
pentschev Apr 24, 2025
2699326
Merge remote-tracking branch 'origin/buffer-cuda-event' into buffer-c…
pentschev Apr 24, 2025
23673ab
Replace incorrect use of `RAPIDSMPF_CUDA_TRY_ALLOC`
pentschev Apr 24, 2025
1180b8e
Create events with `cudaEventDisableTiming`
pentschev Apr 24, 2025
aa2ade6
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 24, 2025
7714a6f
Simplify condition
pentschev Apr 24, 2025
eded688
Make synchronization the user's responsibility before `send()`
pentschev Apr 24, 2025
8887137
Merge remote-tracking branch 'origin/buffer-cuda-event' into buffer-c…
pentschev Apr 24, 2025
0bac2a6
Fix style
pentschev Apr 24, 2025
5f04b7a
Chunk CUDA event
pentschev Apr 28, 2025
01b2717
Ensure ready-for-data messages are not lost
pentschev Apr 29, 2025
95cb163
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 29, 2025
42d2b9f
Prevent reinsertion
pentschev Apr 29, 2025
f019d8d
Ensure thread-safety of `Chunk::Event::is_done()`
pentschev Apr 30, 2025
0226f52
Revert "Ensure ready-for-data messages are not lost"
pentschev Apr 30, 2025
a56070c
Only extract chunks that are ready from inbox
pentschev Apr 30, 2025
aaad38c
Combine all event checks within `Chunk`
pentschev Apr 30, 2025
02b6c01
Remove redundant check while processing `outgoing_chunks_`
pentschev Apr 30, 2025
1f41e92
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 30, 2025
4ae9a4e
Provide CUDA event for all Buffer relevant cases, remove BufferWithEvent
pentschev Apr 30, 2025
63f819e
Construct `Buffer` pointers without destroying original object
pentschev Apr 30, 2025
71a8786
Improvements
pentschev Apr 30, 2025
3055053
Clarify use of streams
pentschev May 1, 2025
725c3aa
Simplify `is_ready()` condition
pentschev May 1, 2025
21f04a1
Avoid `Chunk::Event` use-after-free
pentschev May 1, 2025
11bac26
Move all CUDA events into `Buffer`
pentschev May 1, 2025
dfd6e70
Move `Event` definition to `buffer.hpp`
pentschev May 1, 2025
6795eff
Update `Chunk::is_ready()` docstring
pentschev May 1, 2025
dc1d898
Allow use of a shared `Buffer::Event`
pentschev May 1, 2025
b448c7e
Fix use-after-free in `Buffer::Event`
pentschev May 1, 2025
007a536
Prevent constness cast
pentschev May 1, 2025
8f6ecaf
Remove IncomingChunk
pentschev May 2, 2025
10fc871
Fix Event destruction (again)
pentschev May 2, 2025
e2d8c9c
Fix Event smart-pointer type
pentschev May 2, 2025
27f856c
Simplify `is_ready()` condition
pentschev May 2, 2025
640c2bf
Fixed docstring
pentschev May 2, 2025
40ebb2a
Fix build errors
pentschev May 2, 2025
3f65ea3
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev May 5, 2025
17c01e2
Simplify `Event` destructor to rely on smart-pointer for thread-safety
pentschev May 5, 2025
8627cfc
Improve docs
pentschev May 5, 2025
afd84a1
Typo fixes
pentschev May 5, 2025
cb0b159
Terminate if buffers are not ready for use
pentschev May 5, 2025
1f2f966
Merge remote-tracking branch 'origin/buffer-cuda-event' into buffer-c…
pentschev May 5, 2025
de3db36
Remove expected_num_chunks from Chunks constructor with gpu_data
pentschev May 5, 2025
294d5b9
Fix Chunk::is_ready condition
pentschev May 5, 2025
6a1ae03
Update condition and is_ready docstring
pentschev May 5, 2025
16c8b17
Check for gpu_data_size
pentschev May 5, 2025
70 changes: 65 additions & 5 deletions cpp/include/rapidsmpf/buffer/buffer.hpp
@@ -5,17 +5,22 @@
#pragma once

#include <array>
#include <atomic>
#include <memory>
#include <mutex>
#include <variant>
#include <vector>

#include <cuda_runtime.h>

#include <rmm/device_buffer.hpp>

#include <rapidsmpf/error.hpp>

namespace rapidsmpf {

class BufferResource;
class Event;

/// @brief Enum representing the type of memory.
enum class MemoryType : int {
@@ -37,6 +42,44 @@ class Buffer {
friend class BufferResource;

public:
/**
* @brief CUDA event to provide synchronization among a set of chunks.
*
* This event serves as a synchronization point for a set of chunks on a
* user-specified stream.
*/
class Event {
public:
/**
* @brief Construct a CUDA event for a given stream.
*
* @param stream CUDA stream used for device memory operations.
*/
Event(rmm::cuda_stream_view stream);

/**
* @brief Destructor for Event.
*
* Cleans up the CUDA event if one was created.
*/
~Event();

/**
* @brief Check if the CUDA event has been completed.
*
* @return true if the event has been completed, false otherwise.
*/
[[nodiscard]] bool is_ready();

private:
cudaEvent_t event_; ///< CUDA event used to track device memory allocation.
std::atomic<bool> done_{false}; ///< Cache of the event status to avoid unnecessary queries.
mutable std::mutex mutex_; ///< Protects access to event_.
std::atomic<bool> destroying_{false}; ///< Flag to indicate destruction in progress.
};

/// @brief Storage type for the device buffer.
using DeviceStorageT = std::unique_ptr<rmm::device_buffer>;

@@ -122,8 +165,16 @@
);
}

/// @brief Buffer has a move ctor but no copy or assign operator.
Buffer(Buffer&&) = default;
/**
* @brief Check if the device memory operation has completed.
*
* @return true if the device memory operation has completed or no device
* memory operation was performed, false if it is still in progress.
*/
[[nodiscard]] bool is_ready() const;

/// @brief Delete move and copy constructors and assignment operators.
Buffer(Buffer&&) = delete;
Buffer(Buffer const&) = delete;
Buffer& operator=(Buffer& o) = delete;
Buffer& operator=(Buffer&& o) = delete;
@@ -143,13 +194,20 @@
* @brief Construct a Buffer from device memory.
*
* @param device_buffer A unique pointer to a device buffer.
* @param stream CUDA stream used for the device buffer allocation.
* @param br Buffer resource for memory allocation.
* @param event The shared event to use for the buffer.
*
* @throws std::invalid_argument if `device_buffer` is null.
* @throws std::invalid_argument if `stream` or `br->mr` isn't the same used by
* `device_buffer`.
*/
Buffer(std::unique_ptr<rmm::device_buffer> device_buffer, BufferResource* br);
Buffer(
std::unique_ptr<rmm::device_buffer> device_buffer,
rmm::cuda_stream_view stream,
BufferResource* br,
std::shared_ptr<Event> event = nullptr
);

/**
* @brief Access the underlying host memory buffer.
@@ -184,7 +242,7 @@
/**
* @brief Create a copy of this buffer using the same memory type.
*
* @param stream CUDA stream used for device memory operations.
* @param stream CUDA stream used for the device buffer allocation and copy.
* @return A unique pointer to a new Buffer containing the copied data.
*/
[[nodiscard]] std::unique_ptr<Buffer> copy(rmm::cuda_stream_view stream) const;
@@ -193,7 +251,7 @@
* @brief Create a copy of this buffer using the specified memory type.
*
* @param target The target memory type.
* @param stream CUDA stream used for device memory operations.
* @param stream CUDA stream used for device buffer allocation and copy.
* @return A unique pointer to a new Buffer containing the copied data.
*/
[[nodiscard]] std::unique_ptr<Buffer> copy(
@@ -208,6 +266,8 @@
/// @brief The underlying storage host memory or device memory buffer (where
/// applicable).
StorageT storage_;
/// @brief CUDA event used to track copy operations
std::shared_ptr<Event> event_;
};

} // namespace rapidsmpf
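The `done_`/`mutex_` members added above suggest a query-and-cache pattern for `Event::is_ready()`. The following is a minimal, CPU-only sketch of that pattern; `query_event_stub` is a hypothetical stand-in for the driver query, which in the real class would presumably be `cudaEventQuery(event_) == cudaSuccess`:

```cpp
#include <atomic>
#include <cassert>
#include <mutex>

// Hypothetical stand-in for cudaEventQuery(): reports completion after a
// fixed number of polls, so the example is runnable without a GPU.
static bool query_event_stub(int& countdown) {
    if (countdown > 0) {
        --countdown;
        return false;
    }
    return true;
}

// Sketch of the caching pattern implied by Event's members: once the event is
// observed complete, cache the result in an atomic so later calls skip the
// comparatively expensive driver query.
class EventSketch {
  public:
    explicit EventSketch(int pending_polls) : countdown_(pending_polls) {}

    bool is_ready() {
        if (done_.load(std::memory_order_acquire)) {
            return true;  // fast path: completion already cached
        }
        std::lock_guard<std::mutex> lock(mutex_);  // serialize queries
        if (query_event_stub(countdown_)) {
            done_.store(true, std::memory_order_release);
            return true;
        }
        return false;
    }

  private:
    int countdown_;                  // stand-in for pending GPU work
    std::atomic<bool> done_{false};  // cached completion status
    mutable std::mutex mutex_;       // protects the underlying query
};
```

Note the one-way transition of `done_`: a CUDA event, once complete, stays complete, which is what makes caching the answer safe.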
16 changes: 11 additions & 5 deletions cpp/include/rapidsmpf/buffer/resource.hpp
@@ -256,9 +262,15 @@
* @brief Move device buffer data into a Buffer.
*
* @param data A unique pointer to the device buffer.
* @param stream CUDA stream used for the data allocation, copy, and/or move.
* @param event The event to use for the buffer.
* @return A unique pointer to the resulting Buffer.
*/
std::unique_ptr<Buffer> move(std::unique_ptr<rmm::device_buffer> data);
std::unique_ptr<Buffer> move(
std::unique_ptr<rmm::device_buffer> data,
rmm::cuda_stream_view stream,
std::shared_ptr<Buffer::Event> event = nullptr
);

/**
* @brief Move a Buffer to the specified memory type.
@@ -267,7 +273,7 @@
*
* @param target The target memory type.
* @param buffer The buffer to move.
* @param stream CUDA stream for the operation.
* @param stream CUDA stream used for the buffer allocation, copy, and/or move.
* @param reservation The reservation to use for memory allocations.
* @return A unique pointer to the moved Buffer.
*
@@ -287,7 +293,7 @@
* If and only if moving between different memory types will this perform a copy.
*
* @param buffer The buffer to move.
* @param stream CUDA stream for the operation.
* @param stream CUDA stream used for the buffer allocation, copy, and/or move.
* @param reservation The reservation to use for memory allocations.
* @return A unique pointer to the resulting device buffer.
*
@@ -307,7 +313,7 @@
* If and only if moving between different memory types will this perform a copy.
*
* @param buffer The buffer to move.
* @param stream CUDA stream for the operation.
* @param stream CUDA stream used for the buffer allocation, copy, and/or move.
* @param reservation The reservation to use for memory allocations.
* @return A unique pointer to the resulting host vector.
*
@@ -328,7 +334,7 @@
*
* @param target The target memory type.
* @param buffer The buffer to copy.
* @param stream CUDA stream for the operation.
* @param stream CUDA stream used for the buffer allocation and copy.
* @param reservation The reservation to use for memory allocations.
* @return A unique pointer to the new Buffer.
*
10 changes: 10 additions & 0 deletions cpp/include/rapidsmpf/communicator/communicator.hpp
@@ -426,6 +426,11 @@
* @param rank The destination rank.
* @param tag Message tag for identification.
* @return A unique pointer to a `Future` representing the asynchronous operation.
*
* @warning The caller is responsible for ensuring that the underlying `Buffer`
* allocation and data are already valid before calling, for example, when a CUDA
* allocation and/or copy is done asynchronously. Specifically, the caller should
* ensure `Buffer::is_ready()` returns true before calling this function.
*/
[[nodiscard]] virtual std::unique_ptr<Future> send(
std::unique_ptr<Buffer> msg, Rank rank, Tag tag
@@ -438,6 +443,11 @@
* @param tag Message tag for identification.
* @param recv_buffer The receive buffer.
* @return A unique pointer to a `Future` representing the asynchronous operation.
*
* @warning The caller is responsible for ensuring that the underlying `Buffer`
* allocation is already valid before calling, for example, when a CUDA
* allocation and/or copy is done asynchronously. Specifically, the caller
* should ensure `Buffer::is_ready()` returns true before calling this function.
[Review comments on this warning:]

Member:
Let's throw an exception if Buffer::is_ready() == false?

pentschev (Author):
No, unfortunately that doesn't work, as @nirandaperera has noted previously in #225 (comment). Doing that means we lose the std::unique_ptr<Buffer>.

madsbk (Member), May 5, 2025:
Still, I think we should check Buffer::is_ready() == false. It might be unrecoverable but still better than a segfault. But let's make it clear in the doc that the buffer has been moved and freed!

pentschev (Author):
How about a warning instead? Note that the exception will be raised in the shuffler's progress thread, so we'll probably need to handle it in some way. I know a warning is not a solution for the potential of a segfault but I think that will more clearly inform the user about what happened, rather than an exception which may not be handled correctly.

pentschev (Author):
That works too, I would suggest that we do this now and add a new ABORT log-level that both logs an error and immediately terminates. WDYT?

madsbk (Member):
Sounds good!

pentschev (Author):
Done in cb0b159 and opened #246 to track this as well.
*/
[[nodiscard]] virtual std::unique_ptr<Future> recv(
Rank rank, Tag tag, std::unique_ptr<Buffer> recv_buffer
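The `@warning` contract above makes readiness the caller's responsibility: hand a buffer to `send()`/`recv()` only once `is_ready()` reports true. A small sketch of that caller-side pattern, with a hypothetical `FakeBuffer` standing in for the real `Buffer` (a real progress loop would requeue the chunk rather than spin):

```cpp
#include <cassert>
#include <thread>

// Hypothetical stand-in for Buffer: is_ready() flips to true once the
// asynchronous allocation/copy tracked by its CUDA event has finished,
// simulated here by a poll countdown.
struct FakeBuffer {
    int polls_until_ready;
    bool is_ready() { return polls_until_ready-- <= 0; }
};

// Caller-side pattern implied by the @warning: defer the communication call
// until the buffer's event has completed.
bool wait_until_ready(FakeBuffer& buf) {
    while (!buf.is_ready()) {
        std::this_thread::yield();  // real code: retry on next progress tick
    }
    return true;  // now safe to call Communicator::send()/recv()
}
```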
16 changes: 16 additions & 0 deletions cpp/include/rapidsmpf/shuffler/chunk.hpp
@@ -4,14 +4,17 @@
*/
#pragma once

#include <atomic>
#include <memory>
#include <mutex>
#include <sstream>
#include <vector>

#include <cudf/contiguous_split.hpp>
#include <cudf/table/table.hpp>

#include <rapidsmpf/buffer/buffer.hpp>
#include <rapidsmpf/communicator/communicator.hpp>
#include <rapidsmpf/shuffler/partition.hpp>

namespace rapidsmpf::shuffler::detail {
@@ -27,6 +30,7 @@
class Chunk {
public:
PartID const pid; ///< Partition ID that this chunk belongs to.

ChunkID const cid; ///< Unique ID of this chunk.

/// If not zero, the number of chunks of the partition expected to get from the
@@ -126,6 +130,18 @@
std::size_t max_nbytes = 512,
rmm::cuda_stream_view stream = cudf::get_default_stream()
) const;

/**
* @brief Returns true if the chunk is ready for consumption.
*
* Checks that the gpu_data's CUDA event is ready, if gpu_data contains a valid
* buffer. The CUDA event is used to synchronize the chunk's data to ensure
* any allocation or copy (e.g., spilling) is complete before the chunk is
* consumed.
*
* @return true if the chunk is ready, false otherwise.
*/
[[nodiscard]] bool is_ready() const;
};

/**
6 changes: 3 additions & 3 deletions cpp/include/rapidsmpf/shuffler/postbox.hpp
@@ -55,11 +55,11 @@
std::unordered_map<ChunkID, Chunk> extract(PartID pid);

/**
* @brief Extracts all chunks from the PostBox.
* @brief Extracts all ready chunks from the PostBox.
*
* @return A vector of all chunks in the PostBox.
* @return A vector of all ready chunks in the PostBox.
*/
std::vector<Chunk> extract_all();
std::vector<Chunk> extract_all_ready();

/**
* @brief Checks if the PostBox is empty.
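The rename from `extract_all()` to `extract_all_ready()` reflects the new behavior: only chunks whose CUDA event has completed leave the postbox; in-flight chunks stay for a later progress iteration. A runnable sketch of that filtering (with a hypothetical `ChunkSketch` standing in for `Chunk`):

```cpp
#include <cassert>
#include <vector>

// Hypothetical chunk: ready once its buffer's CUDA event has completed.
struct ChunkSketch {
    int id;
    bool ready;
    bool is_ready() const { return ready; }
};

// Sketch of extract_all_ready(): move out only the ready chunks, leaving
// in-flight chunks in the inbox for a later progress iteration.
std::vector<ChunkSketch> extract_all_ready(std::vector<ChunkSketch>& inbox) {
    std::vector<ChunkSketch> ready;
    for (auto it = inbox.begin(); it != inbox.end();) {
        if (it->is_ready()) {
            ready.push_back(*it);
            it = inbox.erase(it);  // extracted: remove from the postbox
        } else {
            ++it;  // still in flight: keep for next time
        }
    }
    return ready;
}
```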
8 changes: 6 additions & 2 deletions cpp/include/rapidsmpf/shuffler/shuffler.hpp
@@ -228,19 +228,23 @@
* @param pid The partition ID of the new chunk.
* @param metadata The metadata of the new chunk, can be null.
* @param gpu_data The gpu data of the new chunk, can be null.
* @param stream The CUDA stream for BufferResource memory operations.
* @param event The event to use for the new chunk.
*/
[[nodiscard]] detail::Chunk create_chunk(
PartID pid,
std::unique_ptr<std::vector<uint8_t>> metadata,
std::unique_ptr<rmm::device_buffer> gpu_data
std::unique_ptr<rmm::device_buffer> gpu_data,
rmm::cuda_stream_view stream,
std::shared_ptr<Buffer::Event> event
) {
return detail::Chunk{
pid,
get_new_cid(),
0, // expected_num_chunks
gpu_data ? gpu_data->size() : 0, // gpu_data_size
std::move(metadata),
br_->move(std::move(gpu_data))
br_->move(std::move(gpu_data), stream, event)
};
}

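The optional `std::shared_ptr<Buffer::Event>` threaded through `create_chunk()` and `BufferResource::move()` above lets several buffers produced on the same stream share one event, so a single recording (after the last operation) and a single query cover all of them. A sketch of that sharing, using hypothetical stand-in types:

```cpp
#include <cassert>
#include <memory>

// Hypothetical event: one recording shared by several buffers.
struct SharedEventSketch {
    bool done = false;
    bool is_ready() const { return done; }
};

// Sketch of a buffer holding an optional shared event. A buffer without an
// event (e.g., host memory, or no pending device work) is trivially ready.
struct BufferSketch {
    std::shared_ptr<SharedEventSketch> event;
    bool is_ready() const { return event == nullptr || event->is_ready(); }
};
```

Sharing one event also sidesteps the per-buffer event-creation cost when many chunks are produced from a single stream of work.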