Use CUDA events instead of CUDA device/stream synchronization #225

Merged
merged 56 commits into from May 5, 2025

Changes from 31 commits

Commits (56)
2c0dac4
Use CUDA events to track `Buffer` copy progress
pentschev Apr 23, 2025
1202c50
Test for copy completion
pentschev Apr 23, 2025
bc40a83
Throw exception if buffer copy didn't complete before `send()`
pentschev Apr 23, 2025
6954a34
Only send GPU data if copy is complete
pentschev Apr 23, 2025
c3b04e3
Prevent receiving on incomplete `Buffer` allocation
pentschev Apr 24, 2025
ecb9e2d
Use `BufferWithEvent`
pentschev Apr 24, 2025
fa236cb
Log destruction/release if allocation is not complete
pentschev Apr 24, 2025
e393aa0
Merge branch 'branch-25.06' into buffer-cuda-event
pentschev Apr 24, 2025
c23e13f
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 24, 2025
33015af
Disable event tracking instead of logger
pentschev Apr 24, 2025
2699326
Merge remote-tracking branch 'origin/buffer-cuda-event' into buffer-c…
pentschev Apr 24, 2025
23673ab
Replace incorrect use of `RAPIDSMPF_CUDA_TRY_ALLOC`
pentschev Apr 24, 2025
1180b8e
Create events with `cudaEventDisableTiming`
pentschev Apr 24, 2025
aa2ade6
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 24, 2025
7714a6f
Simplify condition
pentschev Apr 24, 2025
eded688
Make synchronization the user's responsibility before `send()`
pentschev Apr 24, 2025
8887137
Merge remote-tracking branch 'origin/buffer-cuda-event' into buffer-c…
pentschev Apr 24, 2025
0bac2a6
Fix style
pentschev Apr 24, 2025
5f04b7a
Chunk CUDA event
pentschev Apr 28, 2025
01b2717
Ensure ready-for-data messages are not lost
pentschev Apr 29, 2025
95cb163
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 29, 2025
42d2b9f
Prevent reinsertion
pentschev Apr 29, 2025
f019d8d
Ensure thread-safety of `Chunk::Event::is_done()`
pentschev Apr 30, 2025
0226f52
Revert "Ensure ready-for-data messages are not lost"
pentschev Apr 30, 2025
a56070c
Only extract chunks that are ready from inbox
pentschev Apr 30, 2025
aaad38c
Combine all event checks within `Chunk`
pentschev Apr 30, 2025
02b6c01
Remove redundant check while processing `outgoing_chunks_`
pentschev Apr 30, 2025
1f41e92
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 30, 2025
4ae9a4e
Provide CUDA event for all Buffer relevant cases, remove BufferWithEvent
pentschev Apr 30, 2025
63f819e
Construct `Buffer` pointers without destroying original object
pentschev Apr 30, 2025
71a8786
Improvements
pentschev Apr 30, 2025
3055053
Clarify use of streams
pentschev May 1, 2025
725c3aa
Simplify `is_ready()` condition
pentschev May 1, 2025
21f04a1
Avoid `Chunk::Event` use-after-free
pentschev May 1, 2025
11bac26
Move all CUDA events into `Buffer`
pentschev May 1, 2025
dfd6e70
Move `Event` definition to `buffer.hpp`
pentschev May 1, 2025
6795eff
Update `Chunk::is_ready()` docstring
pentschev May 1, 2025
dc1d898
Allow use of a shared `Buffer::Event`
pentschev May 1, 2025
b448c7e
Fix use-after-free in `Buffer::Event`
pentschev May 1, 2025
007a536
Prevent constness cast
pentschev May 1, 2025
8f6ecaf
Remove IncomingChunk
pentschev May 2, 2025
10fc871
Fix Event destruction (again)
pentschev May 2, 2025
e2d8c9c
Fix Event smart-pointer type
pentschev May 2, 2025
27f856c
Simplify `is_ready()` condition
pentschev May 2, 2025
640c2bf
Fixed docstring
pentschev May 2, 2025
40ebb2a
Fix build errors
pentschev May 2, 2025
3f65ea3
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev May 5, 2025
17c01e2
Simplify `Event` destructor to rely on smart-pointer for thread-safety
pentschev May 5, 2025
8627cfc
Improve docs
pentschev May 5, 2025
afd84a1
Typo fixes
pentschev May 5, 2025
cb0b159
Terminate if buffers are not ready for use
pentschev May 5, 2025
1f2f966
Merge remote-tracking branch 'origin/buffer-cuda-event' into buffer-c…
pentschev May 5, 2025
de3db36
Remove expected_num_chunks from Chunks constructor with gpu_data
pentschev May 5, 2025
294d5b9
Fix Chunk::is_ready condition
pentschev May 5, 2025
6a1ae03
Update condition and is_ready docstring
pentschev May 5, 2025
16c8b17
Check for gpu_data_size
pentschev May 5, 2025
30 changes: 27 additions & 3 deletions cpp/include/rapidsmpf/buffer/buffer.hpp
@@ -9,6 +9,8 @@
#include <variant>
#include <vector>

#include <cuda_runtime.h>

#include <rmm/device_buffer.hpp>

#include <rapidsmpf/error.hpp>
@@ -122,12 +124,27 @@ class Buffer {
);
}

/// @brief Buffer has a move ctor but no copy or assign operator.
Buffer(Buffer&&) = default;
/**
* @brief Check if the device memory operation has completed.
*
* @return true if the device memory operation has completed or no device
* memory operation was performed, false if it is still in progress.
*/
[[nodiscard]] bool is_ready() const;

/// @brief Delete move and copy constructors and assignment operators.
Buffer(Buffer&&) = delete;
Buffer(Buffer const&) = delete;
Buffer& operator=(Buffer& o) = delete;
Buffer& operator=(Buffer&& o) = delete;

/**
* @brief Destructor for Buffer.
*
* Cleans up any allocated resources.
*/
~Buffer();

private:
/**
* @brief Construct a Buffer from host memory.
@@ -143,13 +160,18 @@
* @brief Construct a Buffer from device memory.
*
* @param device_buffer A unique pointer to a device buffer.
* @param stream CUDA stream for the operation.
Contributor:

There doesn't appear to be any "operation" here, so can you explain in the docstring what this stream is used for?

Member Author:

I tried to clarify the use of the stream in 3055053; I'm not sure that is satisfactory, to be honest. Please have a look and tell me what you think.

* @param br Buffer resource for memory allocation.
*
* @throws std::invalid_argument if `device_buffer` is null.
* @throws std::invalid_argument if `stream` or `br->mr` isn't the same as the one
* used by `device_buffer`.
*/
Buffer(std::unique_ptr<rmm::device_buffer> device_buffer, BufferResource* br);
Buffer(
std::unique_ptr<rmm::device_buffer> device_buffer,
rmm::cuda_stream_view stream,
BufferResource* br
);

/**
* @brief Access the underlying host memory buffer.
@@ -208,6 +230,8 @@ class Buffer {
/// @brief The underlying storage host memory or device memory buffer (where
/// applicable).
StorageT storage_;
/// @brief CUDA event used to track copy operations
cudaEvent_t cuda_event_;
Contributor:

I'm not sure I really like this design. This produces one event per buffer that we create whereas really we only need one event per stream that we see.

Very minimally, we should definitely create the events with cudaEventDisableTiming so they are as lightweight as possible.

Member Author:

> I'm not sure I really like this design. This produces one event per buffer that we create whereas really we only need one event per stream that we see.

That would only shift what we need to track, wouldn't it? If we had one event per stream, how would we know if a buffer was created before or after the event? From the check's perspective it could have happened either before or after and we would go back to have potentially invalid memory accesses.

> Very minimally, we should definitely create the events with cudaEventDisableTiming so they are as lightweight as possible.

Thanks for the suggestion, I'll do that.

Contributor:

cudaEventRecord records the state of a stream (i.e. any outstanding work on the stream is "noted") and then cudaEventSynchronize waits for completion of all that work. So:

do_some_allocation(stream)
cudaEventRecord(event, stream)
...
cudaEventSynchronize(event)
# allocation guaranteed to have completed

no?

Member Author:

Yes, and this is why we have one event per buffer. If we had one event per stream we could end up with:

buf1 = do_some_allocation(stream)
cudaEventRecord(event, stream)
buf2 = do_some_allocation(stream)
cudaEventSynchronize(event)
# buf1 is guaranteed to have completed, but buf2 isn't

That is what I don't think will work, or are you suggesting something different and I misunderstood the suggestion?

Member Author:

> Very minimally, we should definitely create the events with cudaEventDisableTiming so they are as lightweight as possible.

> Thanks for the suggestion, I'll do that.

This is now done in 1180b8e.

Contributor:

buf1 = do_some_allocation(stream)
cudaEventRecord(event, stream)
buf2 = do_some_allocation(stream)
cudaEventRecord(event, stream)
cudaEventSynchronize(event)
# now buf1 and buf2 are guaranteed completed

Member Author:

But in that case aren't we just blocking until all buffers are completed? This would potentially prevent us from progressing buf1 until buf2 has completed too, which could increase memory pressure, so I'm not really sure the benefits outweigh the costs.

Member Author:

An additional complication is that you would also need to keep track of the streams and events globally, meaning we probably wouldn't be able to take an arbitrary Buffer for which we don't know on which stream it's being allocated/copied, nor where that stream's event is located. IOW, we would need some sort of manager visible everywhere to track streams and events.
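To make the pattern concrete, here is a minimal standalone sketch of the per-buffer approach this thread converges on; record_event and buffer_ready are illustrative names, not the PR's API:

#include <cuda_runtime.h>

// Record all work currently queued on `stream` into a fresh, lightweight
// (timing-disabled) event; the PR keeps one such event per Buffer.
cudaEvent_t record_event(cudaStream_t stream) {
    cudaEvent_t event;
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
    cudaEventRecord(event, stream);
    return event;
}

// Non-blocking readiness check: cudaEventQuery returns cudaSuccess once all
// work captured by the record has completed. Because each buffer records its
// own event, buf1 can become ready while buf2, recorded later on the same
// stream, is still pending.
bool buffer_ready(cudaEvent_t event) {
    return cudaEventQuery(event) == cudaSuccess;
}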

};

} // namespace rapidsmpf
5 changes: 4 additions & 1 deletion cpp/include/rapidsmpf/buffer/resource.hpp
@@ -256,9 +256,12 @@ class BufferResource {
* @brief Move device buffer data into a Buffer.
*
* @param data A unique pointer to the device buffer.
* @param stream CUDA stream for the operation.
Contributor:

What "operation" is being performed such that a stream is required?

Member Author:

As above, I tried to clarify it in 3055053; let me know if you think we should make it even clearer.

* @return A unique pointer to the resulting Buffer.
*/
std::unique_ptr<Buffer> move(std::unique_ptr<rmm::device_buffer> data);
std::unique_ptr<Buffer> move(
std::unique_ptr<rmm::device_buffer> data, rmm::cuda_stream_view stream
);

/**
* @brief Move a Buffer to the specified memory type.
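A hypothetical caller of the new overload might look like this (a sketch only; make_buffer and the 1 KiB size are illustrative, and a valid BufferResource* is assumed):

#include <memory>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

// Sketch: allocate 1 KiB on `stream` and hand ownership to the BufferResource.
std::unique_ptr<rapidsmpf::Buffer> make_buffer(
    rapidsmpf::BufferResource* br, rmm::cuda_stream_view stream
) {
    auto dev = std::make_unique<rmm::device_buffer>(1024, stream);
    // The returned Buffer records a CUDA event on `stream`; its is_ready()
    // reports true once the asynchronous allocation work completes.
    return br->move(std::move(dev), stream);
}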
10 changes: 10 additions & 0 deletions cpp/include/rapidsmpf/communicator/communicator.hpp
@@ -426,6 +426,11 @@ class Communicator {
* @param rank The destination rank.
* @param tag Message tag for identification.
* @return A unique pointer to a `Future` representing the asynchronous operation.
*
* @warning The caller is responsible for ensuring the underlying `Buffer` allocation
* and data are already valid before calling, for example when a CUDA allocation
* and/or copy is done asynchronously. Specifically, the caller should ensure
* `Buffer::is_ready()` returns true before calling this function.
*/
[[nodiscard]] virtual std::unique_ptr<Future> send(
std::unique_ptr<Buffer> msg, Rank rank, Tag tag
@@ -438,6 +443,11 @@
* @param tag Message tag for identification.
* @param recv_buffer The receive buffer.
* @return A unique pointer to a `Future` representing the asynchronous operation.
*
* @warning The caller is responsible for ensuring the underlying `Buffer` allocation
* is already valid before calling, for example when a CUDA allocation
* and/or copy is done asynchronously. Specifically, the caller should ensure
* `Buffer::is_ready()` returns true before calling this function.
Member:

Let's throw an exception if Buffer::is_ready() == false?

Member Author:

No, unfortunately that doesn't work, as @nirandaperera has noted previously in #225 (comment). Doing that means we lose the std::unique_ptr<Buffer>.

Member (@madsbk, May 5, 2025):

Still, I think we should check Buffer::is_ready() == false. It might be unrecoverable but still better than a segfault. But let's make it clear in the doc that the buffer has been moved and freed!

Member Author:

How about a warning instead? Note that the exception will be raised in the shuffler's progress thread, so we'll probably need to handle it in some way. I know a warning is not a solution to a potential segfault, but I think it will more clearly inform the user about what happened, rather than an exception which may not be handled correctly.


Member Author:

That works too. I would suggest that we do this now and add a new ABORT log-level that both logs an error and immediately terminates. WDYT?

Member:

Sounds good!

Member Author:

Done in cb0b159 and opened #246 to track this as well.

*/
[[nodiscard]] virtual std::unique_ptr<Future> recv(
Rank rank, Tag tag, std::unique_ptr<Buffer> recv_buffer
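The caller-side contract from the warnings above, sketched as a helper (the polling loop and send_when_ready are illustrative, not part of the library; a real caller would check readiness inside its progress loop rather than busy-wait):

#include <memory>
#include <thread>

// Sketch: only hand the buffer to the communicator once its CUDA event
// has fired, satisfying the @warning on send()/recv() above.
std::unique_ptr<rapidsmpf::Communicator::Future> send_when_ready(
    rapidsmpf::Communicator& comm,
    std::unique_ptr<rapidsmpf::Buffer> buf,
    rapidsmpf::Rank rank,
    rapidsmpf::Tag tag
) {
    while (!buf->is_ready()) {      // non-blocking cudaEventQuery underneath
        std::this_thread::yield();  // let the async allocation/copy finish
    }
    return comm.send(std::move(buf), rank, tag);
}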
61 changes: 60 additions & 1 deletion cpp/include/rapidsmpf/shuffler/chunk.hpp
@@ -4,6 +4,7 @@
*/
#pragma once

#include <atomic>
#include <memory>
#include <sstream>
#include <vector>
@@ -12,6 +13,7 @@
#include <cudf/table/table.hpp>

#include <rapidsmpf/buffer/buffer.hpp>
#include <rapidsmpf/communicator/communicator.hpp>
#include <rapidsmpf/shuffler/partition.hpp>

namespace rapidsmpf::shuffler::detail {
@@ -26,7 +28,47 @@ using ChunkID = std::uint64_t;
*/
class Chunk {
public:
/**
* @brief CUDA event to provide synchronization among a set of chunks.
*
* This event serves as a synchronization point for a set of chunks
* given a user-specified stream.
Contributor:

I think we should promote this class to an object that is usable by the Buffer as well, rather than having two different ways of managing events.

Member Author:

Done as part of 11bac26 as well.

*/
class Event {
public:
/**
* @brief Construct a CUDA event for a given stream.
*
* @param stream CUDA stream used for device memory operations.
* @param log Logger to warn if object is destroyed before event is ready.
*/
Event(rmm::cuda_stream_view stream, Communicator::Logger& log);

/**
* @brief Destructor for Event.
*
* Cleans up the CUDA event if one was created. If the event is not done,
* it will log a warning.
*/
~Event();

/**
* @brief Check if the CUDA event has been completed.
*
* @return true if the event has been completed, false otherwise.
*/
[[nodiscard]] bool is_ready();

private:
cudaEvent_t event_;  ///< CUDA event used to track device memory allocation.
Communicator::Logger& log_;  ///< Logger to warn if object is destroyed before event is ready.
std::atomic<bool> done_{false};  ///< Cache of the event status to avoid unnecessary queries.
};

PartID const pid; ///< Partition ID that this chunk belongs to.

ChunkID const cid; ///< Unique ID of this chunk.

/// If not zero, the number of chunks of the partition expected to get from the
@@ -42,6 +84,9 @@ class Chunk {
/// GPU data buffer of the packed `cudf::table` associated with this chunk.
std::unique_ptr<Buffer> gpu_data;

/// CUDA event to provide synchronization among a set of chunks.
std::shared_ptr<Event> event;

/**
* @brief Construct a new chunk of a partition.
*
@@ -54,14 +99,16 @@
* chunk.
* @param gpu_data The gpu_data of the packed `cudf::table` that makes up this
* chunk.
* @param event CUDA event to provide synchronization among a set of chunks.
*/
Chunk(
PartID pid,
ChunkID cid,
std::size_t expected_num_chunks,
std::size_t gpu_data_size,
std::unique_ptr<std::vector<uint8_t>> metadata,
std::unique_ptr<Buffer> gpu_data
std::unique_ptr<Buffer> gpu_data,
std::shared_ptr<Event> event
);

/**
@@ -126,6 +173,18 @@ class Chunk {
std::size_t max_nbytes = 512,
rmm::cuda_stream_view stream = cudf::get_default_stream()
) const;

/**
* @brief Returns true if the chunk is ready for consumption.
*
* Checks that the shared CUDA event and the buffer's CUDA event are both ready.
* The shared CUDA event is used to synchronize the chunk's data across a set of
* chunks, while the buffer's CUDA event is used to synchronize the chunk's data
* if any spilling is involved.
*
* @return true if the chunk is ready, false otherwise.
*/
[[nodiscard]] bool is_ready() const;
};

/**
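The combined check described in the docstring might read roughly as follows; this is an assumed sketch of the stated logic, not the PR's exact implementation:

// Assumed sketch: a chunk is consumable only once both the shared event
// (if any) and the buffer's own event (if any) have completed.
bool Chunk::is_ready() const {
    bool const shared_ready = (event == nullptr) || event->is_ready();
    bool const buffer_ready = (gpu_data == nullptr) || gpu_data->is_ready();
    return shared_ready && buffer_ready;
}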
6 changes: 3 additions & 3 deletions cpp/include/rapidsmpf/shuffler/postbox.hpp
@@ -55,11 +55,11 @@ class PostBox {
std::unordered_map<ChunkID, Chunk> extract(PartID pid);

/**
* @brief Extracts all chunks from the PostBox.
* @brief Extracts all ready chunks from the PostBox.
*
* @return A vector of all chunks in the PostBox.
* @return A vector of all ready chunks in the PostBox.
*/
std::vector<Chunk> extract_all();
std::vector<Chunk> extract_all_ready();

/**
* @brief Checks if the PostBox is empty.
9 changes: 7 additions & 2 deletions cpp/include/rapidsmpf/shuffler/shuffler.hpp
@@ -228,19 +228,24 @@ class Shuffler {
* @param pid The partition ID of the new chunk.
* @param metadata The metadata of the new chunk, can be null.
* @param gpu_data The gpu data of the new chunk, can be null.
* @param stream The CUDA stream for BufferResource memory operations.
* @param event The event to use for the new chunk.
*/
[[nodiscard]] detail::Chunk create_chunk(
PartID pid,
std::unique_ptr<std::vector<uint8_t>> metadata,
std::unique_ptr<rmm::device_buffer> gpu_data
std::unique_ptr<rmm::device_buffer> gpu_data,
rmm::cuda_stream_view stream,
std::shared_ptr<detail::Chunk::Event> event
) {
return detail::Chunk{
pid,
get_new_cid(),
0, // expected_num_chunks
gpu_data ? gpu_data->size() : 0, // gpu_data_size
std::move(metadata),
br_->move(std::move(gpu_data))
br_->move(std::move(gpu_data), stream),
std::move(event)
};
}

59 changes: 53 additions & 6 deletions cpp/src/buffer/buffer.cpp
@@ -4,6 +4,8 @@
*/
#include <stdexcept>

#include <cuda_runtime.h>

#include <rapidsmpf/buffer/buffer.hpp>
#include <rapidsmpf/buffer/resource.hpp>

@@ -16,28 +18,48 @@ template <typename T>
RAPIDSMPF_EXPECTS(ptr, "unique pointer cannot be null", std::invalid_argument);
return ptr;
}

// Helper to create and record a CUDA event
cudaEvent_t create_and_record_event(rmm::cuda_stream_view stream) {
cudaEvent_t event;
RAPIDSMPF_CUDA_TRY(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
RAPIDSMPF_CUDA_TRY(cudaEventRecord(event, stream));
return event;
}
Contributor:

I still think we can make this simpler by either only storing an event in the Chunk (which we can think of as a Buffer + metadata) or in the Buffer.

I think I am inclined to go with storing it in the Buffer.

That is, a Buffer holds a std::shared_ptr<Event> (that may be null); on insertion of chunks, this event is either the shared event or the copy-specific event.

Then we only need to remember to check the event on the buffer.

If we start using an internal stream for copies during spilling then we should use this pattern:

insert(chunks..., stream) {
   auto event = std::make_shared<Event>();
   event->record(stream); 

   for (chunk in chunks) {
       if (must_spill(chunk)) {
           cudaStreamWaitEvent(internal_stream, event); // this doesn't block, it just sets up dependencies
           spill_chunk_on(internal_stream);
           auto nevent = std::make_shared<Event>();
           nevent->record(internal_stream);
           chunk.buffer.set_event(std::move(nevent));
       } else {
           chunk.buffer.set_event(event);
       }
   }
}

Now we only need to check chunk.buffer.is_ready() (or buffer.is_ready() if we don't have a chunk).

Member Author:

I've made those changes now in 11bac26, and I do agree we don't need the event in Chunk; it is redundant since we now create a Buffer::Event for each chunk in insert(), so relying on the individual ones makes sense. However, as it is we lose the ability to log if the event is not ready upon destruction. I started making changes to pass the logger through in a71055a, but it requires an enormous amount of API changes, and it then raises a lot of warnings in tests because we indeed never check that allocation/copy completes; arguably that should be fixed, but I don't think we should do it all in this PR.

Contributor:

No, I think that logging change can't be right. If we want to log things, we should make the event construction take a reference to a logger (optional, I suppose), and we should split the event creation from the allocation; then we don't need to make all these API changes, I hope.

Member Author:

That would technically work but I don't see the point in any of that. A logger whose purpose is to warn the user when he did something wrong being optional is completely pointless; we either have it there always or there's really no need for it. Also, transferring the responsibility to properly create an Event back to the user just opens another door for errors; we are again exchanging reliability for the ability to warn the user that what he's done is unreliable.

Contributor:

The user in this scenario is another developer of the library, I think.

I suppose with respect to event creation it's a philosophical difference. I prefer that at object creation the event we're depending on is explicit, while in methods on the object the event is implicit (so you shouldn't need to provide an event when you do a copy, because the API should manage that for you); but when you create an object that depends on an event explicitly (e.g. the Buffer ctor, or the BufferResource::move construction from device_buffers), then you should provide the event. I suppose the other thing you can do is provide the stream that captures the work, which is kind of the same thing, but you may get some false dependencies if someone else submitted some work to the stream before you get to record your event.
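For reference, the non-blocking dependency used in the insert() sketch above can be expressed in plain CUDA like this (a sketch; producer, consumer, and chain_streams are illustrative names):

#include <cuda_runtime.h>

// All work submitted to `consumer` after this call waits for the work
// already recorded on `producer`, without blocking the host thread.
void chain_streams(cudaStream_t producer, cudaStream_t consumer) {
    cudaEvent_t event;
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
    cudaEventRecord(event, producer);         // capture queued producer work
    cudaStreamWaitEvent(consumer, event, 0);  // dependency only, no host block
    cudaEventDestroy(event);                  // destruction is safely deferred
}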

} // namespace

Buffer::Buffer(std::unique_ptr<std::vector<uint8_t>> host_buffer, BufferResource* br)
: br{br},
size{host_buffer ? host_buffer->size() : 0},
storage_{std::move(host_buffer)} {
storage_{std::move(host_buffer)},
cuda_event_{nullptr} {
RAPIDSMPF_EXPECTS(
std::get<HostStorageT>(storage_) != nullptr, "the host_buffer cannot be NULL"
);
RAPIDSMPF_EXPECTS(br != nullptr, "the BufferResource cannot be NULL");
}

Buffer::Buffer(std::unique_ptr<rmm::device_buffer> device_buffer, BufferResource* br)
Buffer::Buffer(
std::unique_ptr<rmm::device_buffer> device_buffer,
rmm::cuda_stream_view stream,
BufferResource* br
)
: br{br},
size{device_buffer ? device_buffer->size() : 0},
storage_{std::move(device_buffer)} {
storage_{std::move(device_buffer)},
cuda_event_{create_and_record_event(stream)} {
RAPIDSMPF_EXPECTS(
std::get<DeviceStorageT>(storage_) != nullptr, "the device buffer cannot be NULL"
);
RAPIDSMPF_EXPECTS(br != nullptr, "the BufferResource cannot be NULL");
}

Buffer::~Buffer() {
if (cuda_event_ != nullptr) {
cudaEventDestroy(cuda_event_);
}
}

void* Buffer::data() {
return std::visit([](auto&& storage) -> void* { return storage->data(); }, storage_);
}
@@ -55,12 +77,14 @@ std::unique_ptr<Buffer> Buffer::copy(rmm::cuda_stream_view stream) const {
);
},
[&](const DeviceStorageT& storage) -> std::unique_ptr<Buffer> {
return std::unique_ptr<Buffer>(new Buffer{
auto new_buffer = std::unique_ptr<Buffer>(new Buffer{
std::make_unique<rmm::device_buffer>(
storage->data(), storage->size(), stream, br->device_mr()
),
stream,
br
});
return new_buffer;
}
},
storage_
@@ -76,12 +100,14 @@ std::unique_ptr<Buffer> Buffer::copy(MemoryType target, rmm::cuda_stream_view st
return std::visit(
overloaded{
[&](const HostStorageT& storage) -> std::unique_ptr<Buffer> {
return std::unique_ptr<Buffer>(new Buffer{
auto new_buffer = std::unique_ptr<Buffer>(new Buffer{
std::make_unique<rmm::device_buffer>(
storage->data(), storage->size(), stream, br->device_mr()
),
stream,
br
});
return new_buffer;
},
[&](const DeviceStorageT& storage) -> std::unique_ptr<Buffer> {
auto ret = std::make_unique<std::vector<uint8_t>>(storage->size());
@@ -92,11 +118,32 @@ std::unique_ptr<Buffer> Buffer::copy(MemoryType target, rmm::cuda_stream_view st
cudaMemcpyDeviceToHost,
stream
));
return std::unique_ptr<Buffer>(new Buffer{std::move(ret), br});
auto new_buffer = std::unique_ptr<Buffer>(new Buffer{std::move(ret), br});

// The event is created here instead of in the constructor because the
// memcpy is async even though the destination buffer is host memory.
new_buffer->cuda_event_ = create_and_record_event(stream);

return new_buffer;
}
},
storage_
);
}

bool Buffer::is_ready() const {
if (cuda_event_ == nullptr) {
return true; // No device memory operation was performed
}
cudaError_t status = cudaEventQuery(cuda_event_);
if (status == cudaSuccess) {
return true;
} else if (status == cudaErrorNotReady) {
return false;
} else {
RAPIDSMPF_CUDA_TRY(status);
return false; // This line is unreachable due to the throw above
}
}

} // namespace rapidsmpf