Use CUDA events instead of CUDA device/stream synchronization #225

Merged

56 commits merged on May 5, 2025

Changes from 14 commits

Commits (56)
2c0dac4
Use CUDA events to track `Buffer` copy progress
pentschev Apr 23, 2025
1202c50
Test for copy completion
pentschev Apr 23, 2025
bc40a83
Throw exception if buffer copy didn't complete before `send()`
pentschev Apr 23, 2025
6954a34
Only send GPU data if copy is complete
pentschev Apr 23, 2025
c3b04e3
Prevent receiving on incomplete `Buffer` allocation
pentschev Apr 24, 2025
ecb9e2d
Use `BufferWithEvent`
pentschev Apr 24, 2025
fa236cb
Log destruction/release if allocation is not complete
pentschev Apr 24, 2025
e393aa0
Merge branch 'branch-25.06' into buffer-cuda-event
pentschev Apr 24, 2025
c23e13f
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 24, 2025
33015af
Disable event tracking instead of logger
pentschev Apr 24, 2025
2699326
Merge remote-tracking branch 'origin/buffer-cuda-event' into buffer-c…
pentschev Apr 24, 2025
23673ab
Replace incorrect use of `RAPIDSMPF_CUDA_TRY_ALLOC`
pentschev Apr 24, 2025
1180b8e
Create events with `cudaEventDisableTiming`
pentschev Apr 24, 2025
aa2ade6
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 24, 2025
7714a6f
Simplify condition
pentschev Apr 24, 2025
eded688
Make synchronization the user's responsibility before `send()`
pentschev Apr 24, 2025
8887137
Merge remote-tracking branch 'origin/buffer-cuda-event' into buffer-c…
pentschev Apr 24, 2025
0bac2a6
Fix style
pentschev Apr 24, 2025
5f04b7a
Chunk CUDA event
pentschev Apr 28, 2025
01b2717
Ensure ready-for-data messages are not lost
pentschev Apr 29, 2025
95cb163
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 29, 2025
42d2b9f
Prevent reinsertion
pentschev Apr 29, 2025
f019d8d
Ensure thread-safety of `Chunk::Event::is_done()`
pentschev Apr 30, 2025
0226f52
Revert "Ensure ready-for-data messages are not lost"
pentschev Apr 30, 2025
a56070c
Only extract chunks that are ready from inbox
pentschev Apr 30, 2025
aaad38c
Combine all event checks within `Chunk`
pentschev Apr 30, 2025
02b6c01
Remove redundant check while processing `outgoing_chunks_`
pentschev Apr 30, 2025
1f41e92
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev Apr 30, 2025
4ae9a4e
Provide CUDA event for all Buffer relevant cases, remove BufferWithEvent
pentschev Apr 30, 2025
63f819e
Construct `Buffer` pointers without destroying original object
pentschev Apr 30, 2025
71a8786
Improvements
pentschev Apr 30, 2025
3055053
Clarify use of streams
pentschev May 1, 2025
725c3aa
Simplify `is_ready()` condition
pentschev May 1, 2025
21f04a1
Avoid `Chunk::Event` use-after-free
pentschev May 1, 2025
11bac26
Move all CUDA events into `Buffer`
pentschev May 1, 2025
dfd6e70
Move `Event` definition to `buffer.hpp`
pentschev May 1, 2025
6795eff
Update `Chunk::is_ready()` docstring
pentschev May 1, 2025
dc1d898
Allow use of a shared `Buffer::Event`
pentschev May 1, 2025
b448c7e
Fix use-after-free in `Buffer::Event`
pentschev May 1, 2025
007a536
Prevent constness cast
pentschev May 1, 2025
8f6ecaf
Remove IncomingChunk
pentschev May 2, 2025
10fc871
Fix Event destruction (again)
pentschev May 2, 2025
e2d8c9c
Fix Event smart-pointer type
pentschev May 2, 2025
27f856c
Simplify `is_ready()` condition
pentschev May 2, 2025
640c2bf
Fixed docstring
pentschev May 2, 2025
40ebb2a
Fix build errors
pentschev May 2, 2025
3f65ea3
Merge remote-tracking branch 'upstream/branch-25.06' into buffer-cuda…
pentschev May 5, 2025
17c01e2
Simplify `Event` destructor to rely on smart-pointer for thread-safety
pentschev May 5, 2025
8627cfc
Improve docs
pentschev May 5, 2025
afd84a1
Typo fixes
pentschev May 5, 2025
cb0b159
Terminate if buffers are not ready for use
pentschev May 5, 2025
1f2f966
Merge remote-tracking branch 'origin/buffer-cuda-event' into buffer-c…
pentschev May 5, 2025
de3db36
Remove expected_num_chunks from Chunks constructor with gpu_data
pentschev May 5, 2025
294d5b9
Fix Chunk::is_ready condition
pentschev May 5, 2025
6a1ae03
Update condition and is_ready docstring
pentschev May 5, 2025
16c8b17
Check for gpu_data_size
pentschev May 5, 2025
19 changes: 19 additions & 0 deletions cpp/include/rapidsmpf/buffer/buffer.hpp
@@ -9,6 +9,8 @@
#include <variant>
#include <vector>

#include <cuda_runtime.h>

#include <rmm/device_buffer.hpp>

#include <rapidsmpf/error.hpp>
@@ -122,12 +124,27 @@ class Buffer {
);
}

/**
* @brief Check if the last copy operation has completed.
*
* @return true if the copy operation has completed or no copy operation
* was performed, false if it is still in progress.
*/
[[nodiscard]] bool is_copy_complete() const;
Contributor:

nit: "copy" is, I think, the wrong phrasing. I think you mean "has any stream-ordered work to allocate this buffer completed".

Member Author:

Well, this only applies to Buffer::copy at the moment; making this more general would IMO imply that it is also relevant for the constructor that just allocates without copies.
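
For reference, a minimal standalone sketch of the kind of non-blocking check is_copy_complete() describes, using only the CUDA runtime API; the helper name event_is_done is illustrative and not part of this PR.

#include <stdexcept>

#include <cuda_runtime.h>

// Illustrative helper (not the PR's code): report whether all work recorded
// into `event` has finished, without blocking the calling thread.
bool event_is_done(cudaEvent_t event) {
    if (event == nullptr) {
        return true;  // no stream-ordered work was ever recorded
    }
    cudaError_t const status = cudaEventQuery(event);
    if (status == cudaSuccess) {
        return true;  // the recorded copy has completed
    }
    if (status == cudaErrorNotReady) {
        return false;  // work is still in flight on the stream
    }
    throw std::runtime_error(cudaGetErrorString(status));  // genuine CUDA error
}

A buffer that never recorded an event (e.g. one constructed directly from host memory) reports true, matching the docstring above.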


/// @brief Buffer has a move ctor but no copy or assign operator.
Buffer(Buffer&&) = default;
Buffer(Buffer const&) = delete;
Buffer& operator=(Buffer& o) = delete;
Buffer& operator=(Buffer&& o) = delete;

/**
* @brief Destructor for Buffer.
*
* Cleans up any allocated resources.
*/
~Buffer();

private:
/**
* @brief Construct a Buffer from host memory.
@@ -208,6 +225,8 @@ class Buffer {
/// @brief The underlying storage host memory or device memory buffer (where
/// applicable).
StorageT storage_;
/// @brief CUDA event used to track copy operations
cudaEvent_t cuda_event_;
Contributor:

I'm not sure I really like this design. This produces one event per buffer that we create whereas really we only need one event per stream that we see.

Very minimally, we should definitely create the events with cudaEventDisableTiming so they are as lightweight as possible.

Member Author:

> I'm not sure I really like this design. This produces one event per buffer that we create whereas really we only need one event per stream that we see.

That would only shift what we need to track, wouldn't it? If we had one event per stream, how would we know if a buffer was created before or after the event? From the check's perspective it could have happened either before or after and we would go back to have potentially invalid memory accesses.

> Very minimally, we should definitely create the events with cudaEventDisableTiming so they are as lightweight as possible.

Thanks for the suggestion, I'll do that.

Contributor:

cudaEventRecord records the state of a stream (i.e. any outstanding work on the stream is "noted") and then cudaEventWait waits for completion of all that work. So:

do_some_allocation(stream)

cudaEventRecord(event, stream)

....
cudaEventWait(event)

# allocation guaranteed to have completed

no?

Member Author:

Yes, and this is why we have one event per buffer. If we had one event per stream we could end up with:

buf1 = do_some_allocation(stream)

cudaEventRecord(event, stream)

buf2 = do_some_allocation(stream)

cudaEventWait(event)

# buf1 is guaranteed to have completed, but buf2 isn't

That is what I don't think will work, or are you suggesting something different and I misunderstood the suggestion?

Member Author:

> Very minimally, we should definitely create the events with cudaEventDisableTiming so they are as lightweight as possible.
>
> Thanks for the suggestion, I'll do that.

This is now done in 1180b8e.

Contributor:

buf1 = do_some_allocation(stream)

cudaEventRecord(event, stream)

buf2 = do_some_allocation(stream)

cudaEventRecord(event, stream)
cudaEventWait(event)

# now buf1 and buf2 are guaranteed completed

Member Author:

But in that case aren't we just blocking until all buffers are completed? This would potentially prevent us from progressing buf1 until buf2 has completed too, which could increase memory pressure, so I'm not really sure the benefits outweigh the costs.

Member Author:

An additional complication is that you would also need to keep track of the streams and events globally, meaning we probably couldn't handle an arbitrary Buffer when we don't know which stream it is being allocated/copied on, nor where that stream's event is located. IOW, we would need some sort of manager visible everywhere to track streams and events.
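
To make the per-buffer design discussed above concrete, a minimal sketch follows. The TrackedBuffer type and its method names are illustrative, not the PR's Buffer, and error checking of the CUDA calls is omitted; each buffer records its own timing-disabled event immediately after its copy, so buffers on the same stream can be checked independently.

#include <cuda_runtime.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

// Illustrative type (not the PR's Buffer): one lightweight event per buffer.
struct TrackedBuffer {
    rmm::device_buffer data;
    cudaEvent_t event{nullptr};

    // Record the stream state right after the copy that fills `data`.
    void record(rmm::cuda_stream_view stream) {
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
        cudaEventRecord(event, stream.value());
    }

    // Non-blocking per-buffer check: buf1 can be sent as soon as its own
    // copy finishes, even if buf2's copy on the same stream is still running.
    bool is_ready() const {
        return event == nullptr || cudaEventQuery(event) == cudaSuccess;
    }

    ~TrackedBuffer() {
        if (event != nullptr) {
            cudaEventDestroy(event);
        }
    }
};

With a single shared per-stream event, the check for buf1 would instead have to wait until everything recorded later on the stream (including buf2's copy) had finished — the memory-pressure concern raised above.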

};

} // namespace rapidsmpf
2 changes: 2 additions & 0 deletions cpp/include/rapidsmpf/communicator/communicator.hpp
@@ -426,6 +426,8 @@ class Communicator {
* @param rank The destination rank.
* @param tag Message tag for identification.
* @return A unique pointer to a `Future` representing the asynchronous operation.
*
* @throw std::logic_error if the buffer copy is not complete yet.
*/
[[nodiscard]] virtual std::unique_ptr<Future> send(
std::unique_ptr<Buffer> msg, Rank rank, Tag tag
47 changes: 42 additions & 5 deletions cpp/src/buffer/buffer.cpp
@@ -4,6 +4,8 @@
*/
#include <stdexcept>

#include <cuda_runtime.h>

#include <rapidsmpf/buffer/buffer.hpp>
#include <rapidsmpf/buffer/resource.hpp>

@@ -16,12 +18,19 @@ template <typename T>
RAPIDSMPF_EXPECTS(ptr, "unique pointer cannot be null", std::invalid_argument);
return ptr;
}

// Helper to create and record a CUDA event
void create_and_record_event(cudaEvent_t& event, rmm::cuda_stream_view stream) {
RAPIDSMPF_CUDA_TRY(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
RAPIDSMPF_CUDA_TRY(cudaEventRecord(event, stream));
}
} // namespace

Buffer::Buffer(std::unique_ptr<std::vector<uint8_t>> host_buffer, BufferResource* br)
: br{br},
size{host_buffer ? host_buffer->size() : 0},
storage_{std::move(host_buffer)} {
storage_{std::move(host_buffer)},
cuda_event_{nullptr} {
RAPIDSMPF_EXPECTS(
std::get<HostStorageT>(storage_) != nullptr, "the host_buffer cannot be NULL"
);
@@ -31,13 +40,20 @@ Buffer::Buffer(std::unique_ptr<std::vector<uint8_t>> host_buffer, BufferResource
Buffer::Buffer(std::unique_ptr<rmm::device_buffer> device_buffer, BufferResource* br)
: br{br},
size{device_buffer ? device_buffer->size() : 0},
storage_{std::move(device_buffer)} {
storage_{std::move(device_buffer)},
cuda_event_{nullptr} {
RAPIDSMPF_EXPECTS(
std::get<DeviceStorageT>(storage_) != nullptr, "the device buffer cannot be NULL"
);
RAPIDSMPF_EXPECTS(br != nullptr, "the BufferResource cannot be NULL");
}

Buffer::~Buffer() {
if (cuda_event_ != nullptr) {
cudaEventDestroy(cuda_event_);
}
}

void* Buffer::data() {
return std::visit([](auto&& storage) -> void* { return storage->data(); }, storage_);
}
@@ -55,12 +71,14 @@ std::unique_ptr<Buffer> Buffer::copy(rmm::cuda_stream_view stream) const {
);
},
[&](const DeviceStorageT& storage) -> std::unique_ptr<Buffer> {
return std::unique_ptr<Buffer>(new Buffer{
auto new_buffer = std::unique_ptr<Buffer>(new Buffer{
std::make_unique<rmm::device_buffer>(
storage->data(), storage->size(), stream, br->device_mr()
),
br
});
create_and_record_event(new_buffer->cuda_event_, stream);
return new_buffer;
}
},
storage_
@@ -76,12 +94,14 @@ std::unique_ptr<Buffer> Buffer::copy(MemoryType target, rmm::cuda_stream_view st
return std::visit(
overloaded{
[&](const HostStorageT& storage) -> std::unique_ptr<Buffer> {
return std::unique_ptr<Buffer>(new Buffer{
auto new_buffer = std::unique_ptr<Buffer>(new Buffer{
std::make_unique<rmm::device_buffer>(
storage->data(), storage->size(), stream, br->device_mr()
),
br
});
create_and_record_event(new_buffer->cuda_event_, stream);
return new_buffer;
},
[&](const DeviceStorageT& storage) -> std::unique_ptr<Buffer> {
auto ret = std::make_unique<std::vector<uint8_t>>(storage->size());
@@ -92,11 +112,28 @@ std::unique_ptr<Buffer> Buffer::copy(MemoryType target, rmm::cuda_stream_view st
cudaMemcpyDeviceToHost,
stream
));
return std::unique_ptr<Buffer>(new Buffer{std::move(ret), br});
auto new_buffer = std::unique_ptr<Buffer>(new Buffer{std::move(ret), br});
create_and_record_event(new_buffer->cuda_event_, stream);
return new_buffer;
}
},
storage_
);
}

bool Buffer::is_copy_complete() const {
if (cuda_event_ == nullptr) {
return true; // No copy operation was performed
}
cudaError_t status = cudaEventQuery(cuda_event_);
if (status == cudaSuccess) {
return true;
} else if (status == cudaErrorNotReady) {
return false;
} else {
RAPIDSMPF_CUDA_TRY(status);
return false; // This line is unreachable due to the throw above
}
}

} // namespace rapidsmpf
1 change: 1 addition & 0 deletions cpp/src/communicator/mpi.cpp
@@ -116,6 +116,7 @@ std::unique_ptr<Communicator::Future> MPI::send(
std::unique_ptr<Communicator::Future> MPI::send(
std::unique_ptr<Buffer> msg, Rank rank, Tag tag
) {
RAPIDSMPF_EXPECTS(msg->is_copy_complete(), "buffer copy has not completed yet");
Contributor:

msg is already moved into send. Now, if we throw here, we are losing data, right? 🤔

Member Author:

Unfortunately, you're right. It is unfortunate that you're right because this means there's no good way to push this check onto the communicator and we thus have to force the caller to ensure that. I'll revert those changes and update the docstrings to make the contract to the caller clear: the caller needs to ensure the allocation and data are ready.

Member Author:

Done in eded688.
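
To illustrate the resulting caller-side contract, here is a sketch only: send_when_ready and the rapidsmpf:: qualification of Rank and Tag are assumptions, not code from this PR. The caller confirms the stream-ordered copy has finished, e.g. via is_copy_complete(), before handing the buffer to the communicator.

#include <memory>
#include <utility>

#include <rapidsmpf/buffer/buffer.hpp>
#include <rapidsmpf/communicator/communicator.hpp>

// Illustrative helper: poll until the buffer's copy has finished, then send.
// Real code would interleave other progress work instead of busy-waiting, or
// simply synchronize the stream that produced the copy.
std::unique_ptr<rapidsmpf::Communicator::Future> send_when_ready(
    rapidsmpf::Communicator& comm,
    std::unique_ptr<rapidsmpf::Buffer> msg,
    rapidsmpf::Rank rank,
    rapidsmpf::Tag tag
) {
    while (!msg->is_copy_complete()) {
        // The buffer's CUDA event has not been signalled yet; keep polling.
    }
    return comm.send(std::move(msg), rank, tag);
}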

RAPIDSMPF_EXPECTS(
msg->size <= std::numeric_limits<int>::max(),
"send buffer size exceeds MPI max count"
9 changes: 1 addition & 8 deletions cpp/src/communicator/ucxx.cpp
@@ -1074,7 +1074,6 @@ std::shared_ptr<::ucxx::Endpoint> UCXX::get_endpoint(Rank rank) {
std::unique_ptr<Communicator::Future> UCXX::send(
std::unique_ptr<std::vector<uint8_t>> msg, Rank rank, Tag tag, BufferResource* br
) {
RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize());
auto req = get_endpoint(rank)->tagSend(
msg->data(),
msg->size(),
@@ -1086,7 +1085,7 @@ std::unique_ptr<Communicator::Future> UCXX::send(
std::unique_ptr<Communicator::Future> UCXX::send(
std::unique_ptr<Buffer> msg, Rank rank, Tag tag
) {
RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize());
RAPIDSMPF_EXPECTS(msg->is_copy_complete(), "buffer copy has not completed yet");
auto req = get_endpoint(rank)->tagSend(
msg->data(), msg->size, tag_with_rank(shared_resources_->rank(), tag)
);
@@ -1148,9 +1147,6 @@ std::vector<std::size_t> UCXX::test_some(
completed.push_back(i);
}
}
if (completed.size() > 0) {
RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize());
}
return completed;
}

@@ -1167,9 +1163,6 @@ std::vector<std::size_t> UCXX::test_some(
completed.push_back(key);
}
}
if (completed.size() > 0) {
RAPIDSMPF_CUDA_TRY(cudaDeviceSynchronize());
}
return completed;
}

Expand Down
Loading