
Commit be027c4

add stream.all_reduce API and ProcessGroupStream

1 parent de436f0 · commit be027c4

18 files changed: +691 −14 lines
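In short, this commit inserts a stream-aware base class, ProcessGroupStream, between ProcessGroup and ProcessGroupNCCL, and adds AllReduce overloads that carry a sync_op flag and a use_calc_stream flag, so a collective can run either on the dedicated communication stream or directly on the calculation stream. Below is a minimal caller-side sketch of the C++ surface added here; the helper function, the group instance, and the tensor vectors are illustrative assumptions, not part of the diff.

    #include <vector>

    #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"

    namespace dist = paddle::distributed;

    // Hypothetical helper: run one allreduce through the new overloads.
    void AllReduceSketch(dist::ProcessGroupNCCL& pg,
                         std::vector<phi::DenseTensor>& ins,
                         std::vector<phi::DenseTensor>& outs) {
      dist::AllreduceOptions opts;  // reduce_op defaults to SUM

      // Launch on the dedicated communication stream; Wait() makes the
      // calculation stream wait on the recorded control events.
      auto task = pg.AllReduce(ins, outs, opts,
                               /*sync_op=*/false,
                               /*use_calc_stream=*/false);
      task->Wait();

      // Launch directly on the calculation stream; per the Wait() change in
      // ProcessGroupNCCL.cc below, Wait() only logs a warning and returns.
      pg.AllReduce(ins, outs, opts,
                   /*sync_op=*/true,
                   /*use_calc_stream=*/true);
    }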

paddle/fluid/distributed/collective/CMakeLists.txt

Lines changed: 11 additions & 2 deletions
@@ -2,10 +2,14 @@ cc_library(
   processgroup
   SRCS ProcessGroup.cc
   DEPS dense_tensor)
+cc_library(
+  processgroup_stream
+  SRCS ProcessGroupStream.cc
+  DEPS dense_tensor)
 cc_library(
   eager_reducer
   SRCS reducer.cc
-  DEPS eager_api processgroup phi_api string_helper)
+  DEPS eager_api processgroup processgroup_stream phi_api string_helper)
 
 if(WITH_DISTRIBUTE)
   cc_library(
@@ -18,7 +22,12 @@ if(WITH_NCCL OR WITH_RCCL)
   cc_library(
     processgroup_nccl
     SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
-    DEPS processgroup place enforce collective_helper device_context
+    DEPS processgroup
+         processgroup_stream
+         place
+         enforce
+         collective_helper
+         device_context
          dense_tensor)
   if(WITH_DISTRIBUTE AND WITH_PSCORE)
     if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)

paddle/fluid/distributed/collective/ProcessGroup.cc

Lines changed: 7 additions & 1 deletion
@@ -18,10 +18,16 @@ namespace paddle {
 namespace distributed {
 
 ProcessGroup::Task::Task(int rank,
-                         const std::vector<phi::DenseTensor>& inputTensors,
+                         const std::vector<phi::DenseTensor>& inputs,
                          CommType comm_type)
     : rank_(rank), comm_type_(comm_type) {}
 
+ProcessGroup::Task::Task(int rank,
+                         const std::vector<phi::DenseTensor>& inputs,
+                         CommType comm_type,
+                         bool sync_op)
+    : rank_(rank), comm_type_(comm_type), sync_op_(sync_op) {}
+
 ProcessGroup::Task::~Task() = default;
 
 bool ProcessGroup::Task::IsCompleted() {

paddle/fluid/distributed/collective/ProcessGroup.h

Lines changed: 23 additions & 4 deletions
@@ -55,19 +55,27 @@ class ProcessGroup {
   class Task {
    public:
     Task(int rank,
-         const std::vector<phi::DenseTensor>& inputTensors,
-         CommType opType = CommType::UNKNOWN);
+         const std::vector<phi::DenseTensor>& inputs,
+         CommType comm_type);
+    Task(int rank,
+         const std::vector<phi::DenseTensor>& inputs,
+         CommType comm_type,
+         bool sync_op);
 
     virtual ~Task();
     virtual bool IsCompleted();
     virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
     virtual void Synchronize();
+    bool IsSync() const { return sync_op_; }
 
    protected:
     const int rank_;
-    CommType comm_type_;
+    CommType comm_type_{CommType::UNKNOWN};
     std::mutex mutex_;
-    bool is_completed_ = false;
+    bool is_completed_{false};
+
+   private:
+    bool sync_op_{true};
   };
 
   explicit ProcessGroup(int rank,
@@ -82,6 +90,7 @@ class ProcessGroup {
 
   virtual const std::string GetBackendName() const = 0;
 
+  // TODO(liyurui): This API will be moved later
   virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
       std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
       std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
@@ -90,6 +99,16 @@ class ProcessGroup {
         "ProcessGroup%s does not support allreduce", GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
+      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
+      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
+      const AllreduceOptions&,
+      bool) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support allreduce with sync_op flag",
+        GetBackendName()));
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
       std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
       std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc

Lines changed: 130 additions & 3 deletions
@@ -55,7 +55,20 @@ ProcessGroupNCCL::NCCLTask::NCCLTask(
     int rank,
     CommType CommType,
     const std::vector<phi::DenseTensor>& inputs)
-    : Task(rank, inputs, CommType), places_(places) {
+    : TaskStream(rank, inputs, CommType), places_(places) {
+  control_events_.resize(places.size());
+  ncclComms_.resize(places.size());
+}
+
+ProcessGroupNCCL::NCCLTask::NCCLTask(
+    const std::vector<Place>& places,
+    int rank,
+    CommType comm_type,
+    const std::vector<phi::DenseTensor>& inputs,
+    bool sync_op,
+    bool use_calc_stream)
+    : TaskStream(rank, inputs, comm_type, sync_op, use_calc_stream),
+      places_(places) {
   control_events_.resize(places.size());
   ncclComms_.resize(places.size());
 }
@@ -116,6 +129,13 @@ void ProcessGroupNCCL::CheckSplitSizes(std::vector<int64_t>* split_sizes,
 
 // TODO(sheniang03): Add timeout for wait, now timeout unused
 bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
+  // Warning here when use calc stream but also invoke waiting explicitly.
+  if (UseCalcStream()) {
+    VLOG(3) << "Warning: The communication is on calc stream, wait here is "
+               "useless.";
+    return true;
+  }
+
   SynchronizeStreams();
   if (FLAGS_nccl_blocking_wait) {
     // NOTE(shenliang03): It will block host for sync
@@ -146,7 +166,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr<Store>& store,
                                    int size,
                                    const platform::Place& place,
                                    int gid)
-    : ProcessGroup(rank, size, place, gid), store_(store) {
+    : ProcessGroupStream(rank, size, place, gid), store_(store) {
   platform::SetDeviceId(place_.device);
 }
 
@@ -223,6 +243,81 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
   places_to_ctx_.emplace(places_key, std::move(dev_ctx));
 }
 
+template <typename Fn>
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Collective(
+    std::vector<phi::DenseTensor>& inputs,
+    std::vector<phi::DenseTensor>& outputs,
+    Fn fn,
+    CommType comm_type,
+    bool sync_op,
+    bool use_calc_stream) {
+  const auto& places = GetPlaceList(inputs);
+  const auto& key = GetKeyFromPlaces(places);
+
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
+      CreateNCCLManagerCache(key, places);
+    }
+  }
+
+  auto& nccl_comms = places_to_ncclcomm_[key];
+
+  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+
+  auto task = std::make_shared<ProcessGroupNCCL::NCCLTask>(
+      places, rank_, comm_type, inputs, sync_op, use_calc_stream);
+
+  platform::CUDADeviceGuard cuda_guard;
+
+  {
+    platform::NCCLGroupGuard nccl_guard;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      cuda_guard.SetDevice(places[i]);
+
+      gpuStream_t nccl_stream;
+      if (use_calc_stream) {
+        nccl_stream =
+            static_cast<phi::GPUContext*>(
+                platform::DeviceContextPool::Instance().Get(places[i]))
+                ->stream();
+      } else {
+        nccl_stream = places_to_ctx_[key][i]->stream();
+      }
+
+      fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream);
+    }
+  }
+
+  if (FLAGS_use_stream_safe_cuda_allocator) {
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      cuda_guard.SetDevice(places[i]);
+
+      gpuStream_t nccl_stream;
+      if (use_calc_stream) {
+        nccl_stream =
+            static_cast<phi::GPUContext*>(
+                platform::DeviceContextPool::Instance().Get(places[i]))
+                ->stream();
+      } else {
+        nccl_stream = places_to_ctx_[key][i]->stream();
+      }
+
+      memory::RecordStream(inputs[i].Holder(), nccl_stream);
+    }
+  }
+
+  // Adding stream event dependency only when use comm stream
+  if (!use_calc_stream) {
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      cuda_guard.SetDevice(places[i]);
+      task->control_events_[i].Record(*places_to_ctx_[key][i]);
+    }
+  }
+
+  return task;
+}
+
 template <typename Fn>
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Collective(
     std::vector<phi::DenseTensor>& inputs,
@@ -386,6 +481,37 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
       CommType::ALLREDUCE);
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    const AllreduceOptions& opts,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+  return Collective(
+      in_tensors,
+      out_tensors,
+      [&](const phi::DenseTensor& input,
+          phi::DenseTensor& output,
+          ncclComm_t comm,
+          const gpuStream_t& stream) {
+        return platform::dynload::ncclAllReduce(
+            input.data(),
+            output.data(),
+            input.numel(),
+            platform::ToNCCLDataType(input.type()),
+            ToNCCLRedType(opts.reduce_op),
+            comm,
+            stream);
+      },
+      CommType::ALLREDUCE,
+      sync_op,
+      use_calc_stream);
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
     std::vector<phi::DenseTensor>& in_tensors,
     std::vector<phi::DenseTensor>& out_tensors,
@@ -432,7 +558,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
         new paddle::experimental::DefaultAllocator(place));
     barrierTensors.emplace_back(allocator.get(), meta);
   }
-  auto task = ProcessGroupNCCL::AllReduce(barrierTensors, barrierTensors);
+  auto task = ProcessGroupNCCL::AllReduce(
+      barrierTensors, barrierTensors, AllreduceOptions());
   auto nccl_task = dynamic_cast<ProcessGroupNCCL::NCCLTask*>(task.get());
   nccl_task->barrierTensors_ = std::move(barrierTensors);
   return task;
paddle/fluid/distributed/collective/ProcessGroupNCCL.h

Lines changed: 27 additions & 3 deletions
@@ -21,7 +21,7 @@
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
 #include "paddle/fluid/distributed/store/store.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -46,16 +46,23 @@ namespace distributed {
 
 using Place = paddle::platform::Place;
 
-class ProcessGroupNCCL : public ProcessGroup {
+class ProcessGroupNCCL : public ProcessGroupStream {
  public:
-  class NCCLTask : public ProcessGroup::Task,
+  class NCCLTask : public ProcessGroupStream::TaskStream,
                    public std::enable_shared_from_this<NCCLTask> {
    public:
     NCCLTask(const std::vector<Place>& places,
             int rank,
             CommType CommType,
             const std::vector<phi::DenseTensor>& inputs);
 
+    NCCLTask(const std::vector<Place>& places,
+            int rank,
+            CommType comm_type,
+            const std::vector<phi::DenseTensor>& inputs,
+            bool is_sync,
+            bool use_calc_stream);
+
     bool IsCompleted();
 
     void SynchronizeStreams();
@@ -89,6 +96,14 @@ class ProcessGroupNCCL : public ProcessGroup {
     return std::string(NCCL_BACKEND_NAME);
   }
 
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
+      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
+      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
+      const AllreduceOptions& options,
+      bool sync_op,
+      bool use_calc_stream) override;
+
+  // TODO(liyurui): This API will be moved later
   std::shared_ptr<ProcessGroup::Task> AllReduce(
       std::vector<phi::DenseTensor>& in_tensors,
       std::vector<phi::DenseTensor>& out_tensors,
@@ -194,6 +209,15 @@ class ProcessGroupNCCL : public ProcessGroup {
       Fn fn,
       CommType op_type);
 
+  template <typename Fn>
+  std::shared_ptr<ProcessGroupStream::Task> Collective(
+      std::vector<phi::DenseTensor>& inputs,   // NOLINT
+      std::vector<phi::DenseTensor>& outputs,  // NOLINT
+      Fn fn,
+      CommType comm_type,
+      bool sync_op,
+      bool use_calc_stream);
+
   template <typename Fn>
   void Collective(const phi::DenseTensor*,
                   phi::DenseTensor*,
paddle/fluid/distributed/collective/ProcessGroupStream.cc

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
+
+namespace paddle {
+namespace distributed {
+
+ProcessGroupStream::ProcessGroupStream(int rank,
+                                       int size,
+                                       const platform::Place& place,
+                                       int gid)
+    : ProcessGroup(rank, size, place, gid) {}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllReduce(
+    std::vector<phi::DenseTensor>& input_tensors,   // NOLINT
+    std::vector<phi::DenseTensor>& output_tensors,  // NOLINT
+    const AllreduceOptions& options,
+    bool sync_op) {
+  return AllReduce(input_tensors,
+                   output_tensors,
+                   options,
+                   sync_op,
+                   /*use_calc_stream*/ false);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllReduce(
+    std::vector<phi::DenseTensor>& input_tensors,   // NOLINT
+    std::vector<phi::DenseTensor>& output_tensors,  // NOLINT
+    const AllreduceOptions& options,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "ProcessGroup%s does not support do allreduce", GetBackendName()));
+}
+
+}  // namespace distributed
+}  // namespace paddle
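A design note that follows from the two definitions above (not spelled out in the diff): the four-argument AllReduce is what a frontend such as stream.all_reduce would call, and it forwards to the five-argument overload with use_calc_stream fixed to false; the five-argument overload is the single point a backend like ProcessGroupNCCL overrides, with the base implementation throwing for backends that do not. A minimal illustration, with the group reference and tensor vectors assumed:

    #include <vector>

    #include "paddle/fluid/distributed/collective/ProcessGroupStream.h"

    namespace dist = paddle::distributed;

    // Minimal illustration; the group reference and tensors come from elsewhere.
    void LaunchBoth(dist::ProcessGroupStream& pg,
                    std::vector<phi::DenseTensor>& ins,
                    std::vector<phi::DenseTensor>& outs) {
      dist::AllreduceOptions opts;

      // Goes through the four-argument forwarder above (use_calc_stream = false),
      // then virtual dispatch lands in the backend's five-argument override.
      auto task = pg.AllReduce(ins, outs, opts, /*sync_op=*/false);
      task->Wait();

      // Calls the five-argument overload directly, on the calculation stream.
      pg.AllReduce(ins, outs, opts, /*sync_op=*/true, /*use_calc_stream=*/true);
    }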
