From 3f7047e70621592dbd5c1a999009bf98269bb487 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 2 Apr 2024 17:29:15 +0000 Subject: [PATCH 01/25] [dist-rg] adds distributed row-gatherer Signed-off-by: Marcel Koch --- core/CMakeLists.txt | 1 + core/distributed/row_gatherer.cpp | 215 ++++++++++++++++++ .../ginkgo/core/distributed/row_gatherer.hpp | 174 ++++++++++++++ 3 files changed, 390 insertions(+) create mode 100644 core/distributed/row_gatherer.cpp create mode 100644 include/ginkgo/core/distributed/row_gatherer.hpp diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 8c915c1f7ff..b7342bb03d8 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -154,6 +154,7 @@ if(GINKGO_BUILD_MPI) distributed/matrix.cpp distributed/neighborhood_communicator.cpp distributed/partition_helpers.cpp + distributed/row_gatherer.cpp distributed/vector.cpp distributed/preconditioner/schwarz.cpp ) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp new file mode 100644 index 00000000000..b8fdd86815c --- /dev/null +++ b/core/distributed/row_gatherer.cpp @@ -0,0 +1,215 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "ginkgo/core/distributed/row_gatherer.hpp" + +#include +#include +#include +#include +#include + +#include "core/base/dispatch_helper.hpp" + +namespace gko { +namespace experimental { +namespace distributed { + + +#if GINKGO_HAVE_OPENMPI_POST_4_1_X +using DefaultCollComm = mpi::NeighborhoodCommunicator; +#else +using DefaultCollComm = mpi::DenseCommunicator; +#endif + + +template +void RowGatherer::apply_impl(const LinOp* b, LinOp* x) const +{ + apply_async(b, x).wait(); +} + + +template +void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* b, + const LinOp* beta, LinOp* x) const + GKO_NOT_IMPLEMENTED; + + +template +std::future RowGatherer::apply_async( + ptr_param b, ptr_param x) const +{ + auto op = [b = b.get(), x = x.get(), rg = this->shared_from_this(), + id = current_id_++] { + // ensure that the communications are executed in the order + // the apply_async were called + while (id > rg->active_id_.load()) { + std::this_thread::yield(); + } + + // dispatch global vector + run, std::complex>( + b, [&](const auto* b_global) { + using ValueType = + typename std::decay_t::value_type; + // dispatch local vector with the same precision as the global + // vector + ::gko::precision_dispatch( + [&](auto* x_local) { + auto exec = rg->get_executor(); + + auto b_local = b_global->get_local_vector(); + rg->send_buffer.template init( + b_local->get_executor(), + dim<2>(rg->coll_comm_->get_send_size(), + b_local->get_size()[1])); + rg->send_buffer.template get()->fill(0.0); + b_local->row_gather( + &rg->send_idxs_, + rg->send_buffer.template get()); + + auto recv_ptr = x_local->get_values(); + auto send_ptr = + rg->send_buffer.template get() + ->get_values(); + + exec->synchronize(); + mpi::contiguous_type type( + b_local->get_size()[1], + mpi::type_impl::get_type()); + auto g = exec->get_scoped_device_id_guard(); + auto req = rg->coll_comm_->i_all_to_all_v( + exec, send_ptr, type.get(), recv_ptr, type.get()); + req.wait(); + }, + x); + }); + + rg->active_id_++; + }; + return std::async(std::launch::async, op); +} + + +template +std::shared_ptr +RowGatherer::get_collective_communicator() const +{ + return coll_comm_; +} + + +template +template +RowGatherer::RowGatherer( + std::shared_ptr exec, + std::shared_ptr coll_comm, + const index_map& imap) + : EnableLinOp( + exec, 
dim<2>{imap.get_non_local_size(), imap.get_global_size()}), + DistributedBase(coll_comm->get_base_communicator()), + coll_comm_(std::move(coll_comm)), + send_idxs_(exec) +{ + // check that the coll_comm_ and imap have the same recv size + // the same check for the send size is not possible, since the + // imap doesn't store send indices + GKO_THROW_IF_INVALID( + coll_comm_->get_recv_size() == imap.get_non_local_size(), + "The collective communicator doesn't match the index map."); + + auto comm = coll_comm_->get_base_communicator(); + auto inverse_comm = coll_comm_->create_inverse(); + + send_idxs_.resize_and_reset(coll_comm_->get_send_size()); + inverse_comm + ->i_all_to_all_v(exec, + imap.get_remote_local_idxs().get_const_flat_data(), + send_idxs_.get_data()) + .wait(); +} + + +template +RowGatherer::RowGatherer(std::shared_ptr exec, + mpi::communicator comm) + : EnableLinOp(exec), + DistributedBase(comm), + coll_comm_(std::make_shared(comm)), + send_idxs_(exec) +{} + + +template +RowGatherer::RowGatherer(RowGatherer&& o) noexcept + : EnableLinOp(o.get_executor()), + DistributedBase(o.get_communicator()), + send_idxs_(o.get_executor()) +{ + *this = std::move(o); +} + + +template +RowGatherer& RowGatherer::operator=( + const RowGatherer& o) +{ + if (this != &o) { + this->set_size(o.get_size()); + coll_comm_ = o.coll_comm_; + send_idxs_ = o.send_idxs_; + } + return *this; +} + + +template +RowGatherer& RowGatherer::operator=( + RowGatherer&& o) +{ + if (this != &o) { + this->set_size(o.get_size()); + o.set_size({}); + coll_comm_ = std::exchange( + o.coll_comm_, + std::make_shared(o.get_communicator())); + send_idxs_ = std::move(o.send_idxs_); + } + return *this; +} + + +template +RowGatherer::RowGatherer(const RowGatherer& o) + : EnableLinOp(o.get_executor()), + DistributedBase(o.get_communicator()), + send_idxs_(o.get_executor()) +{ + *this = o; +} + + +#define GKO_DECLARE_ROW_GATHERER(_itype) class RowGatherer<_itype> + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_ROW_GATHERER); + +#undef GKO_DECLARE_ROW_GATHERER + + +#define GKO_DECLARE_ROW_GATHERER_CONSTRUCTOR(_ltype, _gtype) \ + RowGatherer<_ltype>::RowGatherer( \ + std::shared_ptr exec, \ + std::shared_ptr coll_comm, \ + const index_map<_ltype, _gtype>& imap) + +GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_ROW_GATHERER_CONSTRUCTOR); + +#undef GKO_DECLARE_ROW_GATHERER_CONSTRUCTOR + + +} // namespace distributed +} // namespace experimental +} // namespace gko diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp new file mode 100644 index 00000000000..680bfc15070 --- /dev/null +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -0,0 +1,174 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_ROW_GATHERER_HPP_ +#define GKO_PUBLIC_CORE_DISTRIBUTED_ROW_GATHERER_HPP_ + + +#include + + +#if GINKGO_BUILD_MPI + + +#include + +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace experimental { +namespace distributed { + + +/** + * The distributed::RowGatherer gathers the rows of distributed::Vector that + * are located on other processes. 
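+ * Which rows are gathered is defined by an index map, and the communication
+ * itself is carried out by a collective communicator.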
+ * + * Example usage: + * ```c++ + * auto coll_comm = std::make_shared(comm, + * imap); auto rg = distributed::RowGatherer::create(exec, coll_comm, + * imap); + * + * auto b = distributed::Vector::create(...); + * auto x = matrix::Dense::create(...); + * + * auto future = rg->apply_async(b, x); + * // do some computation that doesn't modify b, or access x + * future.wait(); + * // x now contains the gathered rows of b + * ``` + * Using the apply instead of the apply_async will lead to a blocking + * communication. + * + * @note Objects of this class are only available as shared_ptr, since the class + * is derived from std::enable_shared_from_this. + * + * @tparam LocalIndexType the index type for the stored indices + */ +template +class RowGatherer final + : public EnableLinOp>, + public DistributedBase, + public std::enable_shared_from_this> { + friend class EnablePolymorphicObject; + +public: + /** + * Asynchronous version of LinOp::apply. + * + * It is asynchronous only wrt. the calling thread. Multiple calls to this + * function will execute in order, they are not asynchronous with each + * other. + * + * @param b the input distributed::Vector + * @param x the output matrix::Dense with the rows gathered from b + * @return a future for this task. The task is guarantueed to completed + * after `.wait()` has been called on the future. + */ + std::future apply_async(ptr_param b, + ptr_param x) const; + + /** + * Get the used collective communicator. + * + * @return the used collective communicator + */ + std::shared_ptr + get_collective_communicator() const; + + /** + * Creates a distributed::RowGatherer from a given collective communicator + * and index map. + * + * @TODO: using a segmented array instead of the imap would probably be + * more general + * + * @tparam GlobalIndexType the global index type of the index map + * + * @param exec the executor + * @param coll_comm the collective communicator + * @param imap the index map defining which rows to gather + * + * @note The coll_comm and imap have to be compatible. The coll_comm must + * send and recv exactly as many rows as the imap defines. + * + * @return a shared_ptr to the created distributed::RowGatherer + */ + template = + sizeof(LocalIndexType)>> + static std::shared_ptr create( + std::shared_ptr exec, + std::shared_ptr coll_comm, + const index_map& imap) + { + return std::shared_ptr( + new RowGatherer(std::move(exec), std::move(coll_comm), imap)); + } + + /* + * Create method for an empty RowGatherer. 
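+     *
+     * The resulting RowGatherer is empty (size 0 x 0) and is mostly used as
+     * a placeholder, e.g. by distributed::Matrix before its communication
+     * pattern is set up.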
+ */ + static std::shared_ptr create( + std::shared_ptr exec, mpi::communicator comm) + { + return std::shared_ptr( + new RowGatherer(std::move(exec), std::move(comm))); + } + + RowGatherer(const RowGatherer& o); + + RowGatherer(RowGatherer&& o) noexcept; + + RowGatherer& operator=(const RowGatherer& o); + + RowGatherer& operator=(RowGatherer&& o); + +protected: + void apply_impl(const LinOp* b, LinOp* x) const override; + + void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, + LinOp* x) const override; + +private: + /** + * @copydoc RowGatherer::create(std::shared_ptr, std::shared_ptr, + * const index_map&) + */ + template + RowGatherer(std::shared_ptr exec, + std::shared_ptr coll_comm, + const index_map& imap); + + /** + * @copydoc RowGatherer::create(std::shared_ptr, mpi::communicator) + */ + RowGatherer(std::shared_ptr exec, mpi::communicator comm); + + std::shared_ptr coll_comm_; + + array send_idxs_; + + detail::AnyDenseCache send_buffer; + + mutable int64 current_id_{0}; + mutable std::atomic active_id_{0}; +}; + + +} // namespace distributed +} // namespace experimental +} // namespace gko + +#endif +#endif // GKO_PUBLIC_CORE_DISTRIBUTED_ROW_GATHERER_HPP_ From 0c1ffafcd74aa901afd3d0c4489d7cf7cefd88b2 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 19 Apr 2024 17:32:04 +0200 Subject: [PATCH 02/25] [dist-rg] handle copy to host buffer --- core/distributed/row_gatherer.cpp | 41 ++++++++++++++----- .../ginkgo/core/distributed/row_gatherer.hpp | 3 +- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index b8fdd86815c..ebd6ea5efbd 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -60,29 +60,48 @@ std::future RowGatherer::apply_async( [&](auto* x_local) { auto exec = rg->get_executor(); + auto use_host_buffer = mpi::requires_host_buffer( + exec, rg->coll_comm_->get_base_communicator()); + auto mpi_exec = + use_host_buffer ? exec->get_master() : exec; + auto b_local = b_global->get_local_vector(); - rg->send_buffer.template init( - b_local->get_executor(), - dim<2>(rg->coll_comm_->get_send_size(), - b_local->get_size()[1])); - rg->send_buffer.template get()->fill(0.0); + rg->send_buffer_.template init( + mpi_exec, dim<2>(rg->coll_comm_->get_send_size(), + b_local->get_size()[1])); + rg->send_buffer_.template get()->fill(0.0); b_local->row_gather( &rg->send_idxs_, - rg->send_buffer.template get()); - - auto recv_ptr = x_local->get_values(); + rg->send_buffer_.template get()); + + if (use_host_buffer) { + rg->recv_buffer_.template init( + mpi_exec, x_local->get_size()); + } + + auto recv_ptr = + use_host_buffer + ? 
rg->recv_buffer_.template get() + ->get_values() + : x_local->get_values(); auto send_ptr = - rg->send_buffer.template get() + rg->send_buffer_.template get() ->get_values(); - exec->synchronize(); + mpi_exec->synchronize(); mpi::contiguous_type type( b_local->get_size()[1], mpi::type_impl::get_type()); auto g = exec->get_scoped_device_id_guard(); auto req = rg->coll_comm_->i_all_to_all_v( - exec, send_ptr, type.get(), recv_ptr, type.get()); + mpi_exec, send_ptr, type.get(), recv_ptr, + type.get()); req.wait(); + + if (use_host_buffer) { + x_local->copy_from( + rg->recv_buffer_.template get()); + } }, x); }); diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index 680bfc15070..fbf68027756 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -159,7 +159,8 @@ class RowGatherer final array send_idxs_; - detail::AnyDenseCache send_buffer; + detail::AnyDenseCache send_buffer_; + detail::AnyDenseCache recv_buffer_; mutable int64 current_id_{0}; mutable std::atomic active_id_{0}; From 6031ddc78d0794d857419ce1bcb947107435a53d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 30 Apr 2024 13:30:46 +0200 Subject: [PATCH 03/25] [dist-rg] use mpi request instead of future --- core/distributed/row_gatherer.cpp | 162 ++++++++++-------- .../ginkgo/core/distributed/row_gatherer.hpp | 45 +++-- 2 files changed, 117 insertions(+), 90 deletions(-) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index ebd6ea5efbd..6f335441868 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -38,77 +38,85 @@ void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* b, template -std::future RowGatherer::apply_async( - ptr_param b, ptr_param x) const +mpi::request RowGatherer::apply_async(ptr_param b, + ptr_param x) const { - auto op = [b = b.get(), x = x.get(), rg = this->shared_from_this(), - id = current_id_++] { - // ensure that the communications are executed in the order - // the apply_async were called - while (id > rg->active_id_.load()) { - std::this_thread::yield(); - } - - // dispatch global vector - run, std::complex>( - b, [&](const auto* b_global) { - using ValueType = - typename std::decay_t::value_type; - // dispatch local vector with the same precision as the global - // vector - ::gko::precision_dispatch( - [&](auto* x_local) { - auto exec = rg->get_executor(); - - auto use_host_buffer = mpi::requires_host_buffer( - exec, rg->coll_comm_->get_base_communicator()); - auto mpi_exec = - use_host_buffer ? exec->get_master() : exec; - - auto b_local = b_global->get_local_vector(); - rg->send_buffer_.template init( - mpi_exec, dim<2>(rg->coll_comm_->get_send_size(), - b_local->get_size()[1])); - rg->send_buffer_.template get()->fill(0.0); - b_local->row_gather( - &rg->send_idxs_, - rg->send_buffer_.template get()); - - if (use_host_buffer) { - rg->recv_buffer_.template init( - mpi_exec, x_local->get_size()); - } - - auto recv_ptr = - use_host_buffer - ? 
rg->recv_buffer_.template get() - ->get_values() - : x_local->get_values(); - auto send_ptr = - rg->send_buffer_.template get() - ->get_values(); - - mpi_exec->synchronize(); - mpi::contiguous_type type( - b_local->get_size()[1], - mpi::type_impl::get_type()); - auto g = exec->get_scoped_device_id_guard(); - auto req = rg->coll_comm_->i_all_to_all_v( - mpi_exec, send_ptr, type.get(), recv_ptr, - type.get()); - req.wait(); - - if (use_host_buffer) { - x_local->copy_from( - rg->recv_buffer_.template get()); - } - }, - x); - }); - - rg->active_id_++; - }; - return std::async(std::launch::async, op); + int is_inactive; + MPI_Status status; + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Request_get_status(req_listener_, &is_inactive, &status)); + // This is untestable. Some processes might complete the previous request + // while others don't, so it's impossible to create a predictable behavior + // for a test. + GKO_THROW_IF_INVALID(is_inactive, + "Tried to call RowGatherer::apply_async while there " + "is already an active communication. Please use the " + "overload with a workspace to handle multiple " + "connections."); + + auto req = apply_async(b, x, send_workspace_); + req_listener_ = *req.get(); + return req; +} + + +template +mpi::request RowGatherer::apply_async( + ptr_param b, ptr_param x, array& workspace) const +{ + mpi::request req; + + // dispatch global vector + run, std::complex>( + b.get(), [&](const auto* b_global) { + using ValueType = + typename std::decay_t::value_type; + // dispatch local vector with the same precision as the global + // vector + ::gko::precision_dispatch( + [&](auto* x_local) { + auto exec = this->get_executor(); + + auto use_host_buffer = mpi::requires_host_buffer( + exec, coll_comm_->get_base_communicator()); + auto mpi_exec = use_host_buffer ? exec->get_master() : exec; + + GKO_THROW_IF_INVALID( + !use_host_buffer || mpi_exec->memory_accessible( + x_local->get_executor()), + "The receive buffer uses device memory, but MPI " + "support of device memory is not available. 
Please " + "provide a host buffer or enable MPI support for " + "device memory."); + + auto b_local = b_global->get_local_vector(); + + dim<2> send_size(coll_comm_->get_send_size(), + b_local->get_size()[1]); + workspace.set_executor(mpi_exec); + workspace.resize_and_reset(sizeof(ValueType) * + send_size[0] * send_size[1]); + auto send_buffer = matrix::Dense::create( + mpi_exec, send_size, + make_array_view( + mpi_exec, send_size[0] * send_size[1], + reinterpret_cast(workspace.get_data())), + send_size[1]); + b_local->row_gather(&send_idxs_, send_buffer); + + auto recv_ptr = x_local->get_values(); + auto send_ptr = send_buffer->get_values(); + + mpi_exec->synchronize(); + mpi::contiguous_type type( + b_local->get_size()[1], + mpi::type_impl::get_type()); + req = coll_comm_->i_all_to_all_v( + mpi_exec, send_ptr, type.get(), recv_ptr, type.get()); + }, + x.get()); + }); + return req; } @@ -130,7 +138,9 @@ RowGatherer::RowGatherer( exec, dim<2>{imap.get_non_local_size(), imap.get_global_size()}), DistributedBase(coll_comm->get_base_communicator()), coll_comm_(std::move(coll_comm)), - send_idxs_(exec) + send_idxs_(exec), + send_workspace_(exec), + req_listener_(MPI_REQUEST_NULL) { // check that the coll_comm_ and imap have the same recv size // the same check for the send size is not possible, since the @@ -157,7 +167,9 @@ RowGatherer::RowGatherer(std::shared_ptr exec, : EnableLinOp(exec), DistributedBase(comm), coll_comm_(std::make_shared(comm)), - send_idxs_(exec) + send_idxs_(exec), + send_workspace_(exec), + req_listener_(MPI_REQUEST_NULL) {} @@ -165,7 +177,9 @@ template RowGatherer::RowGatherer(RowGatherer&& o) noexcept : EnableLinOp(o.get_executor()), DistributedBase(o.get_communicator()), - send_idxs_(o.get_executor()) + send_idxs_(o.get_executor()), + send_workspace_(o.get_executor()), + req_listener_(MPI_REQUEST_NULL) { *this = std::move(o); } @@ -195,6 +209,8 @@ RowGatherer& RowGatherer::operator=( o.coll_comm_, std::make_shared(o.get_communicator())); send_idxs_ = std::move(o.send_idxs_); + send_workspace_ = std::move(o.send_workspace_); + req_listener_ = std::exchange(o.req_listener_, MPI_REQUEST_NULL); } return *this; } diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index fbf68027756..844dae40717 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -12,8 +12,6 @@ #if GINKGO_BUILD_MPI -#include - #include #include #include @@ -54,27 +52,42 @@ namespace distributed { * @tparam LocalIndexType the index type for the stored indices */ template -class RowGatherer final - : public EnableLinOp>, - public DistributedBase, - public std::enable_shared_from_this> { +class RowGatherer final : public EnableLinOp>, + public DistributedBase { friend class EnablePolymorphicObject; public: /** * Asynchronous version of LinOp::apply. * - * It is asynchronous only wrt. the calling thread. Multiple calls to this - * function will execute in order, they are not asynchronous with each - * other. + * @warning Only one mpi::request can be active at any given time. This + * function will throw if another request is already active. + * + * @param b the input distributed::Vector + * @param x the output matrix::Dense with the rows gathered from b + * @return a mpi::request for this task. The task is guaranteed to + * be completed only after `.wait()` has been called on it. 
+ */ + mpi::request apply_async(ptr_param b, + ptr_param x) const; + + /** + * Asynchronous version of LinOp::apply. + * + * @warning Calling this multiple times with the same workspace and without + * waiting on each previous request will lead to incorrect + * data transfers. * * @param b the input distributed::Vector * @param x the output matrix::Dense with the rows gathered from b - * @return a future for this task. The task is guarantueed to completed - * after `.wait()` has been called on the future. + * @param workspace a workspace to store temporary data for the operation. + * This might not be modified before the request is + * waited on. + * @return a mpi::request for this task. The task is guaranteed to + * be completed only after `.wait()` has been called on it. */ - std::future apply_async(ptr_param b, - ptr_param x) const; + mpi::request apply_async(ptr_param b, ptr_param x, + array& workspace) const; /** * Get the used collective communicator. @@ -159,11 +172,9 @@ class RowGatherer final array send_idxs_; - detail::AnyDenseCache send_buffer_; - detail::AnyDenseCache recv_buffer_; + mutable array send_workspace_; - mutable int64 current_id_{0}; - mutable std::atomic active_id_{0}; + mutable MPI_Request req_listener_{MPI_REQUEST_NULL}; }; From 85fff10410d14c280162ab5c9ede26d28274563e Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 4 Apr 2024 10:08:56 +0000 Subject: [PATCH 04/25] [dist-rg] adds distributed row-gatherer tests Signed-off-by: Marcel Koch --- core/test/gtest/ginkgo_mpi_main.cpp | 10 +- core/test/mpi/distributed/CMakeLists.txt | 1 + core/test/mpi/distributed/row_gatherer.cpp | 266 +++++++++++++++++++++ 3 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 core/test/mpi/distributed/row_gatherer.cpp diff --git a/core/test/gtest/ginkgo_mpi_main.cpp b/core/test/gtest/ginkgo_mpi_main.cpp index 07a1c2c343d..eeaa6578bce 100644 --- a/core/test/gtest/ginkgo_mpi_main.cpp +++ b/core/test/gtest/ginkgo_mpi_main.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -356,7 +356,13 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - MPI_Init(&argc, &argv); + int provided_thread_support; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, + &provided_thread_support); + if (provided_thread_support != MPI_THREAD_MULTIPLE) { + throw std::runtime_error( + "This test requires an thread compliant MPI implementation."); + } MPI_Comm comm(MPI_COMM_WORLD); int rank; int size; diff --git a/core/test/mpi/distributed/CMakeLists.txt b/core/test/mpi/distributed/CMakeLists.txt index 4186a6c5617..7d0cd5c748f 100644 --- a/core/test/mpi/distributed/CMakeLists.txt +++ b/core/test/mpi/distributed/CMakeLists.txt @@ -1,6 +1,7 @@ ginkgo_create_test(helpers MPI_SIZE 1 LABELS distributed) ginkgo_create_test(matrix MPI_SIZE 1 LABELS distributed) ginkgo_create_test(collective_communicator MPI_SIZE 6 LABELS distributed) +ginkgo_create_test(row_gatherer MPI_SIZE 6 LABELS distributed) ginkgo_create_test(vector_cache MPI_SIZE 3 LABELS distributed) add_subdirectory(preconditioner) diff --git a/core/test/mpi/distributed/row_gatherer.cpp b/core/test/mpi/distributed/row_gatherer.cpp new file mode 100644 index 00000000000..9241e4d20a6 --- /dev/null +++ b/core/test/mpi/distributed/row_gatherer.cpp @@ -0,0 +1,266 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include 
+#include +#include + +#include "core/test/utils.hpp" +#include "core/test/utils/assertions.hpp" + + +template +class RowGatherer : public ::testing::Test { +protected: + using index_type = IndexType; + using part_type = + gko::experimental::distributed::Partition; + using map_type = + gko::experimental::distributed::index_map; + using row_gatherer_type = + gko::experimental::distributed::RowGatherer; + + RowGatherer() + { + int rank = this->comm.rank(); + auto part = gko::share(part_type::build_from_global_size_uniform( + this->ref, this->comm.size(), this->comm.size() * 3)); + auto recv_connections = + this->template create_recv_connections()[rank]; + auto imap = + map_type{this->ref, part, this->comm.rank(), recv_connections}; + auto coll_comm = + std::make_shared( + this->comm, imap); + rg = row_gatherer_type::create(ref, coll_comm, imap); + } + + void SetUp() override { ASSERT_EQ(comm.size(), 6); } + + template + std::array, 6> create_recv_connections() + { + return {gko::array{ref, {3, 5, 10, 11}}, + gko::array{ref, {0, 1, 7, 12, 13}}, + gko::array{ref, {3, 4, 17}}, + gko::array{ref, {1, 2, 12, 14}}, + gko::array{ref, {4, 5, 9, 10, 15, 16}}, + gko::array{ref, {8, 12, 13, 14}}}; + } + + std::shared_ptr ref = gko::ReferenceExecutor::create(); + gko::experimental::mpi::communicator comm = MPI_COMM_WORLD; + std::shared_ptr rg; +}; + +TYPED_TEST_SUITE(RowGatherer, gko::test::IndexTypes, TypenameNameGenerator); + +TYPED_TEST(RowGatherer, CanDefaultConstruct) +{ + using RowGatherer = typename TestFixture::row_gatherer_type; + + auto rg = RowGatherer::create(this->ref, this->comm); + + GKO_ASSERT_EQUAL_DIMENSIONS(rg, gko::dim<2>()); +} + + +TYPED_TEST(RowGatherer, CanConstructWithEmptCollectiveCommAndIndexMap) +{ + using RowGatherer = typename TestFixture::row_gatherer_type; + using IndexMap = typename TestFixture::map_type; + auto coll_comm = + std::make_shared( + this->comm); + auto map = IndexMap{this->ref}; + + auto rg = RowGatherer::create(this->ref, coll_comm, map); + + GKO_ASSERT_EQUAL_DIMENSIONS(rg, gko::dim<2>()); +} + + +TYPED_TEST(RowGatherer, CanConstructFromCollectiveCommAndIndexMap) +{ + using RowGatherer = typename TestFixture::row_gatherer_type; + using Part = typename TestFixture::part_type; + using IndexMap = typename TestFixture::map_type; + int rank = this->comm.rank(); + auto part = gko::share(Part::build_from_global_size_uniform( + this->ref, this->comm.size(), this->comm.size() * 3)); + auto recv_connections = + this->template create_recv_connections()[rank]; + auto imap = IndexMap{this->ref, part, this->comm.rank(), recv_connections}; + auto coll_comm = + std::make_shared( + this->comm, imap); + + auto rg = RowGatherer::create(this->ref, coll_comm, imap); + + gko::dim<2> size{recv_connections.get_size(), 18}; + GKO_ASSERT_EQUAL_DIMENSIONS(rg, size); +} + + +TYPED_TEST(RowGatherer, CanApply) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + + this->rg->apply(b, x); + + auto expected = this->template create_recv_connections()[rank]; + auto expected_vec = Dense::create( + this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); +} + + +TYPED_TEST(RowGatherer, CanApplyAsync) +{ + using Dense = 
gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + + auto req = this->rg->apply_async(b, x); + req.wait(); + + auto expected = this->template create_recv_connections()[rank]; + auto expected_vec = Dense::create( + this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); +} + + +TYPED_TEST(RowGatherer, CanApplyAsyncConsequetively) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + + this->rg->apply_async(b, x).wait(); + this->rg->apply_async(b, x).wait(); + + auto expected = this->template create_recv_connections()[rank]; + auto expected_vec = Dense::create( + this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); +} + + +TYPED_TEST(RowGatherer, CanApplyAsyncWithWorkspace) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + gko::array workspace(this->ref); + + auto req = this->rg->apply_async(b, x, workspace); + req.wait(); + + auto expected = this->template create_recv_connections()[rank]; + auto expected_vec = Dense::create( + this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); +} + + +TYPED_TEST(RowGatherer, CanApplyAsyncMultipleTimesWithWorkspace) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b1 = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + auto b2 = gko::clone(b1); + b2->scale(gko::initialize({-1}, this->ref)); + auto x1 = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + auto x2 = gko::clone(x1); + gko::array workspace1(this->ref); + gko::array workspace2(this->ref); + + auto req1 = this->rg->apply_async(b1, x1, workspace1); + auto req2 = this->rg->apply_async(b2, x2, workspace2); + req1.wait(); + req2.wait(); + + auto expected = this->template create_recv_connections()[rank]; + auto expected_vec1 = Dense::create( + this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + auto expected_vec2 = gko::clone(expected_vec1); + expected_vec2->scale(gko::initialize({-1}, this->ref)); + GKO_ASSERT_MTX_NEAR(x1, expected_vec1, 0.0); + GKO_ASSERT_MTX_NEAR(x2, expected_vec2, 0.0); +} + + +TYPED_TEST(RowGatherer, CanApplyAsyncWithMultipleColumns) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b = Vector::create( + this->ref, 
this->comm, gko::dim<2>{18, 2}, + gko::initialize({{offset, offset * offset}, + {offset + 1, offset * offset + 1}, + {offset + 2, offset * offset + 2}}, + this->ref)); + auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 2}); + + this->rg->apply_async(b, x).wait(); + + gko::array expected[] = { + gko::array{this->ref, {3, 9, 5, 11, 10, 82, 11, 83}}, + gko::array{this->ref, {0, 0, 1, 1, 7, 37, 12, 144, 13, 145}}, + gko::array{this->ref, {3, 9, 4, 10, 17, 227}}, + gko::array{this->ref, {1, 1, 2, 2, 12, 144, 14, 146}}, + gko::array{this->ref, + {4, 10, 5, 11, 9, 81, 10, 82, 15, 225, 16, 226}}, + gko::array{this->ref, {8, 38, 12, 144, 13, 145, 14, 146}}}; + auto expected_vec = + Dense::create(this->ref, gko::dim<2>{expected[rank].get_size() / 2, 2}, + expected[rank], 2); + GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); +} + + +TYPED_TEST(RowGatherer, ThrowsOnAdvancedApply) +{ + using RowGatherer = typename TestFixture::row_gatherer_type; + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + auto rg = RowGatherer::create(this->ref, this->comm); + auto b = Vector::create(this->ref, this->comm); + auto x = Dense::create(this->ref); + auto alpha = Dense::create(this->ref, gko::dim<2>{1, 1}); + auto beta = Dense::create(this->ref, gko::dim<2>{1, 1}); + + ASSERT_THROW(rg->apply(alpha, b, beta, x), gko::NotImplemented); +} From 5212fb2eb3156514c212f4fb120f00b42f9c33d4 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 30 Apr 2024 14:00:27 +0200 Subject: [PATCH 05/25] [dist-mat] use row-gatherer --- core/distributed/matrix.cpp | 246 ++++++--------------- include/ginkgo/core/distributed/matrix.hpp | 20 +- 2 files changed, 70 insertions(+), 196 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 6e264c1b765..1edc8b33d30 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -33,57 +34,14 @@ GKO_REGISTER_OPERATION(separate_local_nonlocal, template void initialize_communication_pattern( - std::shared_ptr exec, mpi::communicator comm, const index_map& imap, - std::vector& recv_sizes, - std::vector& recv_offsets, - std::vector& send_sizes, - std::vector& send_offsets, - array& gather_idxs) + std::shared_ptr>& row_gatherer) { - // exchange step 1: determine recv_sizes, send_sizes, send_offsets - auto host_recv_targets = - make_temporary_clone(exec->get_master(), &imap.get_remote_target_ids()); - auto host_offsets = make_temporary_clone( - exec->get_master(), &imap.get_remote_global_idxs().get_offsets()); - auto compute_recv_sizes = [](const auto* recv_targets, size_type size, - const auto* offsets, auto& recv_sizes) { - for (size_type i = 0; i < size; ++i) { - recv_sizes[recv_targets[i]] = offsets[i + 1] - offsets[i]; - } - }; - std::fill(recv_sizes.begin(), recv_sizes.end(), 0); - compute_recv_sizes(host_recv_targets->get_const_data(), - host_recv_targets->get_size(), - host_offsets->get_const_data(), recv_sizes); - std::partial_sum(recv_sizes.begin(), recv_sizes.end(), - recv_offsets.begin() + 1); - comm.all_to_all(exec, recv_sizes.data(), 1, send_sizes.data(), 1); - std::partial_sum(send_sizes.begin(), send_sizes.end(), - send_offsets.begin() + 1); - send_offsets[0] = 0; - recv_offsets[0] = 0; - - // exchange step 2: exchange gather_idxs from receivers to senders - auto recv_gather_idxs = - make_const_array_view( - imap.get_executor(), imap.get_non_local_size(), - 
imap.get_remote_local_idxs().get_const_flat_data()) - .copy_to_array(); - auto use_host_buffer = mpi::requires_host_buffer(exec, comm); - if (use_host_buffer) { - recv_gather_idxs.set_executor(exec->get_master()); - gather_idxs.clear(); - gather_idxs.set_executor(exec->get_master()); - } - gather_idxs.resize_and_reset(send_offsets.back()); - comm.all_to_all_v(use_host_buffer ? exec->get_master() : exec, - recv_gather_idxs.get_const_data(), recv_sizes.data(), - recv_offsets.data(), gather_idxs.get_data(), - send_sizes.data(), send_offsets.data()); - if (use_host_buffer) { - gather_idxs.set_executor(exec); - } + row_gatherer = RowGatherer::create( + row_gatherer->get_executor(), + row_gatherer->get_collective_communicator()->create_with_same_type( + row_gatherer->get_communicator(), imap), + imap); } @@ -103,12 +61,8 @@ Matrix::Matrix( ptr_param non_local_matrix_template) : EnableLinOp{exec}, DistributedBase{comm}, - imap_(exec), - send_offsets_(comm.size() + 1), - send_sizes_(comm.size()), - recv_offsets_(comm.size() + 1), - recv_sizes_(comm.size()), - gather_idxs_{exec}, + row_gatherer_{RowGatherer::create(exec, comm)}, + imap_{exec}, one_scalar_{}, local_mtx_{local_matrix_template->clone(exec)}, non_local_mtx_{non_local_matrix_template->clone(exec)} @@ -129,12 +83,8 @@ Matrix::Matrix( std::shared_ptr local_linop) : EnableLinOp{exec}, DistributedBase{comm}, - imap_(exec), - send_offsets_(comm.size() + 1), - send_sizes_(comm.size()), - recv_offsets_(comm.size() + 1), - recv_sizes_(comm.size()), - gather_idxs_{exec}, + row_gatherer_{RowGatherer::create(exec, comm)}, + imap_{exec}, one_scalar_{}, non_local_mtx_(::gko::matrix::Coo::create( exec, dim<2>{local_linop->get_size()[0], 0})) @@ -152,12 +102,8 @@ Matrix::Matrix( std::shared_ptr local_linop, std::shared_ptr non_local_linop) : EnableLinOp{exec}, DistributedBase{comm}, + row_gatherer_(RowGatherer::create(exec, comm)), imap_(std::move(imap)), - send_offsets_(comm.size() + 1), - send_sizes_(comm.size()), - recv_offsets_(comm.size() + 1), - recv_sizes_(comm.size()), - gather_idxs_{exec}, one_scalar_{} { this->set_size({imap_.get_global_size(), imap_.get_global_size()}); @@ -166,9 +112,7 @@ Matrix::Matrix( one_scalar_.init(exec, dim<2>{1, 1}); one_scalar_->fill(one()); - initialize_communication_pattern( - this->get_executor(), this->get_communicator(), imap_, recv_sizes_, - recv_offsets_, send_sizes_, send_offsets_, gather_idxs_); + initialize_communication_pattern(imap_, row_gatherer_); } @@ -274,12 +218,8 @@ void Matrix::convert_to( result->get_communicator().size()); result->local_mtx_->copy_from(this->local_mtx_); result->non_local_mtx_->copy_from(this->non_local_mtx_); + result->row_gatherer_->copy_from(this->row_gatherer_); result->imap_ = this->imap_; - result->gather_idxs_ = this->gather_idxs_; - result->send_offsets_ = this->send_offsets_; - result->recv_offsets_ = this->recv_offsets_; - result->recv_sizes_ = this->recv_sizes_; - result->send_sizes_ = this->send_sizes_; result->set_size(this->get_size()); } @@ -293,12 +233,8 @@ void Matrix::move_to( result->get_communicator().size()); result->local_mtx_->move_from(this->local_mtx_); result->non_local_mtx_->move_from(this->non_local_mtx_); + result->row_gatherer_->move_from(this->row_gatherer_); result->imap_ = std::move(this->imap_); - result->gather_idxs_ = std::move(this->gather_idxs_); - result->send_offsets_ = std::move(this->send_offsets_); - result->recv_offsets_ = std::move(this->recv_offsets_); - result->recv_sizes_ = std::move(this->recv_sizes_); - result->send_sizes_ = 
std::move(this->send_sizes_); result->set_size(this->get_size()); this->set_size({}); } @@ -314,11 +250,7 @@ void Matrix::convert_to( result->get_communicator().size()); result->local_mtx_->copy_from(this->local_mtx_.get()); result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); - result->gather_idxs_ = this->gather_idxs_; - result->send_offsets_ = this->send_offsets_; - result->recv_offsets_ = this->recv_offsets_; - result->recv_sizes_ = this->recv_sizes_; - result->send_sizes_ = this->send_sizes_; + result->row_gatherer_->copy_from(this->row_gatherer_); result->imap_ = this->imap_; result->set_size(this->get_size()); } @@ -333,11 +265,7 @@ void Matrix::move_to( result->get_communicator().size()); result->local_mtx_->move_from(this->local_mtx_.get()); result->non_local_mtx_->move_from(this->non_local_mtx_.get()); - result->gather_idxs_ = std::move(this->gather_idxs_); - result->send_offsets_ = std::move(this->send_offsets_); - result->recv_offsets_ = std::move(this->recv_offsets_); - result->recv_sizes_ = std::move(this->recv_sizes_); - result->send_sizes_ = std::move(this->send_sizes_); + result->row_gatherer_->move_from(this->row_gatherer_); result->imap_ = std::move(this->imap_); result->set_size(this->get_size()); this->set_size({}); @@ -462,9 +390,7 @@ void Matrix::read_distributed( as>(this->non_local_mtx_) ->read(std::move(non_local_data)); - initialize_communication_pattern(exec, comm, imap_, recv_sizes_, - recv_offsets_, send_sizes_, send_offsets_, - gather_idxs_); + initialize_communication_pattern(imap_, row_gatherer_); } @@ -509,55 +435,6 @@ void Matrix::read_distributed( } -template -mpi::request Matrix::communicate( - const local_vector_type* local_b) const -{ - // This function can never return early! - // Even if the non-local part is empty, i.e. this process doesn't need - // any data from other processes, the used MPI calls are collective - // operations. They need to be called on all processes, even if a process - // might not communicate any data. - auto exec = this->get_executor(); - const auto comm = this->get_communicator(); - auto num_cols = local_b->get_size()[1]; - auto send_size = send_offsets_.back(); - auto recv_size = recv_offsets_.back(); - auto send_dim = dim<2>{static_cast(send_size), num_cols}; - auto recv_dim = dim<2>{static_cast(recv_size), num_cols}; - recv_buffer_.init(exec, recv_dim); - send_buffer_.init(exec, send_dim); - - local_b->row_gather(&gather_idxs_, send_buffer_.get()); - - auto use_host_buffer = mpi::requires_host_buffer(exec, comm); - if (use_host_buffer) { - host_recv_buffer_.init(exec->get_master(), recv_dim); - host_send_buffer_.init(exec->get_master(), send_dim); - host_send_buffer_->copy_from(send_buffer_.get()); - } - - mpi::contiguous_type type(num_cols, mpi::type_impl::get_type()); - auto send_ptr = use_host_buffer ? host_send_buffer_->get_const_values() - : send_buffer_->get_const_values(); - auto recv_ptr = use_host_buffer ? host_recv_buffer_->get_values() - : recv_buffer_->get_values(); - exec->synchronize(); -#ifdef GINKGO_HAVE_OPENMPI_PRE_4_1_X - comm.all_to_all_v(use_host_buffer ? exec->get_master() : exec, send_ptr, - send_sizes_.data(), send_offsets_.data(), type.get(), - recv_ptr, recv_sizes_.data(), recv_offsets_.data(), - type.get()); - return {}; -#else - return comm.i_all_to_all_v( - use_host_buffer ? 
exec->get_master() : exec, send_ptr, - send_sizes_.data(), send_offsets_.data(), type.get(), recv_ptr, - recv_sizes_.data(), recv_offsets_.data(), type.get()); -#endif -} - - template void Matrix::apply_impl( const LinOp* b, LinOp* x) const @@ -573,16 +450,22 @@ void Matrix::apply_impl( dense_x->get_local_values()), dense_x->get_local_vector()->get_stride()); + auto exec = this->get_executor(); auto comm = this->get_communicator(); - auto req = this->communicate(dense_b->get_local_vector()); + auto recv_dim = + dim<2>{static_cast( + row_gatherer_->get_collective_communicator() + ->get_recv_size()), + dense_b->get_size()[1]}; + auto recv_exec = mpi::requires_host_buffer(exec, comm) + ? exec->get_master() + : exec; + recv_buffer_.init(recv_exec, recv_dim); + auto req = + this->row_gatherer_->apply_async(dense_b, recv_buffer_.get()); local_mtx_->apply(dense_b->get_local_vector(), local_x); req.wait(); - auto exec = this->get_executor(); - auto use_host_buffer = mpi::requires_host_buffer(exec, comm); - if (use_host_buffer) { - recv_buffer_->copy_from(host_recv_buffer_.get()); - } non_local_mtx_->apply(one_scalar_.get(), recv_buffer_.get(), one_scalar_.get(), local_x); }, @@ -606,17 +489,23 @@ void Matrix::apply_impl( dense_x->get_local_values()), dense_x->get_local_vector()->get_stride()); + auto exec = this->get_executor(); auto comm = this->get_communicator(); - auto req = this->communicate(dense_b->get_local_vector()); + auto recv_dim = + dim<2>{static_cast( + row_gatherer_->get_collective_communicator() + ->get_recv_size()), + dense_b->get_size()[1]}; + auto recv_exec = mpi::requires_host_buffer(exec, comm) + ? exec->get_master() + : exec; + recv_buffer_.init(recv_exec, recv_dim); + auto req = + this->row_gatherer_->apply_async(dense_b, recv_buffer_.get()); local_mtx_->apply(local_alpha, dense_b->get_local_vector(), local_beta, local_x); req.wait(); - auto exec = this->get_executor(); - auto use_host_buffer = mpi::requires_host_buffer(exec, comm); - if (use_host_buffer) { - recv_buffer_->copy_from(host_recv_buffer_.get()); - } non_local_mtx_->apply(local_alpha, recv_buffer_.get(), one_scalar_.get(), local_x); }, @@ -634,33 +523,38 @@ void Matrix::col_scale( auto comm = this->get_communicator(); size_type n_local_cols = local_mtx_->get_size()[1]; size_type n_non_local_cols = non_local_mtx_->get_size()[1]; + std::unique_ptr scaling_factors_single_stride; - auto stride = scaling_factors->get_stride(); - if (stride != 1) { + auto scaling_stride = scaling_factors->get_stride(); + if (scaling_stride != 1) { scaling_factors_single_stride = global_vector_type::create(exec, comm); scaling_factors_single_stride->copy_from(scaling_factors.get()); } - const auto scale_values = - stride == 1 ? scaling_factors->get_const_local_values() - : scaling_factors_single_stride->get_const_local_values(); + const global_vector_type* scaling_factors_ptr = + scaling_stride == 1 ? scaling_factors.get() + : scaling_factors_single_stride.get(); const auto scale_diag = gko::matrix::Diagonal::create_const( exec, n_local_cols, - make_const_array_view(exec, n_local_cols, scale_values)); - - auto req = this->communicate( - stride == 1 ? scaling_factors->get_local_vector() - : scaling_factors_single_stride->get_local_vector()); + make_const_array_view(exec, n_local_cols, + scaling_factors_ptr->get_const_local_values())); + + auto recv_dim = dim<2>{ + static_cast( + row_gatherer_->get_collective_communicator()->get_recv_size()), + scaling_factors->get_size()[1]}; + auto recv_exec = + mpi::requires_host_buffer(exec, comm) ? 
exec->get_master() : exec; + recv_buffer_.init(recv_exec, recv_dim); + + auto req = + row_gatherer_->apply_async(scaling_factors_ptr, recv_buffer_.get()); scale_diag->rapply(local_mtx_, local_mtx_); req.wait(); if (n_non_local_cols > 0) { - auto use_host_buffer = mpi::requires_host_buffer(exec, comm); - if (use_host_buffer) { - recv_buffer_->copy_from(host_recv_buffer_.get()); - } const auto non_local_scale_diag = gko::matrix::Diagonal::create_const( exec, n_non_local_cols, - make_const_array_view(exec, n_non_local_cols, + make_const_array_view(recv_exec, n_non_local_cols, recv_buffer_->get_const_values())); non_local_scale_diag->rapply(non_local_mtx_, non_local_mtx_); } @@ -699,6 +593,8 @@ Matrix::Matrix(const Matrix& other) : EnableLinOp>{other.get_executor()}, DistributedBase{other.get_communicator()}, + row_gatherer_{RowGatherer::create( + other.get_executor(), other.get_communicator())}, imap_(other.get_executor()) { *this = other; @@ -711,6 +607,8 @@ Matrix::Matrix( : EnableLinOp>{other.get_executor()}, DistributedBase{other.get_communicator()}, + row_gatherer_{RowGatherer::create( + other.get_executor(), other.get_communicator())}, imap_(other.get_executor()) { *this = std::move(other); @@ -728,12 +626,8 @@ Matrix::operator=( this->set_size(other.get_size()); local_mtx_->copy_from(other.local_mtx_); non_local_mtx_->copy_from(other.non_local_mtx_); + row_gatherer_->copy_from(other.row_gatherer_); imap_ = other.imap_; - gather_idxs_ = other.gather_idxs_; - send_offsets_ = other.send_offsets_; - recv_offsets_ = other.recv_offsets_; - send_sizes_ = other.send_sizes_; - recv_sizes_ = other.recv_sizes_; one_scalar_.init(this->get_executor(), dim<2>{1, 1}); one_scalar_->fill(one()); } @@ -752,12 +646,8 @@ Matrix::operator=(Matrix&& other) other.set_size({}); local_mtx_->move_from(other.local_mtx_); non_local_mtx_->move_from(other.non_local_mtx_); + row_gatherer_->move_from(other.row_gatherer_); imap_ = std::move(other.imap_); - gather_idxs_ = std::move(other.gather_idxs_); - send_offsets_ = std::move(other.send_offsets_); - recv_offsets_ = std::move(other.recv_offsets_); - send_sizes_ = std::move(other.send_sizes_); - recv_sizes_ = std::move(other.recv_sizes_); one_scalar_.init(this->get_executor(), dim<2>{1, 1}); one_scalar_->fill(one()); } diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 4bb6d1881b8..8d253fc3379 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace gko { @@ -689,32 +690,15 @@ class Matrix std::shared_ptr local_linop, std::shared_ptr non_local_linop); - /** - * Starts a non-blocking communication of the values of b that are shared - * with other processors. - * - * @param local_b The full local vector to be communicated. The subset of - * shared values is automatically extracted. - * @return MPI request for the non-blocking communication. 
- */ - mpi::request communicate(const local_vector_type* local_b) const; - void apply_impl(const LinOp* b, LinOp* x) const override; void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, LinOp* x) const override; private: + std::shared_ptr> row_gatherer_; index_map imap_; - std::vector send_offsets_; - std::vector send_sizes_; - std::vector recv_offsets_; - std::vector recv_sizes_; - array gather_idxs_; gko::detail::DenseCache one_scalar_; - gko::detail::DenseCache host_send_buffer_; - gko::detail::DenseCache host_recv_buffer_; - gko::detail::DenseCache send_buffer_; gko::detail::DenseCache recv_buffer_; std::shared_ptr local_mtx_; std::shared_ptr non_local_mtx_; From a41bc82f5017e8d0afef7f986dafedb9faf73dfc Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 15 Jul 2024 17:07:25 +0200 Subject: [PATCH 06/25] [dist-rg] review update: - only allocate if necessary - synchronize correct executor Co-authored-by: Pratik Nayak --- core/distributed/row_gatherer.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index 6f335441868..4fe09fbab02 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -93,9 +93,13 @@ mpi::request RowGatherer::apply_async( dim<2> send_size(coll_comm_->get_send_size(), b_local->get_size()[1]); + auto send_size_in_bytes = + sizeof(ValueType) * send_size[0] * send_size[1]; workspace.set_executor(mpi_exec); - workspace.resize_and_reset(sizeof(ValueType) * - send_size[0] * send_size[1]); + if (send_size_in_bytes > workspace.get_size()) { + workspace.resize_and_reset(sizeof(ValueType) * + send_size[0] * send_size[1]); + } auto send_buffer = matrix::Dense::create( mpi_exec, send_size, make_array_view( @@ -107,7 +111,7 @@ mpi::request RowGatherer::apply_async( auto recv_ptr = x_local->get_values(); auto send_ptr = send_buffer->get_values(); - mpi_exec->synchronize(); + b_local->get_executor()->synchronize(); mpi::contiguous_type type( b_local->get_size()[1], mpi::type_impl::get_type()); From bbbb253a86f6945b00f3890ec248330137d02099 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 23 Oct 2024 16:17:03 +0200 Subject: [PATCH 07/25] [dist-rg] review updates: - split tests into core and backend part - fix formatting - fix openmpi pre 4.1.x macro Co-authored-by: Pratik Nayak Co-authored-by: Yu-Hsiang M. 
Tsai Signed-off-by: Marcel Koch --- core/distributed/row_gatherer.cpp | 12 +- core/test/mpi/distributed/row_gatherer.cpp | 178 +------------ include/ginkgo/core/base/mpi.hpp | 1 - .../ginkgo/core/distributed/row_gatherer.hpp | 11 +- test/mpi/CMakeLists.txt | 6 +- test/mpi/distributed/CMakeLists.txt | 5 + test/mpi/{ => distributed}/assembly.cpp | 0 test/mpi/{ => distributed}/matrix.cpp | 0 .../{ => distributed}/partition_helpers.cpp | 2 +- test/mpi/distributed/row_gatherer.cpp | 234 ++++++++++++++++++ test/mpi/{ => distributed}/vector.cpp | 0 11 files changed, 253 insertions(+), 196 deletions(-) create mode 100644 test/mpi/distributed/CMakeLists.txt rename test/mpi/{ => distributed}/assembly.cpp (100%) rename test/mpi/{ => distributed}/matrix.cpp (100%) rename test/mpi/{ => distributed}/partition_helpers.cpp (98%) create mode 100644 test/mpi/distributed/row_gatherer.cpp rename test/mpi/{ => distributed}/vector.cpp (100%) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index 4fe09fbab02..3370c3ecc8c 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -17,10 +17,10 @@ namespace experimental { namespace distributed { -#if GINKGO_HAVE_OPENMPI_POST_4_1_X -using DefaultCollComm = mpi::NeighborhoodCommunicator; -#else +#if GINKGO_HAVE_OPENMPI_PRE_4_1_X using DefaultCollComm = mpi::DenseCommunicator; +#else +using DefaultCollComm = mpi::NeighborhoodCommunicator; #endif @@ -85,9 +85,9 @@ mpi::request RowGatherer::apply_async( !use_host_buffer || mpi_exec->memory_accessible( x_local->get_executor()), "The receive buffer uses device memory, but MPI " - "support of device memory is not available. Please " - "provide a host buffer or enable MPI support for " - "device memory."); + "support of device memory is not available or host " + "buffer were explicitly requested. 
Please provide a " + "host buffer or enable MPI support for device memory."); auto b_local = b_global->get_local_vector(); diff --git a/core/test/mpi/distributed/row_gatherer.cpp b/core/test/mpi/distributed/row_gatherer.cpp index 9241e4d20a6..028f9989b24 100644 --- a/core/test/mpi/distributed/row_gatherer.cpp +++ b/core/test/mpi/distributed/row_gatherer.cpp @@ -23,21 +23,6 @@ class RowGatherer : public ::testing::Test { using row_gatherer_type = gko::experimental::distributed::RowGatherer; - RowGatherer() - { - int rank = this->comm.rank(); - auto part = gko::share(part_type::build_from_global_size_uniform( - this->ref, this->comm.size(), this->comm.size() * 3)); - auto recv_connections = - this->template create_recv_connections()[rank]; - auto imap = - map_type{this->ref, part, this->comm.rank(), recv_connections}; - auto coll_comm = - std::make_shared( - this->comm, imap); - rg = row_gatherer_type::create(ref, coll_comm, imap); - } - void SetUp() override { ASSERT_EQ(comm.size(), 6); } template @@ -53,11 +38,11 @@ class RowGatherer : public ::testing::Test { std::shared_ptr ref = gko::ReferenceExecutor::create(); gko::experimental::mpi::communicator comm = MPI_COMM_WORLD; - std::shared_ptr rg; }; TYPED_TEST_SUITE(RowGatherer, gko::test::IndexTypes, TypenameNameGenerator); + TYPED_TEST(RowGatherer, CanDefaultConstruct) { using RowGatherer = typename TestFixture::row_gatherer_type; @@ -103,164 +88,3 @@ TYPED_TEST(RowGatherer, CanConstructFromCollectiveCommAndIndexMap) gko::dim<2> size{recv_connections.get_size(), 18}; GKO_ASSERT_EQUAL_DIMENSIONS(rg, size); } - - -TYPED_TEST(RowGatherer, CanApply) -{ - using Dense = gko::matrix::Dense; - using Vector = gko::experimental::distributed::Vector; - int rank = this->comm.rank(); - auto offset = static_cast(rank * 3); - auto b = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 1}, - gko::initialize({offset, offset + 1, offset + 2}, this->ref)); - auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); - - this->rg->apply(b, x); - - auto expected = this->template create_recv_connections()[rank]; - auto expected_vec = Dense::create( - this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); - GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); -} - - -TYPED_TEST(RowGatherer, CanApplyAsync) -{ - using Dense = gko::matrix::Dense; - using Vector = gko::experimental::distributed::Vector; - int rank = this->comm.rank(); - auto offset = static_cast(rank * 3); - auto b = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 1}, - gko::initialize({offset, offset + 1, offset + 2}, this->ref)); - auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); - - auto req = this->rg->apply_async(b, x); - req.wait(); - - auto expected = this->template create_recv_connections()[rank]; - auto expected_vec = Dense::create( - this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); - GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); -} - - -TYPED_TEST(RowGatherer, CanApplyAsyncConsequetively) -{ - using Dense = gko::matrix::Dense; - using Vector = gko::experimental::distributed::Vector; - int rank = this->comm.rank(); - auto offset = static_cast(rank * 3); - auto b = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 1}, - gko::initialize({offset, offset + 1, offset + 2}, this->ref)); - auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); - - this->rg->apply_async(b, x).wait(); - this->rg->apply_async(b, x).wait(); - - auto expected = this->template create_recv_connections()[rank]; - auto 
expected_vec = Dense::create( - this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); - GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); -} - - -TYPED_TEST(RowGatherer, CanApplyAsyncWithWorkspace) -{ - using Dense = gko::matrix::Dense; - using Vector = gko::experimental::distributed::Vector; - int rank = this->comm.rank(); - auto offset = static_cast(rank * 3); - auto b = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 1}, - gko::initialize({offset, offset + 1, offset + 2}, this->ref)); - auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); - gko::array workspace(this->ref); - - auto req = this->rg->apply_async(b, x, workspace); - req.wait(); - - auto expected = this->template create_recv_connections()[rank]; - auto expected_vec = Dense::create( - this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); - GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); -} - - -TYPED_TEST(RowGatherer, CanApplyAsyncMultipleTimesWithWorkspace) -{ - using Dense = gko::matrix::Dense; - using Vector = gko::experimental::distributed::Vector; - int rank = this->comm.rank(); - auto offset = static_cast(rank * 3); - auto b1 = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 1}, - gko::initialize({offset, offset + 1, offset + 2}, this->ref)); - auto b2 = gko::clone(b1); - b2->scale(gko::initialize({-1}, this->ref)); - auto x1 = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); - auto x2 = gko::clone(x1); - gko::array workspace1(this->ref); - gko::array workspace2(this->ref); - - auto req1 = this->rg->apply_async(b1, x1, workspace1); - auto req2 = this->rg->apply_async(b2, x2, workspace2); - req1.wait(); - req2.wait(); - - auto expected = this->template create_recv_connections()[rank]; - auto expected_vec1 = Dense::create( - this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); - auto expected_vec2 = gko::clone(expected_vec1); - expected_vec2->scale(gko::initialize({-1}, this->ref)); - GKO_ASSERT_MTX_NEAR(x1, expected_vec1, 0.0); - GKO_ASSERT_MTX_NEAR(x2, expected_vec2, 0.0); -} - - -TYPED_TEST(RowGatherer, CanApplyAsyncWithMultipleColumns) -{ - using Dense = gko::matrix::Dense; - using Vector = gko::experimental::distributed::Vector; - int rank = this->comm.rank(); - auto offset = static_cast(rank * 3); - auto b = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 2}, - gko::initialize({{offset, offset * offset}, - {offset + 1, offset * offset + 1}, - {offset + 2, offset * offset + 2}}, - this->ref)); - auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 2}); - - this->rg->apply_async(b, x).wait(); - - gko::array expected[] = { - gko::array{this->ref, {3, 9, 5, 11, 10, 82, 11, 83}}, - gko::array{this->ref, {0, 0, 1, 1, 7, 37, 12, 144, 13, 145}}, - gko::array{this->ref, {3, 9, 4, 10, 17, 227}}, - gko::array{this->ref, {1, 1, 2, 2, 12, 144, 14, 146}}, - gko::array{this->ref, - {4, 10, 5, 11, 9, 81, 10, 82, 15, 225, 16, 226}}, - gko::array{this->ref, {8, 38, 12, 144, 13, 145, 14, 146}}}; - auto expected_vec = - Dense::create(this->ref, gko::dim<2>{expected[rank].get_size() / 2, 2}, - expected[rank], 2); - GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); -} - - -TYPED_TEST(RowGatherer, ThrowsOnAdvancedApply) -{ - using RowGatherer = typename TestFixture::row_gatherer_type; - using Dense = gko::matrix::Dense; - using Vector = gko::experimental::distributed::Vector; - auto rg = RowGatherer::create(this->ref, this->comm); - auto b = Vector::create(this->ref, this->comm); - auto x = Dense::create(this->ref); - auto alpha = 
Dense::create(this->ref, gko::dim<2>{1, 1}); - auto beta = Dense::create(this->ref, gko::dim<2>{1, 1}); - - ASSERT_THROW(rg->apply(alpha, b, beta, x), gko::NotImplemented); -} diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 5c33fca03c1..1215d58b123 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -380,7 +380,6 @@ class request { return status; } - private: MPI_Request req_; }; diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index 844dae40717..c2b99bc91e1 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -32,19 +32,18 @@ namespace distributed { * Example usage: * ```c++ * auto coll_comm = std::make_shared(comm, - * imap); auto rg = distributed::RowGatherer::create(exec, coll_comm, - * imap); + * imap); + * auto rg = distributed::RowGatherer::create(exec, coll_comm, imap); * * auto b = distributed::Vector::create(...); * auto x = matrix::Dense::create(...); * - * auto future = rg->apply_async(b, x); + * auto req = rg->apply_async(b, x); * // do some computation that doesn't modify b, or access x - * future.wait(); + * req.wait(); * // x now contains the gathered rows of b * ``` - * Using the apply instead of the apply_async will lead to a blocking - * communication. + * Using apply instead of apply_async will lead to a blocking communication. * * @note Objects of this class are only available as shared_ptr, since the class * is derived from std::enable_shared_from_this. diff --git a/test/mpi/CMakeLists.txt b/test/mpi/CMakeLists.txt index 346aba200f7..93e6c5c451b 100644 --- a/test/mpi/CMakeLists.txt +++ b/test/mpi/CMakeLists.txt @@ -1,8 +1,4 @@ -ginkgo_create_common_and_reference_test(assembly MPI_SIZE 3 LABELS distributed) -ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3 LABELS distributed) -ginkgo_create_common_and_reference_test(partition_helpers MPI_SIZE 3 LABELS distributed) -ginkgo_create_common_and_reference_test(vector MPI_SIZE 3 LABELS distributed) - +add_subdirectory(distributed) add_subdirectory(preconditioner) add_subdirectory(solver) add_subdirectory(multigrid) diff --git a/test/mpi/distributed/CMakeLists.txt b/test/mpi/distributed/CMakeLists.txt new file mode 100644 index 00000000000..6010b9a7560 --- /dev/null +++ b/test/mpi/distributed/CMakeLists.txt @@ -0,0 +1,5 @@ +ginkgo_create_common_and_reference_test(assembly MPI_SIZE 3 LABELS distributed) +ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3 LABELS distributed) +ginkgo_create_common_and_reference_test(partition_helpers MPI_SIZE 3 LABELS distributed) +ginkgo_create_common_and_reference_test(vector MPI_SIZE 3 LABELS distributed) +ginkgo_create_common_and_reference_test(row_gatherer MPI_SIZE 6 LABELS distributed) diff --git a/test/mpi/assembly.cpp b/test/mpi/distributed/assembly.cpp similarity index 100% rename from test/mpi/assembly.cpp rename to test/mpi/distributed/assembly.cpp diff --git a/test/mpi/matrix.cpp b/test/mpi/distributed/matrix.cpp similarity index 100% rename from test/mpi/matrix.cpp rename to test/mpi/distributed/matrix.cpp diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/distributed/partition_helpers.cpp similarity index 98% rename from test/mpi/partition_helpers.cpp rename to test/mpi/distributed/partition_helpers.cpp index 43b4783d896..9a2baeabca5 100644 --- a/test/mpi/partition_helpers.cpp +++ b/test/mpi/distributed/partition_helpers.cpp @@ -1,4 +1,4 @@ -// 
SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/test/mpi/distributed/row_gatherer.cpp b/test/mpi/distributed/row_gatherer.cpp new file mode 100644 index 00000000000..726f579c7bb --- /dev/null +++ b/test/mpi/distributed/row_gatherer.cpp @@ -0,0 +1,234 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +#include "core/test/utils.hpp" +#include "ginkgo/core/base/exception.hpp" +#include "test/utils/mpi/common_fixture.hpp" + + +#if GINKGO_HAVE_OPENMPI_PRE_4_1_X +using CollCommType = gko::experimental::mpi::DenseCommunicator; +#else +using CollCommType = gko::experimental::mpi::NeighborhoodCommunicator; +#endif + + +template +class RowGatherer : public ::testing::Test { +protected: + using index_type = IndexType; + using part_type = + gko::experimental::distributed::Partition; + using map_type = + gko::experimental::distributed::index_map; + using row_gatherer_type = + gko::experimental::distributed::RowGatherer; + + RowGatherer() + { + int rank = this->comm.rank(); + auto part = gko::share(part_type::build_from_global_size_uniform( + this->ref, this->comm.size(), this->comm.size() * 3)); + auto recv_connections = + this->template create_recv_connections()[rank]; + auto imap = + map_type{this->ref, part, this->comm.rank(), recv_connections}; + auto coll_comm = std::make_shared(this->comm, imap); + rg = row_gatherer_type::create(ref, coll_comm, imap); + } + + void SetUp() override { ASSERT_EQ(comm.size(), 6); } + + template + std::array, 6> create_recv_connections() + { + return {gko::array{ref, {3, 5, 10, 11}}, + gko::array{ref, {0, 1, 7, 12, 13}}, + gko::array{ref, {3, 4, 17}}, + gko::array{ref, {1, 2, 12, 14}}, + gko::array{ref, {4, 5, 9, 10, 15, 16}}, + gko::array{ref, {8, 12, 13, 14}}}; + } + + std::shared_ptr ref = gko::ReferenceExecutor::create(); + gko::experimental::mpi::communicator comm = MPI_COMM_WORLD; + std::shared_ptr rg; +}; + +TYPED_TEST_SUITE(RowGatherer, gko::test::IndexTypes, TypenameNameGenerator); + + +TYPED_TEST(RowGatherer, CanApply) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + + this->rg->apply(b, x); + + auto expected = this->template create_recv_connections()[rank]; + auto expected_vec = Dense::create( + this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); +} + + +TYPED_TEST(RowGatherer, CanApplyAsync) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + + auto req = this->rg->apply_async(b, x); + req.wait(); + + auto expected = this->template create_recv_connections()[rank]; + auto expected_vec = Dense::create( + this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + 
GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); +} + + +TYPED_TEST(RowGatherer, CanApplyAsyncConsequetively) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + + this->rg->apply_async(b, x).wait(); + this->rg->apply_async(b, x).wait(); + + auto expected = this->template create_recv_connections()[rank]; + auto expected_vec = Dense::create( + this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); +} + + +TYPED_TEST(RowGatherer, CanApplyAsyncWithWorkspace) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + gko::array workspace(this->ref); + + auto req = this->rg->apply_async(b, x, workspace); + req.wait(); + + auto expected = this->template create_recv_connections()[rank]; + auto expected_vec = Dense::create( + this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); + ASSERT_GT(workspace.get_size(), 0); +} + + +TYPED_TEST(RowGatherer, CanApplyAsyncMultipleTimesWithWorkspace) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b1 = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + auto b2 = gko::clone(b1); + b2->scale(gko::initialize({-1}, this->ref)); + auto x1 = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + auto x2 = gko::clone(x1); + gko::array workspace1(this->ref); + gko::array workspace2(this->ref); + + auto req1 = this->rg->apply_async(b1, x1, workspace1); + auto req2 = this->rg->apply_async(b2, x2, workspace2); + req1.wait(); + req2.wait(); + + auto expected = this->template create_recv_connections()[rank]; + auto expected_vec1 = Dense::create( + this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + auto expected_vec2 = gko::clone(expected_vec1); + expected_vec2->scale(gko::initialize({-1}, this->ref)); + GKO_ASSERT_MTX_NEAR(x1, expected_vec1, 0.0); + GKO_ASSERT_MTX_NEAR(x2, expected_vec2, 0.0); +} + + +TYPED_TEST(RowGatherer, CanApplyAsyncWithMultipleColumns) +{ + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + int rank = this->comm.rank(); + auto offset = static_cast(rank * 3); + auto b = Vector::create( + this->ref, this->comm, gko::dim<2>{18, 2}, + gko::initialize({{offset, offset * offset}, + {offset + 1, offset * offset + 1}, + {offset + 2, offset * offset + 2}}, + this->ref)); + auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 2}); + + this->rg->apply_async(b, x).wait(); + + gko::array expected[] = { + gko::array{this->ref, {3, 9, 5, 11, 10, 82, 11, 83}}, + gko::array{this->ref, {0, 0, 1, 1, 7, 37, 12, 144, 13, 145}}, + gko::array{this->ref, {3, 9, 4, 10, 17, 227}}, + gko::array{this->ref, {1, 1, 2, 2, 12, 144, 14, 146}}, + 
gko::array{this->ref, + {4, 10, 5, 11, 9, 81, 10, 82, 15, 225, 16, 226}}, + gko::array{this->ref, {8, 38, 12, 144, 13, 145, 14, 146}}}; + auto expected_vec = + Dense::create(this->ref, gko::dim<2>{expected[rank].get_size() / 2, 2}, + expected[rank], 2); + GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); +} + + +TYPED_TEST(RowGatherer, ThrowsOnAdvancedApply) +{ + using RowGatherer = typename TestFixture::row_gatherer_type; + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + auto rg = RowGatherer::create(this->ref, this->comm); + auto b = Vector::create(this->ref, this->comm); + auto x = Dense::create(this->ref); + auto alpha = Dense::create(this->ref, gko::dim<2>{1, 1}); + auto beta = Dense::create(this->ref, gko::dim<2>{1, 1}); + + ASSERT_THROW(rg->apply(alpha, b, beta, x), gko::NotImplemented); +} diff --git a/test/mpi/vector.cpp b/test/mpi/distributed/vector.cpp similarity index 100% rename from test/mpi/vector.cpp rename to test/mpi/distributed/vector.cpp From ce9fd728f2af4feb1238f0905a1ff25d3992463b Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 28 May 2024 17:24:32 +0200 Subject: [PATCH 08/25] [pgm] use row-gatherer from matrix --- core/distributed/row_gatherer.cpp | 7 +++++++ core/multigrid/pgm.cpp | 21 ++++++++----------- .../ginkgo/core/distributed/row_gatherer.hpp | 7 +++++++ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index 3370c3ecc8c..96006beb347 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -165,6 +165,13 @@ RowGatherer::RowGatherer( } +template +const LocalIndexType* RowGatherer::get_const_row_idxs() const +{ + return send_idxs_.get_const_data(); +} + + template RowGatherer::RowGatherer(std::shared_ptr exec, mpi::communicator comm) diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index fddefd3ddf0..a6f79bcc044 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -279,18 +279,15 @@ array Pgm::communicate_non_local_agg( { auto exec = matrix->get_executor(); const auto comm = matrix->get_communicator(); - auto send_sizes = matrix->send_sizes_; - auto recv_sizes = matrix->recv_sizes_; - auto send_offsets = matrix->send_offsets_; - auto recv_offsets = matrix->recv_offsets_; - auto gather_idxs = matrix->gather_idxs_; - auto total_send_size = send_offsets.back(); - auto total_recv_size = recv_offsets.back(); + auto coll_comm = matrix->row_gatherer_->get_collective_communicator(); + auto total_send_size = coll_comm->get_send_size(); + auto total_recv_size = coll_comm->get_recv_size(); + auto row_gatherer = matrix->row_gatherer_; array send_agg(exec, total_send_size); exec->run(pgm::make_gather_index( send_agg.get_size(), local_agg.get_const_data(), - gather_idxs.get_const_data(), send_agg.get_data())); + row_gatherer->get_const_row_idxs(), send_agg.get_data())); // There is no index map on the coarse level yet, so map the local indices // to global indices on the coarse level manually @@ -312,16 +309,16 @@ array Pgm::communicate_non_local_agg( send_global_agg.get_data(), host_send_buffer.get_data()); } - auto type = experimental::mpi::type_impl::get_type(); const auto send_ptr = use_host_buffer ? host_send_buffer.get_const_data() : send_global_agg.get_const_data(); auto recv_ptr = use_host_buffer ? host_recv_buffer.get_data() : non_local_agg.get_data(); exec->synchronize(); - comm.all_to_all_v(use_host_buffer ? 
exec->get_master() : exec, send_ptr, - send_sizes.data(), send_offsets.data(), type, recv_ptr, - recv_sizes.data(), recv_offsets.data(), type); + coll_comm + ->i_all_to_all_v(use_host_buffer ? exec->get_master() : exec, send_ptr, + recv_ptr) + .wait(); if (use_host_buffer) { exec->copy_from(exec->get_master(), total_recv_size, recv_ptr, non_local_agg.get_data()); diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index c2b99bc91e1..2e73f7cd394 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -96,6 +96,13 @@ class RowGatherer final : public EnableLinOp>, std::shared_ptr get_collective_communicator() const; + /** + * Read access to the (local) rows indices + * + * @return the (local) row indices that are gathered + */ + const LocalIndexType* get_const_row_idxs() const; + /** * Creates a distributed::RowGatherer from a given collective communicator * and index map. From bf098450bda7b9d0a1e6202b55a711515ba3420d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 14 Feb 2025 15:36:41 +0100 Subject: [PATCH 09/25] [dist] review updates: - add copy/move tests - undo using MPI_Init_thread - add extra host_recv_buffer_ - create row-gatherer as unique_ptr Co-authored-by: Yu-Hsiang M. Tsai --- core/distributed/matrix.cpp | 56 ++++++------ core/distributed/row_gatherer.cpp | 23 +++-- core/multigrid/pgm.cpp | 2 +- core/test/gtest/ginkgo_mpi_main.cpp | 8 +- core/test/mpi/distributed/row_gatherer.cpp | 87 ++++++++++++++----- include/ginkgo/core/distributed/matrix.hpp | 1 + .../ginkgo/core/distributed/row_gatherer.hpp | 33 +++---- 7 files changed, 127 insertions(+), 83 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 1edc8b33d30..57191427f2e 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -32,19 +32,6 @@ GKO_REGISTER_OPERATION(separate_local_nonlocal, } // namespace matrix -template -void initialize_communication_pattern( - const index_map& imap, - std::shared_ptr>& row_gatherer) -{ - row_gatherer = RowGatherer::create( - row_gatherer->get_executor(), - row_gatherer->get_collective_communicator()->create_with_same_type( - row_gatherer->get_communicator(), imap), - imap); -} - - template Matrix::Matrix( std::shared_ptr exec, mpi::communicator comm) @@ -112,7 +99,11 @@ Matrix::Matrix( one_scalar_.init(exec, dim<2>{1, 1}); one_scalar_->fill(one()); - initialize_communication_pattern(imap_, row_gatherer_); + row_gatherer_ = RowGatherer::create( + row_gatherer_->get_executor(), + row_gatherer_->get_collective_communicator()->create_with_same_type( + row_gatherer_->get_communicator(), imap_), + imap_); } @@ -390,7 +381,11 @@ void Matrix::read_distributed( as>(this->non_local_mtx_) ->read(std::move(non_local_data)); - initialize_communication_pattern(imap_, row_gatherer_); + row_gatherer_ = RowGatherer::create( + row_gatherer_->get_executor(), + row_gatherer_->get_collective_communicator()->create_with_same_type( + row_gatherer_->get_communicator(), imap_), + imap_); } @@ -457,15 +452,18 @@ void Matrix::apply_impl( row_gatherer_->get_collective_communicator() ->get_recv_size()), dense_b->get_size()[1]}; - auto recv_exec = mpi::requires_host_buffer(exec, comm) - ? 
exec->get_master() - : exec; - recv_buffer_.init(recv_exec, recv_dim); - auto req = - this->row_gatherer_->apply_async(dense_b, recv_buffer_.get()); + recv_buffer_.init(exec, recv_dim); + host_recv_buffer_.init(exec->get_master(), recv_dim); + auto recv_ptr = mpi::requires_host_buffer(exec, comm) + ? host_recv_buffer_.get() + : recv_buffer_.get(); + auto req = this->row_gatherer_->apply_async(dense_b, recv_ptr); local_mtx_->apply(dense_b->get_local_vector(), local_x); req.wait(); + if (recv_ptr != recv_buffer_.get()) { + recv_buffer_->copy_from(host_recv_buffer_.get()); + } non_local_mtx_->apply(one_scalar_.get(), recv_buffer_.get(), one_scalar_.get(), local_x); }, @@ -496,16 +494,20 @@ void Matrix::apply_impl( row_gatherer_->get_collective_communicator() ->get_recv_size()), dense_b->get_size()[1]}; - auto recv_exec = mpi::requires_host_buffer(exec, comm) - ? exec->get_master() - : exec; - recv_buffer_.init(recv_exec, recv_dim); - auto req = - this->row_gatherer_->apply_async(dense_b, recv_buffer_.get()); + ; + recv_buffer_.init(exec, recv_dim); + host_recv_buffer_.init(exec->get_master(), recv_dim); + auto recv_ptr = mpi::requires_host_buffer(exec, comm) + ? host_recv_buffer_.get() + : recv_buffer_.get(); + auto req = this->row_gatherer_->apply_async(dense_b, recv_ptr); local_mtx_->apply(local_alpha, dense_b->get_local_vector(), local_beta, local_x); req.wait(); + if (recv_ptr != recv_buffer_.get()) { + recv_buffer_->copy_from(host_recv_buffer_.get()); + } non_local_mtx_->apply(local_alpha, recv_buffer_.get(), one_scalar_.get(), local_x); }, diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index 96006beb347..a9bd41c1dc3 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -15,8 +15,6 @@ namespace gko { namespace experimental { namespace distributed { - - #if GINKGO_HAVE_OPENMPI_PRE_4_1_X using DefaultCollComm = mpi::DenseCommunicator; #else @@ -95,10 +93,12 @@ mpi::request RowGatherer::apply_async( b_local->get_size()[1]); auto send_size_in_bytes = sizeof(ValueType) * send_size[0] * send_size[1]; - workspace.set_executor(mpi_exec); - if (send_size_in_bytes > workspace.get_size()) { - workspace.resize_and_reset(sizeof(ValueType) * - send_size[0] * send_size[1]); + if (!mpi_exec->memory_accessible( + workspace.get_executor()) || + send_size_in_bytes > workspace.get_size()) { + workspace = array( + mpi_exec, + sizeof(ValueType) * send_size[0] * send_size[1]); } auto send_buffer = matrix::Dense::create( mpi_exec, send_size, @@ -166,12 +166,19 @@ RowGatherer::RowGatherer( template -const LocalIndexType* RowGatherer::get_const_row_idxs() const +const LocalIndexType* RowGatherer::get_const_send_idxs() const { return send_idxs_.get_const_data(); } +template +size_type RowGatherer::get_num_send_idxs() const +{ + return send_idxs_.get_size(); +} + + template RowGatherer::RowGatherer(std::shared_ptr exec, mpi::communicator comm) @@ -254,8 +261,6 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_ROW_GATHERER_CONSTRUCTOR); #undef GKO_DECLARE_ROW_GATHERER_CONSTRUCTOR - - } // namespace distributed } // namespace experimental } // namespace gko diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index a6f79bcc044..bbff7f67a64 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -287,7 +287,7 @@ array Pgm::communicate_non_local_agg( array send_agg(exec, total_send_size); exec->run(pgm::make_gather_index( send_agg.get_size(), local_agg.get_const_data(), - row_gatherer->get_const_row_idxs(), 
send_agg.get_data())); + row_gatherer->get_const_send_idxs(), send_agg.get_data())); // There is no index map on the coarse level yet, so map the local indices // to global indices on the coarse level manually diff --git a/core/test/gtest/ginkgo_mpi_main.cpp b/core/test/gtest/ginkgo_mpi_main.cpp index eeaa6578bce..83af86b681f 100644 --- a/core/test/gtest/ginkgo_mpi_main.cpp +++ b/core/test/gtest/ginkgo_mpi_main.cpp @@ -356,13 +356,7 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - int provided_thread_support; - MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, - &provided_thread_support); - if (provided_thread_support != MPI_THREAD_MULTIPLE) { - throw std::runtime_error( - "This test requires an thread compliant MPI implementation."); - } + MPI_Init(&argc, &argv); MPI_Comm comm(MPI_COMM_WORLD); int rank; int size; diff --git a/core/test/mpi/distributed/row_gatherer.cpp b/core/test/mpi/distributed/row_gatherer.cpp index 028f9989b24..8799b4939ce 100644 --- a/core/test/mpi/distributed/row_gatherer.cpp +++ b/core/test/mpi/distributed/row_gatherer.cpp @@ -17,27 +17,34 @@ class RowGatherer : public ::testing::Test { protected: using index_type = IndexType; using part_type = - gko::experimental::distributed::Partition; + gko::experimental::distributed::Partition; using map_type = - gko::experimental::distributed::index_map; + gko::experimental::distributed::index_map; using row_gatherer_type = gko::experimental::distributed::RowGatherer; void SetUp() override { ASSERT_EQ(comm.size(), 6); } - template - std::array, 6> create_recv_connections() + std::array, 6> create_recv_connections() { - return {gko::array{ref, {3, 5, 10, 11}}, - gko::array{ref, {0, 1, 7, 12, 13}}, - gko::array{ref, {3, 4, 17}}, - gko::array{ref, {1, 2, 12, 14}}, - gko::array{ref, {4, 5, 9, 10, 15, 16}}, - gko::array{ref, {8, 12, 13, 14}}}; + return {gko::array{ref, {3, 5, 10, 11}}, + gko::array{ref, {0, 1, 7, 12, 13}}, + gko::array{ref, {3, 4, 17}}, + gko::array{ref, {1, 2, 12, 14}}, + gko::array{ref, {4, 5, 9, 10, 15, 16}}, + gko::array{ref, {8, 12, 13, 14}}}; } std::shared_ptr ref = gko::ReferenceExecutor::create(); gko::experimental::mpi::communicator comm = MPI_COMM_WORLD; + std::shared_ptr part = part_type::build_from_global_size_uniform( + this->ref, this->comm.size(), this->comm.size() * 3); + map_type imap = map_type{ref, part, comm.rank(), + create_recv_connections()[comm.rank()]}; + std::shared_ptr + coll_comm = + std::make_shared( + this->comm, imap); }; TYPED_TEST_SUITE(RowGatherer, gko::test::IndexTypes, TypenameNameGenerator); @@ -53,7 +60,7 @@ TYPED_TEST(RowGatherer, CanDefaultConstruct) } -TYPED_TEST(RowGatherer, CanConstructWithEmptCollectiveCommAndIndexMap) +TYPED_TEST(RowGatherer, CanConstructWithEmptyCollectiveCommAndIndexMap) { using RowGatherer = typename TestFixture::row_gatherer_type; using IndexMap = typename TestFixture::map_type; @@ -71,20 +78,54 @@ TYPED_TEST(RowGatherer, CanConstructWithEmptCollectiveCommAndIndexMap) TYPED_TEST(RowGatherer, CanConstructFromCollectiveCommAndIndexMap) { using RowGatherer = typename TestFixture::row_gatherer_type; - using Part = typename TestFixture::part_type; - using IndexMap = typename TestFixture::map_type; - int rank = this->comm.rank(); - auto part = gko::share(Part::build_from_global_size_uniform( - this->ref, this->comm.size(), this->comm.size() * 3)); - auto recv_connections = - this->template create_recv_connections()[rank]; - auto imap = IndexMap{this->ref, part, this->comm.rank(), recv_connections}; - auto coll_comm = - 
std::make_shared( - this->comm, imap); - auto rg = RowGatherer::create(this->ref, coll_comm, imap); + auto rg = RowGatherer::create(this->ref, this->coll_comm, this->imap); + int rank = this->comm.rank(); + auto recv_connections = this->create_recv_connections()[rank]; gko::dim<2> size{recv_connections.get_size(), 18}; GKO_ASSERT_EQUAL_DIMENSIONS(rg, size); } + + +TYPED_TEST(RowGatherer, CanCopy) +{ + using RowGatherer = typename TestFixture::row_gatherer_type; + auto rg = RowGatherer::create(this->ref, this->coll_comm, this->imap); + + auto copy = gko::clone(rg); + + GKO_ASSERT_EQUAL_DIMENSIONS(rg, copy); + auto copy_coll_comm = std::dynamic_pointer_cast< + const gko::experimental::mpi::NeighborhoodCommunicator>( + copy->get_collective_communicator()); + ASSERT_EQ(*this->coll_comm, *copy_coll_comm); + auto send_idxs = gko::make_const_array_view( + rg->get_executor(), rg->get_num_send_idxs(), rg->get_const_send_idxs()); + auto copy_send_idxs = gko::make_const_array_view( + copy->get_executor(), copy->get_num_send_idxs(), + copy->get_const_send_idxs()); + GKO_ASSERT_ARRAY_EQ(send_idxs, copy_send_idxs); +} + + +TYPED_TEST(RowGatherer, CanMove) +{ + using RowGatherer = typename TestFixture::row_gatherer_type; + auto rg = RowGatherer::create(this->ref, this->coll_comm, this->imap); + auto orig_send_idxs = rg->get_const_send_idxs(); + auto orig_coll_comm = rg->get_collective_communicator(); + auto copy = gko::clone(rg); + + auto move = RowGatherer::create(this->ref, this->comm); + move->move_from(rg); + + GKO_ASSERT_EQUAL_DIMENSIONS(move, copy); + GKO_ASSERT_EQUAL_DIMENSIONS(rg, gko::dim<2>()); + ASSERT_EQ(orig_send_idxs, move->get_const_send_idxs()); + ASSERT_EQ(orig_coll_comm, move->get_collective_communicator()); + ASSERT_EQ(copy->get_num_send_idxs(), move->get_num_send_idxs()); + ASSERT_EQ(rg->get_const_send_idxs(), nullptr); + ASSERT_EQ(rg->get_num_send_idxs(), 0); + ASSERT_NE(rg->get_collective_communicator(), nullptr); +} diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 8d253fc3379..577d73c7463 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -700,6 +700,7 @@ class Matrix index_map imap_; gko::detail::DenseCache one_scalar_; gko::detail::DenseCache recv_buffer_; + gko::detail::DenseCache host_recv_buffer_; std::shared_ptr local_mtx_; std::shared_ptr non_local_mtx_; }; diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index 2e73f7cd394..97e8672fd54 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -45,8 +45,8 @@ namespace distributed { * ``` * Using apply instead of apply_async will lead to a blocking communication. * - * @note Objects of this class are only available as shared_ptr, since the class - * is derived from std::enable_shared_from_this. + * @note The apply and apply_async function will *not* ensure that the in/output + * vectors use the same executor as the this object. * * @tparam LocalIndexType the index type for the stored indices */ @@ -64,6 +64,7 @@ class RowGatherer final : public EnableLinOp>, * * @param b the input distributed::Vector * @param x the output matrix::Dense with the rows gathered from b + * * @return a mpi::request for this task. The task is guaranteed to * be completed only after `.wait()` has been called on it. 
*/ @@ -82,6 +83,7 @@ class RowGatherer final : public EnableLinOp>, * @param workspace a workspace to store temporary data for the operation. * This might not be modified before the request is * waited on. + * * @return a mpi::request for this task. The task is guaranteed to * be completed only after `.wait()` has been called on it. */ @@ -90,18 +92,19 @@ class RowGatherer final : public EnableLinOp>, /** * Get the used collective communicator. - * - * @return the used collective communicator */ std::shared_ptr get_collective_communicator() const; /** * Read access to the (local) rows indices - * - * @return the (local) row indices that are gathered */ - const LocalIndexType* get_const_row_idxs() const; + const LocalIndexType* get_const_send_idxs() const; + + /** + * Returns the number of (local) row indices. + */ + size_type get_num_send_idxs() const; /** * Creates a distributed::RowGatherer from a given collective communicator * and index map. @@ -124,22 +127,22 @@ class RowGatherer final : public EnableLinOp>, template = sizeof(LocalIndexType)>> - static std::shared_ptr create( + static std::unique_ptr create( std::shared_ptr exec, std::shared_ptr coll_comm, const index_map& imap) { - return std::shared_ptr( + return std::unique_ptr( new RowGatherer(std::move(exec), std::move(coll_comm), imap)); } /* * Create method for an empty RowGatherer. */ - static std::shared_ptr create( + static std::unique_ptr create( std::shared_ptr exec, mpi::communicator comm) { - return std::shared_ptr( + return std::unique_ptr( new RowGatherer(std::move(exec), std::move(comm))); } @@ -175,15 +178,13 @@ class RowGatherer final : public EnableLinOp>, RowGatherer(std::shared_ptr exec, mpi::communicator comm); std::shared_ptr coll_comm_; - array send_idxs_; - mutable array send_workspace_; - + // This object might not hold an actual MPI request, so we can't use the + // always owning mpi::request. Its destructor would otherwise make the + // program crash. 
mutable MPI_Request req_listener_{MPI_REQUEST_NULL}; }; - - } // namespace distributed } // namespace experimental } // namespace gko From e000040bd7c5d0ad237c6258752b65fa2ffe4762 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Feb 2025 13:54:22 +0100 Subject: [PATCH 10/25] [core] allow empty diagonal mtx created from array --- core/matrix/diagonal.cpp | 2 +- core/test/matrix/diagonal.cpp | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 2bf11f77128..cf798cae4c6 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -376,7 +376,7 @@ Diagonal::Diagonal(std::shared_ptr exec, : EnableLinOp(exec, dim<2>(size)), values_{exec, std::move(values)} { - GKO_ENSURE_IN_BOUNDS(size - 1, values_.get_size()); + GKO_ENSURE_COMPATIBLE_BOUNDS(size, values_.get_size()); } diff --git a/core/test/matrix/diagonal.cpp b/core/test/matrix/diagonal.cpp index de03a9350bb..a7b26980584 100644 --- a/core/test/matrix/diagonal.cpp +++ b/core/test/matrix/diagonal.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -71,6 +71,16 @@ TYPED_TEST(Diagonal, CanBeEmpty) } +TYPED_TEST(Diagonal, CanBeEmptyFromArray) +{ + using Diag = typename TestFixture::Diag; + using value_type = typename Diag::value_type; + auto diag = Diag::create(this->exec, 0, gko::array{this->exec}); + + this->assert_empty(diag.get()); +} + + TYPED_TEST(Diagonal, CanBeCreatedFromExistingData) { using Diag = typename TestFixture::Diag; From b3e8d5b96c49913ec0ee84ee187d0e8c7dd9f123 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Feb 2025 14:49:45 +0100 Subject: [PATCH 11/25] [dist] allocate recv buffers only once --- core/distributed/matrix.cpp | 17 ++++--- test/mpi/distributed/matrix.cpp | 90 ++++++++++++++++++++++++--------- 2 files changed, 76 insertions(+), 31 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 57191427f2e..d71128f7a6d 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -494,7 +494,6 @@ void Matrix::apply_impl( row_gatherer_->get_collective_communicator() ->get_recv_size()), dense_b->get_size()[1]}; - ; recv_buffer_.init(exec, recv_dim); host_recv_buffer_.init(exec->get_master(), recv_dim); auto recv_ptr = mpi::requires_host_buffer(exec, comm) @@ -544,19 +543,23 @@ void Matrix::col_scale( static_cast( row_gatherer_->get_collective_communicator()->get_recv_size()), scaling_factors->get_size()[1]}; - auto recv_exec = - mpi::requires_host_buffer(exec, comm) ? exec->get_master() : exec; - recv_buffer_.init(recv_exec, recv_dim); + recv_buffer_.init(exec, recv_dim); + host_recv_buffer_.init(exec->get_master(), recv_dim); + auto recv_ptr = mpi::requires_host_buffer(exec, comm) + ? 
host_recv_buffer_.get() + : recv_buffer_.get(); - auto req = - row_gatherer_->apply_async(scaling_factors_ptr, recv_buffer_.get()); + auto req = row_gatherer_->apply_async(scaling_factors_ptr, recv_ptr); scale_diag->rapply(local_mtx_, local_mtx_); req.wait(); if (n_non_local_cols > 0) { + if (recv_ptr != recv_buffer_.get()) { + recv_buffer_->copy_from(host_recv_buffer_.get()); + } const auto non_local_scale_diag = gko::matrix::Diagonal::create_const( exec, n_non_local_cols, - make_const_array_view(recv_exec, n_non_local_cols, + make_const_array_view(exec, n_non_local_cols, recv_buffer_->get_const_values())); non_local_scale_diag->rapply(non_local_mtx_, non_local_mtx_); } diff --git a/test/mpi/distributed/matrix.cpp b/test/mpi/distributed/matrix.cpp index d1dfa362ad8..0285ebb6680 100644 --- a/test/mpi/distributed/matrix.cpp +++ b/test/mpi/distributed/matrix.cpp @@ -860,6 +860,8 @@ bool needs_transfers(std::shared_ptr exec) class HostToDeviceLogger : public gko::log::Logger { public: + mutable int transfer_count = 0; + void on_copy_started(const gko::Executor* exec_from, const gko::Executor* exec_to, const gko::uintptr& loc_from, @@ -867,28 +869,29 @@ class HostToDeviceLogger : public gko::log::Logger { const gko::size_type& num_bytes) const override { if (exec_from != exec_to) { - transfer_count_++; + transfer_count++; } } +}; - int get_transfer_count() const { return transfer_count_; } - static std::unique_ptr create() - { - return std::unique_ptr(new HostToDeviceLogger()); - } +class AllocationLogger : public gko::log::Logger { +public: + mutable int count = 0; + mutable std::set execs; protected: - explicit HostToDeviceLogger() - : gko::log::Logger(gko::log::Logger::copy_started_mask) - {} - -private: - mutable int transfer_count_ = 0; + void on_allocation_completed(const gko::Executor* exec, + const gko::size_type& num_bytes, + const gko::uintptr& location) const override + { + ++count; + execs.insert(exec); + } }; -class MatrixGpuAwareCheck : public CommonMpiTestFixture { +class MatrixInternalBuffers : public CommonMpiTestFixture { public: using local_index_type = gko::int32; using global_index_type = gko::int64; @@ -898,14 +901,16 @@ class MatrixGpuAwareCheck : public CommonMpiTestFixture { using dist_vec_type = gko::experimental::distributed::Vector; using dense_vec_type = gko::matrix::Dense; - MatrixGpuAwareCheck() - : logger(gko::share(HostToDeviceLogger::create())), engine(42) + MatrixInternalBuffers() { - exec->add_logger(logger); + exec->add_logger(copy_logger); + exec->add_logger(alloc_logger); mat = dist_mtx_type::create(exec, comm); x = dist_vec_type::create(exec, comm); y = dist_vec_type::create(exec, comm); + factor = dist_vec_type::create(exec, comm, gko::dim<2>{0, 1}, + gko::dim<2>{0, 1}); alpha = dense_vec_type::create(exec, gko::dim<2>{1, 1}); beta = dense_vec_type::create(exec, gko::dim<2>{1, 1}); @@ -916,33 +921,70 @@ class MatrixGpuAwareCheck : public CommonMpiTestFixture { std::unique_ptr x; std::unique_ptr y; + std::unique_ptr factor; std::unique_ptr alpha; std::unique_ptr beta; - std::shared_ptr logger; + std::shared_ptr copy_logger = + std::make_shared(); + std::shared_ptr alloc_logger = + std::make_shared(); - std::default_random_engine engine; + std::default_random_engine engine{42}; }; -TEST_F(MatrixGpuAwareCheck, ApplyCopiesToHostOnlyIfNecessary) +TEST_F(MatrixInternalBuffers, ApplyCopiesToHostOnlyIfNecessary) { - auto transfer_count_before = logger->get_transfer_count(); + auto transfer_count_before = copy_logger->transfer_count; mat->apply(x, y); - 
ASSERT_EQ(logger->get_transfer_count() > transfer_count_before, + ASSERT_EQ(copy_logger->transfer_count > transfer_count_before, needs_transfers(exec)); } -TEST_F(MatrixGpuAwareCheck, AdvancedApplyCopiesToHostOnlyIfNecessary) +TEST_F(MatrixInternalBuffers, AdvancedApplyCopiesToHostOnlyIfNecessary) { - auto transfer_count_before = logger->get_transfer_count(); + auto transfer_count_before = copy_logger->transfer_count; mat->apply(alpha, x, beta, y); - ASSERT_EQ(logger->get_transfer_count() > transfer_count_before, + ASSERT_EQ(copy_logger->transfer_count > transfer_count_before, needs_transfers(exec)); } + + +TEST_F(MatrixInternalBuffers, ApplyAllocatesBuffersOnlyOnce) +{ + mat->apply(x, y); + + auto alloc_count_before = alloc_logger->count; + mat->apply(x, y); + + ASSERT_EQ(alloc_logger->count, alloc_count_before); +} + + +TEST_F(MatrixInternalBuffers, AdvancedApplyAllocatesBuffersOnlyOnce) +{ + mat->apply(alpha, x, beta, y); + + auto alloc_count_before = alloc_logger->count; + mat->apply(alpha, x, beta, y); + + ASSERT_EQ(alloc_logger->count, alloc_count_before); +} + + +TEST_F(MatrixInternalBuffers, ColScaleAllocatesBuffersOnlyOnce) +{ + mat->col_scale(factor); + + auto alloc_count_before = alloc_logger->count; + mat->col_scale(factor); + + ASSERT_EQ(alloc_logger->count, alloc_count_before); +} From 86676b34fd4f1d0b589a03a36d7c98a3cf33810d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Feb 2025 15:36:25 +0100 Subject: [PATCH 12/25] [dist-rg] don't implement apply The `LinOp::apply` function creates temporary clones to match the operator's executor, but this will lead to wrong behavior if MPI doesn't support GPU buffers. --- core/distributed/row_gatherer.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index a9bd41c1dc3..20c2bb72479 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -24,9 +24,7 @@ using DefaultCollComm = mpi::NeighborhoodCommunicator; template void RowGatherer::apply_impl(const LinOp* b, LinOp* x) const -{ - apply_async(b, x).wait(); -} + GKO_NOT_IMPLEMENTED; template From 895a0314d76e6cc0c82078b2fb6f85512d51df2c Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Feb 2025 15:36:43 +0100 Subject: [PATCH 13/25] [dist-rg] use device exec in tests --- core/distributed/row_gatherer.cpp | 3 +- test/mpi/distributed/row_gatherer.cpp | 166 ++++++++++++++------------ 2 files changed, 93 insertions(+), 76 deletions(-) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index 20c2bb72479..94a174a9777 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -91,7 +91,8 @@ mpi::request RowGatherer::apply_async( b_local->get_size()[1]); auto send_size_in_bytes = sizeof(ValueType) * send_size[0] * send_size[1]; - if (!mpi_exec->memory_accessible( + if (!workspace.get_executor() || + !mpi_exec->memory_accessible( workspace.get_executor()) || send_size_in_bytes > workspace.get_size()) { workspace = array( mpi_exec, sizeof(ValueType) * send_size[0] * send_size[1]); } auto send_buffer = matrix::Dense::create( mpi_exec, send_size, diff --git a/test/mpi/distributed/row_gatherer.cpp b/test/mpi/distributed/row_gatherer.cpp index 726f579c7bb..e77d2bb0272 100644 --- a/test/mpi/distributed/row_gatherer.cpp +++ b/test/mpi/distributed/row_gatherer.cpp @@ -28,7 +28,7 @@ using CollCommType = gko::experimental::mpi::NeighborhoodCommunicator; template -class RowGatherer : public ::testing::Test { +class RowGatherer : public CommonMpiTestFixture { protected: using index_type = IndexType; using part_type = gko::experimental::distributed::Partition; @@ -40,15 
+40,13 @@ class RowGatherer : public ::testing::Test { RowGatherer() { - int rank = this->comm.rank(); + int rank = comm.rank(); auto part = gko::share(part_type::build_from_global_size_uniform( - this->ref, this->comm.size(), this->comm.size() * 3)); - auto recv_connections = - this->template create_recv_connections()[rank]; - auto imap = - map_type{this->ref, part, this->comm.rank(), recv_connections}; - auto coll_comm = std::make_shared(this->comm, imap); - rg = row_gatherer_type::create(ref, coll_comm, imap); + exec, comm.size(), comm.size() * 3)); + auto recv_connections = create_recv_connections()[rank]; + auto imap = map_type{exec, part, comm.rank(), recv_connections}; + auto coll_comm = std::make_shared(comm, imap); + rg = row_gatherer_type::create(exec, coll_comm, imap); } void SetUp() override { ASSERT_EQ(comm.size(), 6); } @@ -56,42 +54,24 @@ class RowGatherer : public ::testing::Test { template std::array, 6> create_recv_connections() { - return {gko::array{ref, {3, 5, 10, 11}}, - gko::array{ref, {0, 1, 7, 12, 13}}, - gko::array{ref, {3, 4, 17}}, - gko::array{ref, {1, 2, 12, 14}}, - gko::array{ref, {4, 5, 9, 10, 15, 16}}, - gko::array{ref, {8, 12, 13, 14}}}; + return {gko::array{exec, {3, 5, 10, 11}}, + gko::array{exec, {0, 1, 7, 12, 13}}, + gko::array{exec, {3, 4, 17}}, + gko::array{exec, {1, 2, 12, 14}}, + gko::array{exec, {4, 5, 9, 10, 15, 16}}, + gko::array{exec, {8, 12, 13, 14}}}; } - std::shared_ptr ref = gko::ReferenceExecutor::create(); - gko::experimental::mpi::communicator comm = MPI_COMM_WORLD; + std::shared_ptr host_exec = exec->get_master(); + std::shared_ptr mpi_exec = + gko::experimental::mpi::requires_host_buffer(exec, comm) ? host_exec + : exec; std::shared_ptr rg; }; TYPED_TEST_SUITE(RowGatherer, gko::test::IndexTypes, TypenameNameGenerator); -TYPED_TEST(RowGatherer, CanApply) -{ - using Dense = gko::matrix::Dense; - using Vector = gko::experimental::distributed::Vector; - int rank = this->comm.rank(); - auto offset = static_cast(rank * 3); - auto b = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 1}, - gko::initialize({offset, offset + 1, offset + 2}, this->ref)); - auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); - - this->rg->apply(b, x); - - auto expected = this->template create_recv_connections()[rank]; - auto expected_vec = Dense::create( - this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); - GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); -} - - TYPED_TEST(RowGatherer, CanApplyAsync) { using Dense = gko::matrix::Dense; @@ -99,16 +79,17 @@ TYPED_TEST(RowGatherer, CanApplyAsync) int rank = this->comm.rank(); auto offset = static_cast(rank * 3); auto b = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 1}, - gko::initialize({offset, offset + 1, offset + 2}, this->ref)); - auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + this->exec, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->exec)); + auto x = + Dense::create(this->mpi_exec, gko::dim<2>{this->rg->get_size()[0], 1}); auto req = this->rg->apply_async(b, x); req.wait(); auto expected = this->template create_recv_connections()[rank]; auto expected_vec = Dense::create( - this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, expected, 1); GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); } @@ -120,16 +101,17 @@ TYPED_TEST(RowGatherer, CanApplyAsyncConsequetively) int rank = this->comm.rank(); auto offset = static_cast(rank * 3); auto 
b = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 1}, - gko::initialize({offset, offset + 1, offset + 2}, this->ref)); - auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + this->exec, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->exec)); + auto x = + Dense::create(this->mpi_exec, gko::dim<2>{this->rg->get_size()[0], 1}); this->rg->apply_async(b, x).wait(); this->rg->apply_async(b, x).wait(); auto expected = this->template create_recv_connections()[rank]; auto expected_vec = Dense::create( - this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, expected, 1); GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); } @@ -141,17 +123,18 @@ TYPED_TEST(RowGatherer, CanApplyAsyncWithWorkspace) int rank = this->comm.rank(); auto offset = static_cast(rank * 3); auto b = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 1}, - gko::initialize({offset, offset + 1, offset + 2}, this->ref)); - auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); - gko::array workspace(this->ref); + this->exec, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->exec)); + auto x = + Dense::create(this->mpi_exec, gko::dim<2>{this->rg->get_size()[0], 1}); + gko::array workspace; auto req = this->rg->apply_async(b, x, workspace); req.wait(); auto expected = this->template create_recv_connections()[rank]; auto expected_vec = Dense::create( - this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, expected, 1); GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); ASSERT_GT(workspace.get_size(), 0); } @@ -164,14 +147,15 @@ TYPED_TEST(RowGatherer, CanApplyAsyncMultipleTimesWithWorkspace) int rank = this->comm.rank(); auto offset = static_cast(rank * 3); auto b1 = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 1}, - gko::initialize({offset, offset + 1, offset + 2}, this->ref)); + this->exec, this->comm, gko::dim<2>{18, 1}, + gko::initialize({offset, offset + 1, offset + 2}, this->exec)); auto b2 = gko::clone(b1); - b2->scale(gko::initialize({-1}, this->ref)); - auto x1 = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 1}); + b2->scale(gko::initialize({-1}, this->exec)); + auto x1 = + Dense::create(this->mpi_exec, gko::dim<2>{this->rg->get_size()[0], 1}); auto x2 = gko::clone(x1); - gko::array workspace1(this->ref); - gko::array workspace2(this->ref); + gko::array workspace1; + gko::array workspace2; auto req1 = this->rg->apply_async(b1, x1, workspace1); auto req2 = this->rg->apply_async(b2, x2, workspace2); @@ -180,9 +164,9 @@ TYPED_TEST(RowGatherer, CanApplyAsyncMultipleTimesWithWorkspace) auto expected = this->template create_recv_connections()[rank]; auto expected_vec1 = Dense::create( - this->ref, gko::dim<2>{expected.get_size(), 1}, expected, 1); + this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, expected, 1); auto expected_vec2 = gko::clone(expected_vec1); - expected_vec2->scale(gko::initialize({-1}, this->ref)); + expected_vec2->scale(gko::initialize({-1}, this->exec)); GKO_ASSERT_MTX_NEAR(x1, expected_vec1, 0.0); GKO_ASSERT_MTX_NEAR(x2, expected_vec2, 0.0); } @@ -195,40 +179,72 @@ TYPED_TEST(RowGatherer, CanApplyAsyncWithMultipleColumns) int rank = this->comm.rank(); auto offset = static_cast(rank * 3); auto b = Vector::create( - this->ref, this->comm, gko::dim<2>{18, 2}, + this->exec, this->comm, gko::dim<2>{18, 2}, gko::initialize({{offset, 
offset * offset}, {offset + 1, offset * offset + 1}, {offset + 2, offset * offset + 2}}, - this->ref)); - auto x = Dense::create(this->ref, gko::dim<2>{this->rg->get_size()[0], 2}); + this->exec)); + auto x = + Dense::create(this->mpi_exec, gko::dim<2>{this->rg->get_size()[0], 2}); this->rg->apply_async(b, x).wait(); gko::array expected[] = { - gko::array{this->ref, {3, 9, 5, 11, 10, 82, 11, 83}}, - gko::array{this->ref, {0, 0, 1, 1, 7, 37, 12, 144, 13, 145}}, - gko::array{this->ref, {3, 9, 4, 10, 17, 227}}, - gko::array{this->ref, {1, 1, 2, 2, 12, 144, 14, 146}}, - gko::array{this->ref, + gko::array{this->mpi_exec, {3, 9, 5, 11, 10, 82, 11, 83}}, + gko::array{this->mpi_exec, + {0, 0, 1, 1, 7, 37, 12, 144, 13, 145}}, + gko::array{this->mpi_exec, {3, 9, 4, 10, 17, 227}}, + gko::array{this->mpi_exec, {1, 1, 2, 2, 12, 144, 14, 146}}, + gko::array{this->mpi_exec, {4, 10, 5, 11, 9, 81, 10, 82, 15, 225, 16, 226}}, - gko::array{this->ref, {8, 38, 12, 144, 13, 145, 14, 146}}}; - auto expected_vec = - Dense::create(this->ref, gko::dim<2>{expected[rank].get_size() / 2, 2}, - expected[rank], 2); + gko::array{this->mpi_exec, {8, 38, 12, 144, 13, 145, 14, 146}}}; + auto expected_vec = Dense::create( + this->mpi_exec, gko::dim<2>{expected[rank].get_size() / 2, 2}, + expected[rank], 2); GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); } +TYPED_TEST(RowGatherer, ThrowsOnApply) +{ + using RowGatherer = typename TestFixture::row_gatherer_type; + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + auto rg = RowGatherer::create(this->exec, this->comm); + auto b = Vector::create(this->exec, this->comm); + auto x = Dense::create(this->exec); + + ASSERT_THROW(rg->apply(b, x), gko::NotImplemented); +} + + TYPED_TEST(RowGatherer, ThrowsOnAdvancedApply) { using RowGatherer = typename TestFixture::row_gatherer_type; using Dense = gko::matrix::Dense; using Vector = gko::experimental::distributed::Vector; - auto rg = RowGatherer::create(this->ref, this->comm); - auto b = Vector::create(this->ref, this->comm); - auto x = Dense::create(this->ref); - auto alpha = Dense::create(this->ref, gko::dim<2>{1, 1}); - auto beta = Dense::create(this->ref, gko::dim<2>{1, 1}); + auto rg = RowGatherer::create(this->exec, this->comm); + auto b = Vector::create(this->exec, this->comm); + auto x = Dense::create(this->exec); + auto alpha = Dense::create(this->exec, gko::dim<2>{1, 1}); + auto beta = Dense::create(this->exec, gko::dim<2>{1, 1}); ASSERT_THROW(rg->apply(alpha, b, beta, x), gko::NotImplemented); } + + +TYPED_TEST(RowGatherer, ThrowsOnNonMatchingExecutor) +{ + if (this->mpi_exec == this->exec) { + GTEST_SKIP(); + } + + using RowGatherer = typename TestFixture::row_gatherer_type; + using Dense = gko::matrix::Dense; + using Vector = gko::experimental::distributed::Vector; + auto rg = RowGatherer::create(this->exec, this->comm); + auto b = Vector::create(this->exec, this->comm); + auto x = Dense::create(this->exec); + + ASSERT_THROW(rg->apply_async(b, x).wait(), gko::InvalidStateError); +} From 36f4f94d661578f500afe480ee5d7b9e956354bf Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Feb 2025 15:57:37 +0100 Subject: [PATCH 14/25] [dist-rg] remove row gatherer from linop hierarchy right now the RG doesn't support (blocking) apply, so it doesn't make much sense to keep it as a LinOp --- core/distributed/row_gatherer.cpp | 61 ++++++++----------- .../ginkgo/core/distributed/row_gatherer.hpp | 20 +++--- test/mpi/distributed/row_gatherer.cpp | 28 --------- 3 files changed, 38 insertions(+), 71 
deletions(-) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index 94a174a9777..c182e5cee6e 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -22,17 +22,6 @@ using DefaultCollComm = mpi::NeighborhoodCommunicator; #endif -template -void RowGatherer::apply_impl(const LinOp* b, LinOp* x) const - GKO_NOT_IMPLEMENTED; - - -template -void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* b, - const LinOp* beta, LinOp* x) const - GKO_NOT_IMPLEMENTED; - - template mpi::request RowGatherer::apply_async(ptr_param b, ptr_param x) const @@ -62,29 +51,27 @@ mpi::request RowGatherer::apply_async( { mpi::request req; + auto exec = this->get_executor(); + auto use_host_buffer = + mpi::requires_host_buffer(exec, coll_comm_->get_base_communicator()); + auto mpi_exec = use_host_buffer ? exec->get_master() : exec; + + GKO_THROW_IF_INVALID( + !use_host_buffer || mpi_exec->memory_accessible(x->get_executor()), + "The receive buffer uses device memory, but MPI support of device " + "memory is not available or host buffer were explicitly requested. " + "Please provide a host buffer or enable MPI support for device " + "memory."); + // dispatch global vector run, std::complex>( - b.get(), [&](const auto* b_global) { + make_temporary_clone(exec, b).get(), [&](const auto* b_global) { using ValueType = typename std::decay_t::value_type; // dispatch local vector with the same precision as the global // vector ::gko::precision_dispatch( [&](auto* x_local) { - auto exec = this->get_executor(); - - auto use_host_buffer = mpi::requires_host_buffer( - exec, coll_comm_->get_base_communicator()); - auto mpi_exec = use_host_buffer ? exec->get_master() : exec; - - GKO_THROW_IF_INVALID( - !use_host_buffer || mpi_exec->memory_accessible( - x_local->get_executor()), - "The receive buffer uses device memory, but MPI " - "support of device memory is not available or host " - "buffer were explicitly requested. 
Please provide a " - "host buffer or enable MPI support for device memory."); - auto b_local = b_global->get_local_vector(); dim<2> send_size(coll_comm_->get_send_size(), @@ -123,6 +110,13 @@ mpi::request RowGatherer::apply_async( } +template +dim<2> RowGatherer::get_size() const +{ + return size_; +} + + template std::shared_ptr RowGatherer::get_collective_communicator() const @@ -137,9 +131,9 @@ RowGatherer::RowGatherer( std::shared_ptr exec, std::shared_ptr coll_comm, const index_map& imap) - : EnableLinOp( - exec, dim<2>{imap.get_non_local_size(), imap.get_global_size()}), + : EnablePolymorphicObject(exec), DistributedBase(coll_comm->get_base_communicator()), + size_(dim<2>{imap.get_non_local_size(), imap.get_global_size()}), coll_comm_(std::move(coll_comm)), send_idxs_(exec), send_workspace_(exec), @@ -181,7 +175,7 @@ size_type RowGatherer::get_num_send_idxs() const template RowGatherer::RowGatherer(std::shared_ptr exec, mpi::communicator comm) - : EnableLinOp(exec), + : EnablePolymorphicObject(exec), DistributedBase(comm), coll_comm_(std::make_shared(comm)), send_idxs_(exec), @@ -192,7 +186,7 @@ RowGatherer::RowGatherer(std::shared_ptr exec, template RowGatherer::RowGatherer(RowGatherer&& o) noexcept - : EnableLinOp(o.get_executor()), + : EnablePolymorphicObject(o.get_executor()), DistributedBase(o.get_communicator()), send_idxs_(o.get_executor()), send_workspace_(o.get_executor()), @@ -207,7 +201,7 @@ RowGatherer& RowGatherer::operator=( const RowGatherer& o) { if (this != &o) { - this->set_size(o.get_size()); + size_ = o.get_size(); coll_comm_ = o.coll_comm_; send_idxs_ = o.send_idxs_; } @@ -220,8 +214,7 @@ RowGatherer& RowGatherer::operator=( RowGatherer&& o) { if (this != &o) { - this->set_size(o.get_size()); - o.set_size({}); + size_ = std::exchange(o.size_, dim<2>{}); coll_comm_ = std::exchange( o.coll_comm_, std::make_shared(o.get_communicator())); @@ -235,7 +228,7 @@ RowGatherer& RowGatherer::operator=( template RowGatherer::RowGatherer(const RowGatherer& o) - : EnableLinOp(o.get_executor()), + : EnablePolymorphicObject(o.get_executor()), DistributedBase(o.get_communicator()), send_idxs_(o.get_executor()) { diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index 97e8672fd54..4fc9b964e4b 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -51,9 +51,11 @@ namespace distributed { * @tparam LocalIndexType the index type for the stored indices */ template -class RowGatherer final : public EnableLinOp>, - public DistributedBase { - friend class EnablePolymorphicObject; +class RowGatherer final + : public EnablePolymorphicObject>, + public EnablePolymorphicAssignment>, + public DistributedBase { + friend class EnablePolymorphicObject; public: /** @@ -90,6 +92,11 @@ class RowGatherer final : public EnableLinOp>, mpi::request apply_async(ptr_param b, ptr_param x, array& workspace) const; + /** + * Returns the size of the row gatherer. + */ + dim<2> get_size() const; + /** * Get the used collective communicator. 
*/ @@ -154,12 +161,6 @@ class RowGatherer final : public EnableLinOp>, RowGatherer& operator=(RowGatherer&& o); -protected: - void apply_impl(const LinOp* b, LinOp* x) const override; - - void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, - LinOp* x) const override; - private: /** * @copydoc RowGatherer::create(std::shared_ptr>, */ RowGatherer(std::shared_ptr exec, mpi::communicator comm); + dim<2> size_; std::shared_ptr coll_comm_; array send_idxs_; mutable array send_workspace_; diff --git a/test/mpi/distributed/row_gatherer.cpp b/test/mpi/distributed/row_gatherer.cpp index e77d2bb0272..e3f02697bc2 100644 --- a/test/mpi/distributed/row_gatherer.cpp +++ b/test/mpi/distributed/row_gatherer.cpp @@ -205,34 +205,6 @@ TYPED_TEST(RowGatherer, CanApplyAsyncWithMultipleColumns) } -TYPED_TEST(RowGatherer, ThrowsOnApply) -{ - using RowGatherer = typename TestFixture::row_gatherer_type; - using Dense = gko::matrix::Dense; - using Vector = gko::experimental::distributed::Vector; - auto rg = RowGatherer::create(this->exec, this->comm); - auto b = Vector::create(this->exec, this->comm); - auto x = Dense::create(this->exec); - - ASSERT_THROW(rg->apply(b, x), gko::NotImplemented); -} - - -TYPED_TEST(RowGatherer, ThrowsOnAdvancedApply) -{ - using RowGatherer = typename TestFixture::row_gatherer_type; - using Dense = gko::matrix::Dense; - using Vector = gko::experimental::distributed::Vector; - auto rg = RowGatherer::create(this->exec, this->comm); - auto b = Vector::create(this->exec, this->comm); - auto x = Dense::create(this->exec); - auto alpha = Dense::create(this->exec, gko::dim<2>{1, 1}); - auto beta = Dense::create(this->exec, gko::dim<2>{1, 1}); - - ASSERT_THROW(rg->apply(alpha, b, beta, x), gko::NotImplemented); -} - - TYPED_TEST(RowGatherer, ThrowsOnNonMatchingExecutor) { if (this->mpi_exec == this->exec) { From 34e67ebe1aaa9d7c78d0a9929a3c1d9119bd5add Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Feb 2025 16:12:38 +0100 Subject: [PATCH 15/25] [dist-rg] add comment on output vector exec --- .../ginkgo/core/distributed/row_gatherer.hpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index 4fc9b964e4b..42b4c1e7b86 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -45,8 +45,10 @@ namespace distributed { * ``` * Using apply instead of apply_async will lead to a blocking communication. * - * @note The apply and apply_async function will *not* ensure that the in/output - * vectors use the same executor as the this object. + * @note The output vector for the apply_async functions *must* use an executor + * that is compatible with the MPI implementation. In particular, if the + * MPI implementation is not GPU aware, then the output vector *must* use + * a CPU executor. Otherwise, an exception will be thrown. * * @tparam LocalIndexType the index type for the stored indices */ @@ -64,8 +66,10 @@ class RowGatherer final * @warning Only one mpi::request can be active at any given time. This * function will throw if another request is already active. * - * @param b the input distributed::Vector - * @param x the output matrix::Dense with the rows gathered from b + * @param b the input distributed::Vector. + * @param x the output matrix::Dense with the rows gathered from b. Its + * executor has to be compatible with the MPI implementation, see + * the class documentation. 
* * @return a mpi::request for this task. The task is guaranteed to * be completed only after `.wait()` has been called on it. @@ -80,8 +84,10 @@ class RowGatherer final * waiting on each previous request will lead to incorrect * data transfers. * - * @param b the input distributed::Vector - * @param x the output matrix::Dense with the rows gathered from b + * @param b the input distributed::Vector. + * @param x the output matrix::Dense with the rows gathered from b. Its + * executor has to be compatible with the MPI implementation, see + * the class documentation. * @param workspace a workspace to store temporary data for the operation. * This might not be modified before the request is * waited on. From 4e5e1988ef9624961c4585dd09381ada35ef0915 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Feb 2025 17:06:07 +0100 Subject: [PATCH 16/25] [core] fix temporary clone of segmented array --- core/test/base/segmented_array.cpp | 49 +++++++++++++++++++- include/ginkgo/core/base/segmented_array.hpp | 15 +++--- 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/core/test/base/segmented_array.cpp b/core/test/base/segmented_array.cpp index 2741990036f..e01d21fe73c 100644 --- a/core/test/base/segmented_array.cpp +++ b/core/test/base/segmented_array.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -11,6 +11,19 @@ #include "core/test/utils.hpp" +class DummyExecutor : public gko::ReferenceExecutor { +public: + DummyExecutor() : ReferenceExecutor(std::make_shared()) + {} + +protected: + bool verify_memory_to(const ReferenceExecutor* other) const override + { + return false; + } +}; + + template gko::array get_flat_array(gko::segmented_array& arr) { @@ -143,6 +156,40 @@ TYPED_TEST(SegmentedArray, CanBeMoved) } +TYPED_TEST(SegmentedArray, CanCreateTemporaryClone) +{ + using value_type = typename TestFixture::value_type; + auto other_exec = std::make_shared(); + auto buffer = gko::array(this->exec, {1, 2, 2, 3, 3, 3}); + auto offsets = gko::array(this->exec, {0, 1, 3, 6}); + auto arr = + gko::segmented_array::create_from_offsets(buffer, offsets); + + auto copy = gko::make_temporary_clone(other_exec, &arr); + + ASSERT_EQ(copy->get_executor(), other_exec); + ASSERT_NE(copy->get_executor(), arr.get_executor()); + GKO_ASSERT_SEGMENTED_ARRAY_EQ(*copy, arr); +} + + +TYPED_TEST(SegmentedArray, TemporaryCloneIsNoopForSameExec) +{ + using value_type = typename TestFixture::value_type; + auto buffer = gko::array(this->exec, {1, 2, 2, 3, 3, 3}); + auto offsets = gko::array(this->exec, {0, 1, 3, 6}); + auto arr = + gko::segmented_array::create_from_offsets(buffer, offsets); + + auto copy = gko::make_temporary_clone(this->exec, &arr); + + ASSERT_EQ(copy->get_executor(), arr.get_executor()); + ASSERT_EQ(copy->get_flat_data(), arr.get_flat_data()); + ASSERT_EQ(copy->get_offsets().get_const_data(), + arr.get_offsets().get_const_data()); +} + + TYPED_TEST(SegmentedArray, ThrowsIfBufferSizeDoesntMatchSizes) { using value_type = typename TestFixture::value_type; diff --git a/include/ginkgo/core/base/segmented_array.hpp b/include/ginkgo/core/base/segmented_array.hpp index b34605cc902..b74ab82e596 100644 --- a/include/ginkgo/core/base/segmented_array.hpp +++ b/include/ginkgo/core/base/segmented_array.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: 
BSD-3-Clause @@ -152,12 +152,11 @@ struct temporary_clone_helper> { bool copy_data) { if (copy_data) { - return std::make_unique>( - make_array_view(exec, ptr->get_size(), ptr->get_flat_data()), - ptr->get_offsets()); + return std::make_unique>(std::move(exec), *ptr); } else { - return std::make_unique>(std::move(exec), - ptr->get_offsets()); + return std::unique_ptr>( + new segmented_array(segmented_array::create_from_offsets( + array(std::move(exec), ptr->get_offsets())))); } } }; @@ -168,9 +167,7 @@ struct temporary_clone_helper> { std::shared_ptr exec, const segmented_array* ptr, bool) { - return std::make_unique>( - make_array_view(exec, ptr->get_size(), ptr->get_const_flat_data()), - ptr->get_offsets()); + return std::make_unique>(std::move(exec), *ptr); } }; From 0b597d79a96c00959c2ab154a883bad56773c01c Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Thu, 20 Feb 2025 17:06:43 +0100 Subject: [PATCH 17/25] [dist-rg] use correct exec when constructing RG --- core/distributed/row_gatherer.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index c182e5cee6e..fe593ebbadb 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -149,12 +149,20 @@ RowGatherer::RowGatherer( auto comm = coll_comm_->get_base_communicator(); auto inverse_comm = coll_comm_->create_inverse(); + auto mpi_exec = + mpi::requires_host_buffer(exec, coll_comm_->get_base_communicator()) + ? exec->get_master() + : exec; + auto temp_remote_local_idxs = + make_temporary_clone(mpi_exec, &imap.get_remote_local_idxs()); + + send_idxs_.set_executor(mpi_exec); send_idxs_.resize_and_reset(coll_comm_->get_send_size()); inverse_comm - ->i_all_to_all_v(exec, - imap.get_remote_local_idxs().get_const_flat_data(), + ->i_all_to_all_v(exec, temp_remote_local_idxs->get_const_flat_data(), send_idxs_.get_data()) .wait(); + send_idxs_.set_executor(exec); } From 70980ac2b6f88782da719beb9beda57b305aa815 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 3 Mar 2025 12:25:54 +0100 Subject: [PATCH 18/25] [bench] update benchmark tests --- .../test/reference/distributed_solver.profile.stderr | 12 ++++++++++++ .../test/reference/spmv_distributed.profile.stderr | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index dca62ddff33..3535f43009d 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -100,6 +100,18 @@ DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: begin components::aos_to_soa diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr index 1931cd2030e..07c0bc74cc9 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stderr +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -116,6 +116,18 @@ DEBUG: begin components::convert_idxs_to_ptrs 
DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin copy +DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: begin copy() From 3581d6d81cfd7d54141cd348feb9ddf0863ac888 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 17 Mar 2025 15:57:31 +0100 Subject: [PATCH 19/25] [dist-rg] add missing half precision dispatch --- core/distributed/row_gatherer.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index fe593ebbadb..48b4150eeea 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -64,7 +64,11 @@ mpi::request RowGatherer::apply_async( "memory."); // dispatch global vector - run, std::complex>( + run, +#endif + double, float, std::complex, std::complex>( make_temporary_clone(exec, b).get(), [&](const auto* b_global) { using ValueType = typename std::decay_t::value_type; From 358db33e835441617028b3208728f3dc89f3e68e Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Mar 2025 13:18:22 +0100 Subject: [PATCH 20/25] [dist] use intermediate coll-comm creator function --- .../distributed_solver.profile.stderr | 14 --------- .../reference/spmv_distributed.profile.stderr | 14 --------- core/distributed/dense_communicator.cpp | 17 ++++++----- core/distributed/matrix.cpp | 10 +++---- .../distributed/neighborhood_communicator.cpp | 19 ++++++------ .../distributed/collective_communicator.hpp | 30 ++++++++++++------- .../core/distributed/dense_communicator.hpp | 4 +-- .../ginkgo/core/distributed/index_map_fwd.hpp | 30 ------------------- .../distributed/neighborhood_communicator.hpp | 4 +-- .../ginkgo/core/distributed/row_gatherer.hpp | 12 ++++---- 10 files changed, 50 insertions(+), 104 deletions(-) delete mode 100644 include/ginkgo/core/distributed/index_map_fwd.hpp diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index 3535f43009d..64782f7fd05 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -100,20 +100,6 @@ DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa DEBUG: begin dense::fill diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr index 07c0bc74cc9..09492d6d9c5 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stderr +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -116,20 +116,6 @@ DEBUG: begin components::convert_idxs_to_ptrs DEBUG: end components::convert_idxs_to_ptrs DEBUG: begin components::convert_idxs_to_ptrs DEBUG: 
end components::convert_idxs_to_ptrs -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin components::fill_array -DEBUG: end components::fill_array -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy -DEBUG: begin copy -DEBUG: end copy DEBUG: begin copy() DEBUG: begin dense::copy DEBUG: end dense::copy diff --git a/core/distributed/dense_communicator.cpp b/core/distributed/dense_communicator.cpp index 2957bed0d46..50c8e8da75c 100644 --- a/core/distributed/dense_communicator.cpp +++ b/core/distributed/dense_communicator.cpp @@ -110,15 +110,16 @@ request DenseCommunicator::i_all_to_all_v_impl( } -std::unique_ptr -DenseCommunicator::create_with_same_type( - communicator base, const distributed::index_map_variant& imap) const +CollectiveCommunicator::creator_fn DenseCommunicator::creator_with_same_type() + const { - return std::visit( - [base](const auto& imap) { - return std::make_unique(base, imap); - }, - imap); + return [](communicator base, index_map_ptr imap) { + return std::visit( + [base](auto imap_) { + return std::make_unique(base, *imap_); + }, + imap); + }; } diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index d71128f7a6d..7f0e0a58901 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -101,9 +101,8 @@ Matrix::Matrix( row_gatherer_ = RowGatherer::create( row_gatherer_->get_executor(), - row_gatherer_->get_collective_communicator()->create_with_same_type( - row_gatherer_->get_communicator(), imap_), - imap_); + row_gatherer_->get_collective_communicator()->creator_with_same_type(), + comm, imap_); } @@ -383,9 +382,8 @@ void Matrix::read_distributed( row_gatherer_ = RowGatherer::create( row_gatherer_->get_executor(), - row_gatherer_->get_collective_communicator()->create_with_same_type( - row_gatherer_->get_communicator(), imap_), - imap_); + row_gatherer_->get_collective_communicator()->creator_with_same_type(), + comm, imap_); } diff --git a/core/distributed/neighborhood_communicator.cpp b/core/distributed/neighborhood_communicator.cpp index c3ba9155e8b..3e3f820e052 100644 --- a/core/distributed/neighborhood_communicator.cpp +++ b/core/distributed/neighborhood_communicator.cpp @@ -156,16 +156,17 @@ request NeighborhoodCommunicator::i_all_to_all_v_impl( } -std::unique_ptr -NeighborhoodCommunicator::create_with_same_type( - communicator base, const distributed::index_map_variant& imap) const +CollectiveCommunicator::creator_fn +NeighborhoodCommunicator::creator_with_same_type() const { - return std::visit( - [base](const auto& imap) { - return std::unique_ptr( - new NeighborhoodCommunicator(base, imap)); - }, - imap); + return [](communicator base, index_map_ptr imap) { + return std::visit( + [base](auto imap_) { + return std::unique_ptr( + std::make_unique(base, *imap_)); + }, + imap); + }; } diff --git a/include/ginkgo/core/distributed/collective_communicator.hpp b/include/ginkgo/core/distributed/collective_communicator.hpp index 2dfcb893e6f..81d439c191b 100644 --- a/include/ginkgo/core/distributed/collective_communicator.hpp +++ b/include/ginkgo/core/distributed/collective_communicator.hpp @@ -11,9 +11,10 @@ #if GINKGO_BUILD_MPI +#include #include -#include +#include namespace gko { @@ -29,6 +30,21 @@ namespace mpi { */ class CollectiveCommunicator { public: + /** + * All allowed index_map types (as const *) + */ + using index_map_ptr = + std::variant*, + const distributed::index_map*, + const 
distributed::index_map*>; + + /** + * Creator function to create a new CollectiveCommunicator from a + * communicator and a pointer to an index_map. + */ + using creator_fn = std::function( + communicator, index_map_ptr)>; + virtual ~CollectiveCommunicator() = default; explicit CollectiveCommunicator(communicator base = MPI_COMM_NULL); @@ -65,16 +81,10 @@ class CollectiveCommunicator { void* recv_buffer, MPI_Datatype recv_type) const; /** - * Creates a new CollectiveCommunicator with the same dynamic type. - * - * @param base The base communicator - * @param imap The index_map that defines the communication pattern - * - * @return a CollectiveCommunicator with the same dynamic type + * Returns a CollectiveCommunicator::creator_fn which will create a new + * CollectiveCommunicator with the same dynamic type. */ - [[nodiscard]] virtual std::unique_ptr - create_with_same_type(communicator base, - const distributed::index_map_variant& imap) const = 0; + [[nodiscard]] virtual creator_fn creator_with_same_type() const = 0; /** * Creates a CollectiveCommunicator with the inverse communication pattern diff --git a/include/ginkgo/core/distributed/dense_communicator.hpp b/include/ginkgo/core/distributed/dense_communicator.hpp index 1f600a93f18..de595df77d0 100644 --- a/include/ginkgo/core/distributed/dense_communicator.hpp +++ b/include/ginkgo/core/distributed/dense_communicator.hpp @@ -64,9 +64,7 @@ class DenseCommunicator final : public CollectiveCommunicator { communicator base, const distributed::index_map& imap); - [[nodiscard]] std::unique_ptr create_with_same_type( - communicator base, - const distributed::index_map_variant& imap) const override; + [[nodiscard]] creator_fn creator_with_same_type() const override; /** * Creates the inverse DenseCommunicator by switching sources diff --git a/include/ginkgo/core/distributed/index_map_fwd.hpp b/include/ginkgo/core/distributed/index_map_fwd.hpp deleted file mode 100644 index 8781fbfffd6..00000000000 --- a/include/ginkgo/core/distributed/index_map_fwd.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_PUBLIC_CORE_INDEX_MAP_FWD_HPP -#define GKO_PUBLIC_CORE_INDEX_MAP_FWD_HPP - -#include - -#include - - -namespace gko { -namespace experimental { -namespace distributed { - - -template -class index_map; - -using index_map_variant = - std::variant, index_map, - index_map>; - - -} // namespace distributed -} // namespace experimental -} // namespace gko - -#endif // GKO_PUBLIC_CORE_INDEX_MAP_FWD_HPP diff --git a/include/ginkgo/core/distributed/neighborhood_communicator.hpp b/include/ginkgo/core/distributed/neighborhood_communicator.hpp index ee8d937eba5..7274f55d11a 100644 --- a/include/ginkgo/core/distributed/neighborhood_communicator.hpp +++ b/include/ginkgo/core/distributed/neighborhood_communicator.hpp @@ -68,9 +68,7 @@ class NeighborhoodCommunicator final : public CollectiveCommunicator { communicator base, const distributed::index_map& imap); - std::unique_ptr create_with_same_type( - communicator base, - const distributed::index_map_variant& imap) const override; + creator_fn creator_with_same_type() const override; /** * Creates the inverse NeighborhoodCommunicator by switching sources diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index 42b4c1e7b86..6b0ea81d43a 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ 
-129,12 +129,9 @@ class RowGatherer final * @tparam GlobalIndexType the global index type of the index map * * @param exec the executor - * @param coll_comm the collective communicator + * @param coll_comm_creator the collective communicator creator_fn * @param imap the index map defining which rows to gather * - * @note The coll_comm and imap have to be compatible. The coll_comm must - * send and recv exactly as many rows as the imap defines. - * * @return a shared_ptr to the created distributed::RowGatherer */ template > static std::unique_ptr create( std::shared_ptr exec, - std::shared_ptr coll_comm, + mpi::CollectiveCommunicator::creator_fn coll_comm_creator, + mpi::communicator base_comm, const index_map& imap) { - return std::unique_ptr( - new RowGatherer(std::move(exec), std::move(coll_comm), imap)); + return std::unique_ptr(new RowGatherer( + std::move(exec), coll_comm_creator(base_comm, &imap), imap)); } /* From 2ef6e24b28f725b1960f7374905f0c84b1e8a992 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Mar 2025 13:33:45 +0100 Subject: [PATCH 21/25] review updates: - documentation - format - unused code Co-authored-by: Yu-Hsiang M. Tsai --- core/distributed/row_gatherer.cpp | 2 ++ include/ginkgo/core/distributed/row_gatherer.hpp | 9 +++++---- test/mpi/distributed/matrix.cpp | 2 -- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index 48b4150eeea..468528e8436 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -15,6 +15,8 @@ namespace gko { namespace experimental { namespace distributed { + + #if GINKGO_HAVE_OPENMPI_PRE_4_1_X using DefaultCollComm = mpi::DenseCommunicator; #else diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index 6b0ea81d43a..3a7ca89e5f0 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -39,11 +39,10 @@ namespace distributed { * auto x = matrix::Dense::create(...); * * auto req = rg->apply_async(b, x); - * // do some computation that doesn't modify b, or access x + * // users can do some computation that doesn't modify b, or access x * req.wait(); * // x now contains the gathered rows of b * ``` - * Using apply instead of apply_async will lead to a blocking communication. * * @note The output vector for the apply_async functions *must* use an executor * that is compatible with the MPI implementation. In particular, if the @@ -168,7 +167,7 @@ class RowGatherer final private: /** * @copydoc RowGatherer::create(std::shared_ptr, std::shared_ptr, + * Executor>, std::shared_ptr, * const index_map&) */ template @@ -187,10 +186,12 @@ class RowGatherer final array send_idxs_; mutable array send_workspace_; // This object might not hold an actual MPI request, so we can't use the - // always owning mpi::request. It's destructor would otherwise make the + // always owning mpi::request. Its destructor would otherwise make the // program crash. 
mutable MPI_Request req_listener_{MPI_REQUEST_NULL}; }; + + } // namespace distributed } // namespace experimental } // namespace gko diff --git a/test/mpi/distributed/matrix.cpp b/test/mpi/distributed/matrix.cpp index 0285ebb6680..de55ef31405 100644 --- a/test/mpi/distributed/matrix.cpp +++ b/test/mpi/distributed/matrix.cpp @@ -878,7 +878,6 @@ class HostToDeviceLogger : public gko::log::Logger { class AllocationLogger : public gko::log::Logger { public: mutable int count = 0; - mutable std::set execs; protected: void on_allocation_completed(const gko::Executor* exec, @@ -886,7 +885,6 @@ class AllocationLogger : public gko::log::Logger { const gko::uintptr& location) const override { ++count; - execs.insert(exec); } }; From 8e43f599e4ffbe43ba4c33e229b5ca756d0c8e0f Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 25 Apr 2025 09:57:56 +0200 Subject: [PATCH 22/25] [dist] revert coll-comm creator function --- core/distributed/dense_communicator.cpp | 17 ++++++------ core/distributed/matrix.cpp | 10 ++++--- .../distributed/neighborhood_communicator.cpp | 19 +++++++------- core/distributed/row_gatherer.cpp | 9 +++++++ .../distributed/collective_communicator.hpp | 26 +++++++++---------- .../core/distributed/dense_communicator.hpp | 3 ++- .../distributed/neighborhood_communicator.hpp | 3 ++- .../ginkgo/core/distributed/row_gatherer.hpp | 18 ++++++------- 8 files changed, 56 insertions(+), 49 deletions(-) diff --git a/core/distributed/dense_communicator.cpp b/core/distributed/dense_communicator.cpp index 50c8e8da75c..5424a1ac9c7 100644 --- a/core/distributed/dense_communicator.cpp +++ b/core/distributed/dense_communicator.cpp @@ -110,16 +110,15 @@ request DenseCommunicator::i_all_to_all_v_impl( } -CollectiveCommunicator::creator_fn DenseCommunicator::creator_with_same_type() - const +std::unique_ptr +DenseCommunicator::create_with_same_type(communicator base, + index_map_ptr imap) const { - return [](communicator base, index_map_ptr imap) { - return std::visit( - [base](auto imap_) { - return std::make_unique(base, *imap_); - }, - imap); - }; + return std::visit( + [base](const auto* imap) { + return std::make_unique(base, *imap); + }, + imap); } diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 7f0e0a58901..febc556db6a 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -101,8 +101,9 @@ Matrix::Matrix( row_gatherer_ = RowGatherer::create( row_gatherer_->get_executor(), - row_gatherer_->get_collective_communicator()->creator_with_same_type(), - comm, imap_); + row_gatherer_->get_collective_communicator()->create_with_same_type( + comm, &imap_), + imap_); } @@ -382,8 +383,9 @@ void Matrix::read_distributed( row_gatherer_ = RowGatherer::create( row_gatherer_->get_executor(), - row_gatherer_->get_collective_communicator()->creator_with_same_type(), - comm, imap_); + row_gatherer_->get_collective_communicator()->create_with_same_type( + comm, &imap_), + imap_); } diff --git a/core/distributed/neighborhood_communicator.cpp b/core/distributed/neighborhood_communicator.cpp index 3e3f820e052..d5afb7217e5 100644 --- a/core/distributed/neighborhood_communicator.cpp +++ b/core/distributed/neighborhood_communicator.cpp @@ -156,17 +156,16 @@ request NeighborhoodCommunicator::i_all_to_all_v_impl( } -CollectiveCommunicator::creator_fn -NeighborhoodCommunicator::creator_with_same_type() const +std::unique_ptr +NeighborhoodCommunicator::create_with_same_type(communicator base, + index_map_ptr imap) const { - return [](communicator base, index_map_ptr imap) { 
- return std::visit( - [base](auto imap_) { - return std::unique_ptr( - std::make_unique(base, *imap_)); - }, - imap); - }; + return std::visit( + [base](const auto* imap) { + return std::unique_ptr( + new NeighborhoodCommunicator(base, *imap)); + }, + imap); } diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index 468528e8436..386f61dde94 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -186,6 +186,15 @@ size_type RowGatherer::get_num_send_idxs() const } +template +std::unique_ptr> +RowGatherer::create(std::shared_ptr exec, + mpi::communicator comm) +{ + return std::unique_ptr(new RowGatherer(std::move(exec), comm)); +} + + template RowGatherer::RowGatherer(std::shared_ptr exec, mpi::communicator comm) diff --git a/include/ginkgo/core/distributed/collective_communicator.hpp b/include/ginkgo/core/distributed/collective_communicator.hpp index 81d439c191b..afddc96212f 100644 --- a/include/ginkgo/core/distributed/collective_communicator.hpp +++ b/include/ginkgo/core/distributed/collective_communicator.hpp @@ -38,13 +38,6 @@ class CollectiveCommunicator { const distributed::index_map*, const distributed::index_map*>; - /** - * Creator function to create a new CollectiveCommunicator from a - * communicator and a pointer to an index_map. - */ - using creator_fn = std::function( - communicator, index_map_ptr)>; - virtual ~CollectiveCommunicator() = default; explicit CollectiveCommunicator(communicator base = MPI_COMM_NULL); @@ -81,10 +74,15 @@ class CollectiveCommunicator { void* recv_buffer, MPI_Datatype recv_type) const; /** - * Returns a CollectiveCommunicator::creator_fn which will create a new - * CollectiveCommunicator with the same dynamic type. + * Creates a new CollectiveCommunicator with the same dynamic type. + * + * @param base The base communicator + * @param imap The index_map that defines the communication pattern + * + * @return a CollectiveCommunicator with the same dynamic type */ - [[nodiscard]] virtual creator_fn creator_with_same_type() const = 0; + [[nodiscard]] virtual std::unique_ptr + create_with_same_type(communicator base, index_map_ptr imap) const = 0; /** * Creates a CollectiveCommunicator with the inverse communication pattern @@ -97,16 +95,16 @@ class CollectiveCommunicator { create_inverse() const = 0; /** - * Get the total number of received elements this communication patterns - * expects. + * Get the number of elements received by this process within this + * communication pattern. * * @return number of received elements. */ [[nodiscard]] virtual comm_index_type get_recv_size() const = 0; /** - * Get the total number of sent elements this communication patterns - * expects. + * Get the number of elements sent by this process within this communication + * pattern. * * @return number of sent elements. 
*/ diff --git a/include/ginkgo/core/distributed/dense_communicator.hpp b/include/ginkgo/core/distributed/dense_communicator.hpp index de595df77d0..221de351752 100644 --- a/include/ginkgo/core/distributed/dense_communicator.hpp +++ b/include/ginkgo/core/distributed/dense_communicator.hpp @@ -64,7 +64,8 @@ class DenseCommunicator final : public CollectiveCommunicator { communicator base, const distributed::index_map& imap); - [[nodiscard]] creator_fn creator_with_same_type() const override; + [[nodiscard]] std::unique_ptr create_with_same_type( + communicator base, index_map_ptr imap) const override; /** * Creates the inverse DenseCommunicator by switching sources diff --git a/include/ginkgo/core/distributed/neighborhood_communicator.hpp b/include/ginkgo/core/distributed/neighborhood_communicator.hpp index 7274f55d11a..0e69a97383a 100644 --- a/include/ginkgo/core/distributed/neighborhood_communicator.hpp +++ b/include/ginkgo/core/distributed/neighborhood_communicator.hpp @@ -68,7 +68,8 @@ class NeighborhoodCommunicator final : public CollectiveCommunicator { communicator base, const distributed::index_map& imap); - creator_fn creator_with_same_type() const override; + std::unique_ptr create_with_same_type( + communicator base, index_map_ptr imap) const override; /** * Creates the inverse NeighborhoodCommunicator by switching sources diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index 3a7ca89e5f0..273220ef342 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -128,9 +128,12 @@ class RowGatherer final * @tparam GlobalIndexType the global index type of the index map * * @param exec the executor - * @param coll_comm_creator the collective communicator creator_fn + * @param coll_comm the collective communicator * @param imap the index map defining which rows to gather * + * @note The coll_comm and imap have to be compatible. The coll_comm must + * send and recv exactly as many rows as the imap defines. + * * @return a shared_ptr to the created distributed::RowGatherer */ template > static std::unique_ptr create( std::shared_ptr exec, - mpi::CollectiveCommunicator::creator_fn coll_comm_creator, - mpi::communicator base_comm, + std::shared_ptr coll_comm, const index_map& imap) { - return std::unique_ptr(new RowGatherer( - std::move(exec), coll_comm_creator(base_comm, &imap), imap)); + return std::unique_ptr( + new RowGatherer(std::move(exec), std::move(coll_comm), imap)); } /* * Create method for an empty RowGatherer. */ static std::unique_ptr create( - std::shared_ptr exec, mpi::communicator comm) - { - return std::unique_ptr( - new RowGatherer(std::move(exec), std::move(comm))); - } + std::shared_ptr exec, mpi::communicator comm); RowGatherer(const RowGatherer& o); From 011dfbac5be98b9449829a7c6241f8c58c3a70da Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 5 May 2025 15:46:16 +0200 Subject: [PATCH 23/25] [test] use neighborhood comm only if available and reduce total number of cores required. 
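The fixture now selects the collective communicator type at compile time and only falls back to DenseCommunicator for older OpenMPI versions (pre-4.1.x). A minimal sketch of that selection and of building a row gatherer from it, assuming the GINKGO_HAVE_OPENMPI_PRE_4_1_X macro from ginkgo/config.hpp and the fixture's existing ref, comm and imap members (one index-type instantiation shown):

    using CollCommType =
    #if GINKGO_HAVE_OPENMPI_PRE_4_1_X
        gko::experimental::mpi::DenseCommunicator;
    #else
        gko::experimental::mpi::NeighborhoodCommunicator;
    #endif

    // build the collective communicator from the index map and use it to
    // create the row gatherer
    auto coll_comm = std::make_shared<CollCommType>(comm, imap);
    auto rg = gko::experimental::distributed::RowGatherer<gko::int32>::create(
        ref, coll_comm, imap);
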
--- core/test/mpi/distributed/row_gatherer.cpp | 22 +++++++++++++--------- test/mpi/distributed/CMakeLists.txt | 3 ++- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/core/test/mpi/distributed/row_gatherer.cpp b/core/test/mpi/distributed/row_gatherer.cpp index 8799b4939ce..fc2f528643c 100644 --- a/core/test/mpi/distributed/row_gatherer.cpp +++ b/core/test/mpi/distributed/row_gatherer.cpp @@ -4,6 +4,7 @@ #include +#include #include #include #include @@ -12,6 +13,14 @@ #include "core/test/utils/assertions.hpp" +using CollCommType = +#if GINKGO_HAVE_OPENMPI_PRE_4_1_X + gko::experimental::mpi::DenseCommunicator; +#else + gko::experimental::mpi::NeighborhoodCommunicator; +#endif + + template class RowGatherer : public ::testing::Test { protected: @@ -41,10 +50,8 @@ class RowGatherer : public ::testing::Test { this->ref, this->comm.size(), this->comm.size() * 3); map_type imap = map_type{ref, part, comm.rank(), create_recv_connections()[comm.rank()]}; - std::shared_ptr - coll_comm = - std::make_shared( - this->comm, imap); + std::shared_ptr coll_comm = + std::make_shared(this->comm, imap); }; TYPED_TEST_SUITE(RowGatherer, gko::test::IndexTypes, TypenameNameGenerator); @@ -64,9 +71,7 @@ TYPED_TEST(RowGatherer, CanConstructWithEmptyCollectiveCommAndIndexMap) { using RowGatherer = typename TestFixture::row_gatherer_type; using IndexMap = typename TestFixture::map_type; - auto coll_comm = - std::make_shared( - this->comm); + auto coll_comm = std::make_shared(this->comm); auto map = IndexMap{this->ref}; auto rg = RowGatherer::create(this->ref, coll_comm, map); @@ -96,8 +101,7 @@ TYPED_TEST(RowGatherer, CanCopy) auto copy = gko::clone(rg); GKO_ASSERT_EQUAL_DIMENSIONS(rg, copy); - auto copy_coll_comm = std::dynamic_pointer_cast< - const gko::experimental::mpi::NeighborhoodCommunicator>( + auto copy_coll_comm = std::dynamic_pointer_cast( copy->get_collective_communicator()); ASSERT_EQ(*this->coll_comm, *copy_coll_comm); auto send_idxs = gko::make_const_array_view( diff --git a/test/mpi/distributed/CMakeLists.txt b/test/mpi/distributed/CMakeLists.txt index 6010b9a7560..73424c9757f 100644 --- a/test/mpi/distributed/CMakeLists.txt +++ b/test/mpi/distributed/CMakeLists.txt @@ -2,4 +2,5 @@ ginkgo_create_common_and_reference_test(assembly MPI_SIZE 3 LABELS distributed) ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3 LABELS distributed) ginkgo_create_common_and_reference_test(partition_helpers MPI_SIZE 3 LABELS distributed) ginkgo_create_common_and_reference_test(vector MPI_SIZE 3 LABELS distributed) -ginkgo_create_common_and_reference_test(row_gatherer MPI_SIZE 6 LABELS distributed) +# reduce the number of OpenMP threads per MPI rank to 2, so that in total 12 cores are used +ginkgo_create_common_and_reference_test(row_gatherer MPI_SIZE 6 LABELS distributed RESOURCE_LOCAL_CORES 2) From 0f674843c11186ac01495e868ad0d02e6eb893d8 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 12 May 2025 12:35:08 +0200 Subject: [PATCH 24/25] fixup! 
[dist-rg] add missing half precision dispatch --- core/distributed/matrix.cpp | 12 ++---------- core/distributed/row_gatherer.cpp | 3 +++ test/mpi/distributed/matrix.cpp | 6 +++++- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index febc556db6a..9752a825b69 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -274,11 +274,7 @@ void Matrix::convert_to( result->get_communicator().size()); result->local_mtx_->copy_from(this->local_mtx_.get()); result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); - result->gather_idxs_ = this->gather_idxs_; - result->send_offsets_ = this->send_offsets_; - result->recv_offsets_ = this->recv_offsets_; - result->recv_sizes_ = this->recv_sizes_; - result->send_sizes_ = this->send_sizes_; + result->row_gatherer_->copy_from(this->row_gatherer_); result->imap_ = this->imap_; result->set_size(this->get_size()); } @@ -293,11 +289,7 @@ void Matrix::move_to( result->get_communicator().size()); result->local_mtx_->move_from(this->local_mtx_.get()); result->non_local_mtx_->move_from(this->non_local_mtx_.get()); - result->gather_idxs_ = std::move(this->gather_idxs_); - result->send_offsets_ = std::move(this->send_offsets_); - result->recv_offsets_ = std::move(this->recv_offsets_); - result->recv_sizes_ = std::move(this->recv_sizes_); - result->send_sizes_ = std::move(this->send_sizes_); + result->row_gatherer_->move_from(this->row_gatherer_); result->imap_ = std::move(this->imap_); result->set_size(this->get_size()); this->set_size({}); diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index 386f61dde94..38697508bd8 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -69,6 +69,9 @@ mpi::request RowGatherer::apply_async( run, +#endif +#if GINKGO_ENABLE_BFLOAT16 + bfloat16, std::complex, #endif double, float, std::complex, std::complex>( make_temporary_clone(exec, b).get(), [&](const auto* b_global) { diff --git a/test/mpi/distributed/matrix.cpp b/test/mpi/distributed/matrix.cpp index de55ef31405..0c7457ff184 100644 --- a/test/mpi/distributed/matrix.cpp +++ b/test/mpi/distributed/matrix.cpp @@ -572,7 +572,11 @@ TYPED_TEST(Matrix, CanApplyToMultipleVectors) this->dist_mat->apply(this->x, this->y); - GKO_ASSERT_MTX_NEAR(this->y->get_local_vector(), result[rank], 0); + auto eps = std::is_same_v || + std::is_same_v> + ? 
r::value + : gko::remove_complex{0.0}; + GKO_ASSERT_MTX_NEAR(this->y->get_local_vector(), result[rank], eps); } From e7178d5ff576c195b619369c5ea0748208a2b224 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 12 May 2025 12:36:10 +0200 Subject: [PATCH 25/25] [dist] accept only dist-vectors for row-gather --- core/distributed/matrix.cpp | 53 ++++++----- core/distributed/row_gatherer.cpp | 20 +++- core/test/mpi/distributed/row_gatherer.cpp | 13 ++- include/ginkgo/core/distributed/index_map.hpp | 4 +- include/ginkgo/core/distributed/matrix.hpp | 5 +- .../ginkgo/core/distributed/row_gatherer.hpp | 2 + test/mpi/distributed/row_gatherer.cpp | 92 +++++++++++-------- 7 files changed, 119 insertions(+), 70 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 9752a825b69..84079ec6c2e 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -422,6 +422,26 @@ void Matrix::read_distributed( } +template +void init_recv_buffers(std::shared_ptr exec, + const RowGatherer* row_gatherer, + size_type num_cols, + const detail::VectorCache& buffer, + const detail::VectorCache& host_buffer) +{ + auto comm = + row_gatherer->get_collective_communicator()->get_base_communicator(); + auto global_recv_dim = + dim<2>{static_cast(row_gatherer->get_size()[0]), num_cols}; + auto local_recv_dim = dim<2>{ + static_cast( + row_gatherer->get_collective_communicator()->get_recv_size()), + num_cols}; + buffer.init(exec, comm, global_recv_dim, local_recv_dim); + host_buffer.init(exec->get_master(), comm, global_recv_dim, local_recv_dim); +} + + template void Matrix::apply_impl( const LinOp* b, LinOp* x) const @@ -439,13 +459,8 @@ void Matrix::apply_impl( auto exec = this->get_executor(); auto comm = this->get_communicator(); - auto recv_dim = - dim<2>{static_cast( - row_gatherer_->get_collective_communicator() - ->get_recv_size()), - dense_b->get_size()[1]}; - recv_buffer_.init(exec, recv_dim); - host_recv_buffer_.init(exec->get_master(), recv_dim); + init_recv_buffers(exec, row_gatherer_.get(), dense_b->get_size()[1], + recv_buffer_, host_recv_buffer_); auto recv_ptr = mpi::requires_host_buffer(exec, comm) ? host_recv_buffer_.get() : recv_buffer_.get(); @@ -456,7 +471,8 @@ void Matrix::apply_impl( if (recv_ptr != recv_buffer_.get()) { recv_buffer_->copy_from(host_recv_buffer_.get()); } - non_local_mtx_->apply(one_scalar_.get(), recv_buffer_.get(), + non_local_mtx_->apply(one_scalar_.get(), + recv_buffer_->get_local_vector(), one_scalar_.get(), local_x); }, b, x); @@ -481,13 +497,8 @@ void Matrix::apply_impl( auto exec = this->get_executor(); auto comm = this->get_communicator(); - auto recv_dim = - dim<2>{static_cast( - row_gatherer_->get_collective_communicator() - ->get_recv_size()), - dense_b->get_size()[1]}; - recv_buffer_.init(exec, recv_dim); - host_recv_buffer_.init(exec->get_master(), recv_dim); + init_recv_buffers(exec, row_gatherer_.get(), dense_b->get_size()[1], + recv_buffer_, host_recv_buffer_); auto recv_ptr = mpi::requires_host_buffer(exec, comm) ? 
host_recv_buffer_.get() : recv_buffer_.get(); @@ -499,7 +510,7 @@ void Matrix::apply_impl( if (recv_ptr != recv_buffer_.get()) { recv_buffer_->copy_from(host_recv_buffer_.get()); } - non_local_mtx_->apply(local_alpha, recv_buffer_.get(), + non_local_mtx_->apply(local_alpha, recv_buffer_->get_local_vector(), one_scalar_.get(), local_x); }, alpha, b, beta, x); @@ -531,12 +542,8 @@ void Matrix::col_scale( make_const_array_view(exec, n_local_cols, scaling_factors_ptr->get_const_local_values())); - auto recv_dim = dim<2>{ - static_cast( - row_gatherer_->get_collective_communicator()->get_recv_size()), - scaling_factors->get_size()[1]}; - recv_buffer_.init(exec, recv_dim); - host_recv_buffer_.init(exec->get_master(), recv_dim); + init_recv_buffers(exec, row_gatherer_.get(), scaling_factors->get_size()[1], + recv_buffer_, host_recv_buffer_); auto recv_ptr = mpi::requires_host_buffer(exec, comm) ? host_recv_buffer_.get() : recv_buffer_.get(); @@ -552,7 +559,7 @@ void Matrix::col_scale( gko::matrix::Diagonal::create_const( exec, n_non_local_cols, make_const_array_view(exec, n_non_local_cols, - recv_buffer_->get_const_values())); + recv_buffer_->get_const_local_values())); non_local_scale_diag->rapply(non_local_mtx_, non_local_mtx_); } } diff --git a/core/distributed/row_gatherer.cpp b/core/distributed/row_gatherer.cpp index 38697508bd8..6a3f873e2c1 100644 --- a/core/distributed/row_gatherer.cpp +++ b/core/distributed/row_gatherer.cpp @@ -79,8 +79,8 @@ mpi::request RowGatherer::apply_async( typename std::decay_t::value_type; // dispatch local vector with the same precision as the global // vector - ::gko::precision_dispatch( - [&](auto* x_local) { + distributed::precision_dispatch( + [&](auto* x_global) { auto b_local = b_global->get_local_vector(); dim<2> send_size(coll_comm_->get_send_size(), @@ -103,7 +103,7 @@ mpi::request RowGatherer::apply_async( send_size[1]); b_local->row_gather(&send_idxs_, send_buffer); - auto recv_ptr = x_local->get_values(); + auto recv_ptr = x_global->get_local_values(); auto send_ptr = send_buffer->get_values(); b_local->get_executor()->synchronize(); @@ -134,6 +134,16 @@ RowGatherer::get_collective_communicator() const } +template +T global_add(std::shared_ptr exec, + const mpi::communicator& comm, const T& value) +{ + T result; + comm.all_reduce(std::move(exec), &value, &result, 1, MPI_SUM); + return result; +} + + template template RowGatherer::RowGatherer( @@ -142,7 +152,9 @@ RowGatherer::RowGatherer( const index_map& imap) : EnablePolymorphicObject(exec), DistributedBase(coll_comm->get_base_communicator()), - size_(dim<2>{imap.get_non_local_size(), imap.get_global_size()}), + size_(dim<2>{global_add(exec, coll_comm->get_base_communicator(), + imap.get_non_local_size()), + imap.get_global_size()}), coll_comm_(std::move(coll_comm)), send_idxs_(exec), send_workspace_(exec), diff --git a/core/test/mpi/distributed/row_gatherer.cpp b/core/test/mpi/distributed/row_gatherer.cpp index fc2f528643c..b0908006903 100644 --- a/core/test/mpi/distributed/row_gatherer.cpp +++ b/core/test/mpi/distributed/row_gatherer.cpp @@ -44,6 +44,15 @@ class RowGatherer : public ::testing::Test { gko::array{ref, {8, 12, 13, 14}}}; } + gko::size_type recv_connections_size() + { + gko::size_type size = 0; + for (auto& recv_connections : create_recv_connections()) { + size += recv_connections.get_size(); + } + return size; + } + std::shared_ptr ref = gko::ReferenceExecutor::create(); gko::experimental::mpi::communicator comm = MPI_COMM_WORLD; std::shared_ptr part = 
part_type::build_from_global_size_uniform( @@ -86,9 +95,7 @@ TYPED_TEST(RowGatherer, CanConstructFromCollectiveCommAndIndexMap) auto rg = RowGatherer::create(this->ref, this->coll_comm, this->imap); - int rank = this->comm.rank(); - auto recv_connections = this->create_recv_connections()[rank]; - gko::dim<2> size{recv_connections.get_size(), 18}; + gko::dim<2> size{this->recv_connections_size(), 18}; GKO_ASSERT_EQUAL_DIMENSIONS(rg, size); } diff --git a/include/ginkgo/core/distributed/index_map.hpp b/include/ginkgo/core/distributed/index_map.hpp index 09037d303a3..093f907e494 100644 --- a/include/ginkgo/core/distributed/index_map.hpp +++ b/include/ginkgo/core/distributed/index_map.hpp @@ -90,7 +90,7 @@ class index_map { * @param index_space_v the index space in which the passed-in local * indices are defined * - * @return the mapped global indices. Any local index, that is not in the + * @return the mapped global indices. Any local index that is not in the * specified index space is mapped to invalid_index */ array map_to_global( @@ -98,7 +98,7 @@ class index_map { index_space index_space_v) const; /** - * \brief get size of index_space::local + * \brief get size of the global index space */ size_type get_global_size() const; diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 577d73c7463..68f149939ec 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace gko { @@ -699,8 +700,8 @@ class Matrix std::shared_ptr> row_gatherer_; index_map imap_; gko::detail::DenseCache one_scalar_; - gko::detail::DenseCache recv_buffer_; - gko::detail::DenseCache host_recv_buffer_; + detail::VectorCache recv_buffer_; + detail::VectorCache host_recv_buffer_; std::shared_ptr local_mtx_; std::shared_ptr non_local_mtx_; }; diff --git a/include/ginkgo/core/distributed/row_gatherer.hpp b/include/ginkgo/core/distributed/row_gatherer.hpp index 273220ef342..7623875e69f 100644 --- a/include/ginkgo/core/distributed/row_gatherer.hpp +++ b/include/ginkgo/core/distributed/row_gatherer.hpp @@ -133,6 +133,8 @@ class RowGatherer final * * @note The coll_comm and imap have to be compatible. The coll_comm must * send and recv exactly as many rows as the imap defines. + * @note This is a collective operation, all participating processes have + * to execute this operation. 
* * @return a shared_ptr to the created distributed::RowGatherer */ diff --git a/test/mpi/distributed/row_gatherer.cpp b/test/mpi/distributed/row_gatherer.cpp index e3f02697bc2..0ddffaf6ec1 100644 --- a/test/mpi/distributed/row_gatherer.cpp +++ b/test/mpi/distributed/row_gatherer.cpp @@ -81,16 +81,20 @@ TYPED_TEST(RowGatherer, CanApplyAsync) auto b = Vector::create( this->exec, this->comm, gko::dim<2>{18, 1}, gko::initialize({offset, offset + 1, offset + 2}, this->exec)); - auto x = - Dense::create(this->mpi_exec, gko::dim<2>{this->rg->get_size()[0], 1}); + auto expected = this->template create_recv_connections()[rank]; + auto x = Vector::create(this->mpi_exec, this->comm, + gko::dim<2>{this->rg->get_size()[0], 1}, + gko::dim<2>{expected.get_size(), 1}); auto req = this->rg->apply_async(b, x); req.wait(); - auto expected = this->template create_recv_connections()[rank]; - auto expected_vec = Dense::create( - this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, expected, 1); - GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); + auto expected_vec = Vector::create( + this->mpi_exec, this->comm, gko::dim<2>{this->rg->get_size()[0], 1}, + Dense::create(this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, + expected, 1)); + GKO_ASSERT_MTX_NEAR(x->get_local_vector(), expected_vec->get_local_vector(), + 0.0); } @@ -103,16 +107,20 @@ TYPED_TEST(RowGatherer, CanApplyAsyncConsequetively) auto b = Vector::create( this->exec, this->comm, gko::dim<2>{18, 1}, gko::initialize({offset, offset + 1, offset + 2}, this->exec)); - auto x = - Dense::create(this->mpi_exec, gko::dim<2>{this->rg->get_size()[0], 1}); + auto expected = this->template create_recv_connections()[rank]; + auto x = Vector::create(this->mpi_exec, this->comm, + gko::dim<2>{this->rg->get_size()[0], 1}, + gko::dim<2>{expected.get_size(), 1}); this->rg->apply_async(b, x).wait(); this->rg->apply_async(b, x).wait(); - auto expected = this->template create_recv_connections()[rank]; - auto expected_vec = Dense::create( - this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, expected, 1); - GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); + auto expected_vec = Vector::create( + this->mpi_exec, this->comm, gko::dim<2>{this->rg->get_size()[0], 1}, + Dense::create(this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, + expected, 1)); + GKO_ASSERT_MTX_NEAR(x->get_local_vector(), expected_vec->get_local_vector(), + 0.0); } @@ -125,17 +133,21 @@ TYPED_TEST(RowGatherer, CanApplyAsyncWithWorkspace) auto b = Vector::create( this->exec, this->comm, gko::dim<2>{18, 1}, gko::initialize({offset, offset + 1, offset + 2}, this->exec)); - auto x = - Dense::create(this->mpi_exec, gko::dim<2>{this->rg->get_size()[0], 1}); + auto expected = this->template create_recv_connections()[rank]; + auto x = Vector::create(this->mpi_exec, this->comm, + gko::dim<2>{this->rg->get_size()[0], 1}, + gko::dim<2>{expected.get_size(), 1}); gko::array workspace; auto req = this->rg->apply_async(b, x, workspace); req.wait(); - auto expected = this->template create_recv_connections()[rank]; - auto expected_vec = Dense::create( - this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, expected, 1); - GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); + auto expected_vec = Vector::create( + this->mpi_exec, this->comm, gko::dim<2>{this->rg->get_size()[0], 1}, + Dense::create(this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, + expected, 1)); + GKO_ASSERT_MTX_NEAR(x->get_local_vector(), expected_vec->get_local_vector(), + 0.0); ASSERT_GT(workspace.get_size(), 0); } @@ -151,8 +163,10 @@ TYPED_TEST(RowGatherer, 
CanApplyAsyncMultipleTimesWithWorkspace) gko::initialize({offset, offset + 1, offset + 2}, this->exec)); auto b2 = gko::clone(b1); b2->scale(gko::initialize({-1}, this->exec)); - auto x1 = - Dense::create(this->mpi_exec, gko::dim<2>{this->rg->get_size()[0], 1}); + auto expected = this->template create_recv_connections()[rank]; + auto x1 = Vector::create(this->mpi_exec, this->comm, + gko::dim<2>{this->rg->get_size()[0], 1}, + gko::dim<2>{expected.get_size(), 1}); auto x2 = gko::clone(x1); gko::array workspace1; gko::array workspace2; @@ -162,13 +176,16 @@ TYPED_TEST(RowGatherer, CanApplyAsyncMultipleTimesWithWorkspace) req1.wait(); req2.wait(); - auto expected = this->template create_recv_connections()[rank]; - auto expected_vec1 = Dense::create( - this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, expected, 1); + auto expected_vec1 = Vector::create( + this->mpi_exec, this->comm, gko::dim<2>{this->rg->get_size()[0], 1}, + Dense::create(this->mpi_exec, gko::dim<2>{expected.get_size(), 1}, + expected, 1)); auto expected_vec2 = gko::clone(expected_vec1); expected_vec2->scale(gko::initialize({-1}, this->exec)); - GKO_ASSERT_MTX_NEAR(x1, expected_vec1, 0.0); - GKO_ASSERT_MTX_NEAR(x2, expected_vec2, 0.0); + GKO_ASSERT_MTX_NEAR(x1->get_local_vector(), + expected_vec1->get_local_vector(), 0.0); + GKO_ASSERT_MTX_NEAR(x2->get_local_vector(), + expected_vec2->get_local_vector(), 0.0); } @@ -184,11 +201,6 @@ TYPED_TEST(RowGatherer, CanApplyAsyncWithMultipleColumns) {offset + 1, offset * offset + 1}, {offset + 2, offset * offset + 2}}, this->exec)); - auto x = - Dense::create(this->mpi_exec, gko::dim<2>{this->rg->get_size()[0], 2}); - - this->rg->apply_async(b, x).wait(); - gko::array expected[] = { gko::array{this->mpi_exec, {3, 9, 5, 11, 10, 82, 11, 83}}, gko::array{this->mpi_exec, @@ -198,10 +210,19 @@ TYPED_TEST(RowGatherer, CanApplyAsyncWithMultipleColumns) gko::array{this->mpi_exec, {4, 10, 5, 11, 9, 81, 10, 82, 15, 225, 16, 226}}, gko::array{this->mpi_exec, {8, 38, 12, 144, 13, 145, 14, 146}}}; - auto expected_vec = Dense::create( - this->mpi_exec, gko::dim<2>{expected[rank].get_size() / 2, 2}, - expected[rank], 2); - GKO_ASSERT_MTX_NEAR(x, expected_vec, 0.0); + auto x = Vector::create(this->mpi_exec, this->comm, + gko::dim<2>{this->rg->get_size()[0], 2}, + gko::dim<2>{expected[rank].get_size() / 2, 2}); + + this->rg->apply_async(b, x).wait(); + + auto expected_vec = Vector::create( + this->mpi_exec, this->comm, gko::dim<2>{this->rg->get_size()[0], 2}, + Dense::create(this->mpi_exec, + gko::dim<2>{expected[rank].get_size() / 2, 2}, + expected[rank], 2)); + GKO_ASSERT_MTX_NEAR(x->get_local_vector(), expected_vec->get_local_vector(), + 0.0); } @@ -212,11 +233,10 @@ TYPED_TEST(RowGatherer, ThrowsOnNonMatchingExecutor) } using RowGatherer = typename TestFixture::row_gatherer_type; - using Dense = gko::matrix::Dense; using Vector = gko::experimental::distributed::Vector; auto rg = RowGatherer::create(this->exec, this->comm); auto b = Vector::create(this->exec, this->comm); - auto x = Dense::create(this->exec); + auto x = Vector::create(this->exec, this->comm); ASSERT_THROW(rg->apply_async(b, x).wait(), gko::InvalidStateError); }