diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index cdf363f6a87..906be6ff3b5 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -2468,7 +2468,7 @@ void spgemm(std::shared_ptr exec, } else { GKO_NOT_IMPLEMENTED; } -#else // GKO_COMPILING_CUDA +#else // GKO_COMPILING_CUDA auto a_vals = a->get_const_values(); auto a_row_ptrs = a->get_const_row_ptrs(); auto a_col_idxs = a->get_const_col_idxs(); @@ -2503,26 +2503,69 @@ void spgemm(std::shared_ptr exec, const_cast(b_col_idxs), const_cast(b_vals)); auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index, null_value); + auto spgemm_alg = CUSPARSE_SPGEMM_ALG1; - // estimate work size_type buffer1_size{}; - sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, - c_descr, spgemm_descr, buffer1_size, - nullptr); - array buffer1{exec, buffer1_size}; - sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, - c_descr, spgemm_descr, buffer1_size, - buffer1.get_data()); - - // compute spgemm size_type buffer2_size{}; - sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr, buffer1.get_data(), buffer2_size, - nullptr); - array buffer2{exec, buffer2_size}; - sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr, buffer1.get_data(), buffer2_size, - buffer2.get_data()); + array buffer1{exec}; + array buffer2{exec}; + + // Try CUSPARSE_SPGEMM_ALG1 first as it is fastest for small matrices + try { + // Memory estimate for Alg1 + sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, + &beta, c_descr, spgemm_descr, + spgemm_alg, buffer1_size, nullptr); + buffer1.resize_and_reset(buffer1_size); + sparselib::spgemm_work_estimation( + handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr, + spgemm_alg, buffer1_size, buffer1.get_data()); + 
sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, spgemm_alg, + buffer1.get_data(), buffer2_size, nullptr); + // compute spgemm + buffer2.resize_and_reset(buffer2_size); + sparselib::spgemm_compute( + handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr, + spgemm_alg, buffer1.get_data(), buffer2_size, buffer2.get_data()); + } + + catch (const CusparseError& cse) { + // If estimated buffer size is too large and CUDA >= 12.0, fall back to + // ALG2 +#if CUDA_VERSION >= 12000 + if (cse.get_error_code() == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { + spgemm_alg = CUSPARSE_SPGEMM_ALG2; + // Memory estimate for Alg2/Alg3 + sparselib::spgemm_work_estimation( + handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr, + spgemm_alg, buffer1_size, nullptr); + buffer1.resize_and_reset(buffer1_size); + sparselib::spgemm_work_estimation( + handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr, + spgemm_alg, buffer1_size, buffer1.get_data()); + size_type buffer3_size{}; + sparselib::spgemm_estimate_memory( + handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr, + spgemm_alg, 1.0f, buffer3_size, nullptr, nullptr); + array buffer3{exec, buffer3_size}; + sparselib::spgemm_estimate_memory( + handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr, + spgemm_alg, 1.0f, buffer3_size, buffer3.get_data(), + &buffer2_size); + buffer2.resize_and_reset(buffer2_size); + // compute spgemm + sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, spgemm_alg, + buffer1.get_data(), buffer2_size, + buffer2.get_data()); + } else { + throw(cse); + } +#else // CUDA_VERSION < 12000 + throw(cse); +#endif + } // copy data to result auto c_nnz = sparselib::sparse_matrix_nnz(c_descr); @@ -2533,7 +2576,7 @@ void spgemm(std::shared_ptr exec, c_vals_array.get_data()); sparselib::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr); + spgemm_descr, spgemm_alg); 
sparselib::destroy(c_descr); sparselib::destroy(b_descr); @@ -2632,7 +2675,7 @@ void advanced_spgemm(std::shared_ptr exec, } else { GKO_NOT_IMPLEMENTED; } -#else // GKO_COMPILING_CUDA +#else // GKO_COMPILING_CUDA auto handle = exec->get_sparselib_handle(); sparselib::pointer_mode_guard pm_guard(handle); @@ -2669,26 +2712,70 @@ void advanced_spgemm(std::shared_ptr exec, const_cast(b_col_idxs), const_cast(b_vals)); auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index, null_value); + auto spgemm_alg = CUSPARSE_SPGEMM_ALG1; - // estimate work size_type buffer1_size{}; - sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, - &zero_val, c_descr, spgemm_descr, - buffer1_size, nullptr); - array buffer1{exec, buffer1_size}; - sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, - &zero_val, c_descr, spgemm_descr, - buffer1_size, buffer1.get_data()); - - // compute spgemm size_type buffer2_size{}; - sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr, buffer1.get_data(), - buffer2_size, nullptr); - array buffer2{exec, buffer2_size}; - sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr, buffer1.get_data(), - buffer2_size, buffer2.get_data()); + array buffer1{exec}; + array buffer2{exec}; + + // Try CUSPARSE_SPGEMM_ALG1 first as it is fastest for small matrices + try { + // Memory estimate for Alg1 + sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, + &zero_val, c_descr, spgemm_descr, + spgemm_alg, buffer1_size, nullptr); + buffer1.resize_and_reset(buffer1_size); + sparselib::spgemm_work_estimation( + handle, &one_val, a_descr, b_descr, &zero_val, c_descr, + spgemm_descr, spgemm_alg, buffer1_size, buffer1.get_data()); + sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, + c_descr, spgemm_descr, spgemm_alg, + buffer1.get_data(), buffer2_size, nullptr); + // compute spgemm + 
buffer2.resize_and_reset(buffer2_size); + sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, + c_descr, spgemm_descr, spgemm_alg, + buffer1.get_data(), buffer2_size, + buffer2.get_data()); + } + + catch (const CusparseError& cse) { + // If estimated buffer size is too large and CUDA >= 12.0, fall back to + // ALG2 +#if CUDA_VERSION >= 12000 + if (cse.get_error_code() == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { + spgemm_alg = CUSPARSE_SPGEMM_ALG2; + // Memory estimate for Alg2/Alg3 + sparselib::spgemm_work_estimation( + handle, &one_val, a_descr, b_descr, &zero_val, c_descr, + spgemm_descr, spgemm_alg, buffer1_size, nullptr); + buffer1.resize_and_reset(buffer1_size); + sparselib::spgemm_work_estimation( + handle, &one_val, a_descr, b_descr, &zero_val, c_descr, + spgemm_descr, spgemm_alg, buffer1_size, buffer1.get_data()); + size_type buffer3_size{}; + sparselib::spgemm_estimate_memory( + handle, &one_val, a_descr, b_descr, &zero_val, c_descr, + spgemm_descr, spgemm_alg, 1.0f, buffer3_size, nullptr, nullptr); + array buffer3{exec, buffer3_size}; + sparselib::spgemm_estimate_memory( + handle, &one_val, a_descr, b_descr, &zero_val, c_descr, + spgemm_descr, spgemm_alg, 1.0f, buffer3_size, + buffer3.get_data(), &buffer2_size); + buffer2.resize_and_reset(buffer2_size); + // compute spgemm + sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, + &zero_val, c_descr, spgemm_descr, + spgemm_alg, buffer1.get_data(), + buffer2_size, buffer2.get_data()); + } else { + throw(cse); + } +#else // CUDA_VERSION < 12000 + throw(cse); +#endif + } // write result to temporary storage auto c_tmp_nnz = sparselib::sparse_matrix_nnz(c_descr); @@ -2700,7 +2787,7 @@ void advanced_spgemm(std::shared_ptr exec, c_tmp_vals_array.get_data()); sparselib::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr); + c_descr, spgemm_descr, spgemm_alg); sparselib::destroy(c_descr); sparselib::destroy(b_descr); diff --git a/contributors.txt 
b/contributors.txt index aec120d93dd..2d9be655046 100644 --- a/contributors.txt +++ b/contributors.txt @@ -25,3 +25,4 @@ Olenik Gregor HPSim Ribizel Tobias Karlsruhe Institute of Technology Riemer Lukas Karlsruhe Institute of Technology Tsai Yuhsiang National Taiwan University +Ilya Zilberter Tech-X Corporation diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index 4be00b88aaf..fe58716a02c 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -208,13 +208,31 @@ void spgemm_work_estimation(cusparseHandle_t handle, const ValueType* alpha, cusparseSpMatDescr_t b_descr, const ValueType* beta, cusparseSpMatDescr_t c_descr, cusparseSpGEMMDescr_t spgemm_descr, + cusparseSpGEMMAlg_t spgemm_alg, size_type& buffer1_size, void* buffer1) { GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_workEstimation( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta, - c_descr, cuda_data_type(), CUSPARSE_SPGEMM_DEFAULT, - spgemm_descr, &buffer1_size, buffer1)); + c_descr, cuda_data_type(), spgemm_alg, spgemm_descr, + &buffer1_size, buffer1)); +} + +template +void spgemm_estimate_memory(cusparseHandle_t handle, const ValueType* alpha, + cusparseSpMatDescr_t a_descr, + cusparseSpMatDescr_t b_descr, const ValueType* beta, + cusparseSpMatDescr_t c_descr, + cusparseSpGEMMDescr_t spgemm_descr, + cusparseSpGEMMAlg_t spgemm_alg, + float chunk_fraction, size_type& buffer3_size, + void* buffer3, size_type* buffer2_size) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_estimateMemory( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta, + c_descr, cuda_data_type(), spgemm_alg, spgemm_descr, + chunk_fraction, &buffer3_size, buffer3, buffer2_size)); } @@ 
-222,14 +240,15 @@ template void spgemm_compute(cusparseHandle_t handle, const ValueType* alpha, cusparseSpMatDescr_t a_descr, cusparseSpMatDescr_t b_descr, const ValueType* beta, cusparseSpMatDescr_t c_descr, - cusparseSpGEMMDescr_t spgemm_descr, void* buffer1, + cusparseSpGEMMDescr_t spgemm_descr, + cusparseSpGEMMAlg_t spgemm_alg, void* buffer1, size_type& buffer2_size, void* buffer2) { GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_compute( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta, - c_descr, cuda_data_type(), CUSPARSE_SPGEMM_DEFAULT, - spgemm_descr, &buffer2_size, buffer2)); + c_descr, cuda_data_type(), spgemm_alg, spgemm_descr, + &buffer2_size, buffer2)); } @@ -237,13 +256,13 @@ template void spgemm_copy(cusparseHandle_t handle, const ValueType* alpha, cusparseSpMatDescr_t a_descr, cusparseSpMatDescr_t b_descr, const ValueType* beta, cusparseSpMatDescr_t c_descr, - cusparseSpGEMMDescr_t spgemm_descr) + cusparseSpGEMMDescr_t spgemm_descr, + cusparseSpGEMMAlg_t spgemm_alg) { - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSpGEMM_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, - b_descr, beta, c_descr, cuda_data_type(), - CUSPARSE_SPGEMM_DEFAULT, spgemm_descr)); + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_copy( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta, + c_descr, cuda_data_type(), spgemm_alg, spgemm_descr)); } diff --git a/cuda/base/exception.cpp b/cuda/base/exception.cpp index 7bb7fae5bd5..1f7d90113a4 100644 --- a/cuda/base/exception.cpp +++ b/cuda/base/exception.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -6,6 +6,7 @@ #include +#include #include #include #include diff --git a/include/ginkgo/core/base/exception.hpp 
b/include/ginkgo/core/base/exception.hpp index febc5e17034..c60834c23dc 100644 --- a/include/ginkgo/core/base/exception.hpp +++ b/include/ginkgo/core/base/exception.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -250,11 +250,18 @@ class CusparseError : public Error { */ CusparseError(const std::string& file, int line, const std::string& func, int64 error_code) - : Error(file, line, func + ": " + get_error(error_code)) + : Error(file, line, func + ": " + get_error(error_code)), + err_code(error_code) {} + /** + * Returns the error code + */ + int64 get_error_code() const noexcept { return err_code; } + private: static std::string get_error(int64 error_code); + const int64 err_code; };