From 1ca77276a93e4067cfe0ac5ee6aa43555436960c Mon Sep 17 00:00:00 2001
From: Ilya Zilberter <izilberter@txcorp.com>
Date: Tue, 18 Feb 2025 21:33:53 -0700
Subject: [PATCH 1/3] Add fallback to lower-memory cuSPARSE SpGEMM algorithm

Address an issue where the default cuSPARSE SpGEMM algorithm
estimates an overly large memory buffer for matrices with more
than ~4 million rows, causing a memory allocation exception
regardless of the actual GPU memory capacity. CUDA 12.0
introduced alternative, less memory-intensive SpGEMM algorithms
that avoid this issue.

The spgemm and advanced_spgemm CUDA routines now first attempt
to compute the matrix product using the default
CUSPARSE_SPGEMM_ALG1 algorithm. If that throws a CusparseError
and the build uses CUDA >= 12.0, they fall back to
CUSPARSE_SPGEMM_ALG2; with older CUDA versions the error is
rethrown. Update the cuSPARSE bindings for SpGEMM-related
functions to take the algorithm as an argument.
---
 .../cuda_hip/matrix/csr_kernels.template.cpp  | 153 +++++++++++++-----
 cuda/base/cusparse_bindings.hpp               |  43 +++--
 2 files changed, 146 insertions(+), 50 deletions(-)
diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
index cdf363f6a87..8aa8d644482 100644
--- a/common/cuda_hip/matrix/csr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -2468,7 +2468,7 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
     } else {
         GKO_NOT_IMPLEMENTED;
     }
-#else   // GKO_COMPILING_CUDA
+#else  // GKO_COMPILING_CUDA
     auto a_vals = a->get_const_values();
     auto a_row_ptrs = a->get_const_row_ptrs();
     auto a_col_idxs = a->get_const_col_idxs();
@@ -2503,26 +2503,63 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
         const_cast<IndexType*>(b_col_idxs), const_cast<ValueType*>(b_vals));
     auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index,
                                          null_value);
+    auto spgemm_alg = CUSPARSE_SPGEMM_ALG1;
 
-    // estimate work
     size_type buffer1_size{};
-    sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
-                                      c_descr, spgemm_descr, buffer1_size,
-                                      nullptr);
-    array<char> buffer1{exec, buffer1_size};
-    sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
-                                      c_descr, spgemm_descr, buffer1_size,
-                                      buffer1.get_data());
-
-    // compute spgemm
     size_type buffer2_size{};
-    sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                              spgemm_descr, buffer1.get_data(), buffer2_size,
-                              nullptr);
-    array<char> buffer2{exec, buffer2_size};
-    sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                              spgemm_descr, buffer1.get_data(), buffer2_size,
-                              buffer2.get_data());
+    array<char> buffer1{exec};
+    array<char> buffer2{exec};
+
+    // Try CUSPARSE_SPGEMM_ALG1 first as it is fastest for small matrices
+    try {
+        // Memory estimate for Alg1
+        sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr,
+                                          &beta, c_descr, spgemm_descr,
+                                          spgemm_alg, buffer1_size, nullptr);
+        buffer1.resize_and_reset(buffer1_size);
+        sparselib::spgemm_work_estimation(
+            handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
+            spgemm_alg, buffer1_size, buffer1.get_data());
+        sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta,
+                                  c_descr, spgemm_descr, spgemm_alg,
+                                  buffer1.get_data(), buffer2_size, nullptr);
+        // compute spgemm
+        buffer2.resize_and_reset(buffer2_size);
+        sparselib::spgemm_compute(
+            handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
+            spgemm_alg, buffer1.get_data(), buffer2_size, buffer2.get_data());
+    }
+
+    catch (const CusparseError& cse) {
+        // If estimated buffer size is too large and CUDA > 12.0,  fall back to
+        // ALG2
+#if CUDA_VERSION >= 12000
+        spgemm_alg = CUSPARSE_SPGEMM_ALG2;
+        // Memory estimate for Alg2/Alg3
+        sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr,
+                                          &beta, c_descr, spgemm_descr,
+                                          spgemm_alg, buffer1_size, nullptr);
+        buffer1.resize_and_reset(buffer1_size);
+        sparselib::spgemm_work_estimation(
+            handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
+            spgemm_alg, buffer1_size, buffer1.get_data());
+        size_type buffer3_size{};
+        sparselib::spgemm_estimate_memory(
+            handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
+            spgemm_alg, 1.0f, buffer3_size, nullptr, nullptr);
+        array<char> buffer3{exec, buffer3_size};
+        sparselib::spgemm_estimate_memory(
+            handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
+            spgemm_alg, 1.0f, buffer3_size, buffer3.get_data(), &buffer2_size);
+        buffer2.resize_and_reset(buffer2_size);
+        // compute spgemm
+        sparselib::spgemm_compute(
+            handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
+            spgemm_alg, buffer1.get_data(), buffer2_size, buffer2.get_data());
+#else  // CUDA_VERSION < 12000
+        throw(cse);
+#endif
+    }
 
     // copy data to result
     auto c_nnz = sparselib::sparse_matrix_nnz(c_descr);
@@ -2533,7 +2570,7 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
                                 c_vals_array.get_data());
 
     sparselib::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                           spgemm_descr);
+                           spgemm_descr, spgemm_alg);
 
     sparselib::destroy(c_descr);
     sparselib::destroy(b_descr);
@@ -2632,7 +2669,7 @@ void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
     } else {
         GKO_NOT_IMPLEMENTED;
     }
-#else   // GKO_COMPILING_CUDA
+#else  // GKO_COMPILING_CUDA
     auto handle = exec->get_sparselib_handle();
     sparselib::pointer_mode_guard pm_guard(handle);
 
@@ -2669,26 +2706,66 @@ void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
         const_cast<IndexType*>(b_col_idxs), const_cast<ValueType*>(b_vals));
     auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index,
                                          null_value);
+    auto spgemm_alg = CUSPARSE_SPGEMM_ALG1;
 
-    // estimate work
     size_type buffer1_size{};
-    sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
-                                      &zero_val, c_descr, spgemm_descr,
-                                      buffer1_size, nullptr);
-    array<char> buffer1{exec, buffer1_size};
-    sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
-                                      &zero_val, c_descr, spgemm_descr,
-                                      buffer1_size, buffer1.get_data());
-
-    // compute spgemm
     size_type buffer2_size{};
-    sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
-                              c_descr, spgemm_descr, buffer1.get_data(),
-                              buffer2_size, nullptr);
-    array<char> buffer2{exec, buffer2_size};
-    sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
-                              c_descr, spgemm_descr, buffer1.get_data(),
-                              buffer2_size, buffer2.get_data());
+    array<char> buffer1{exec};
+    array<char> buffer2{exec};
+
+    // Try CUSPARSE_SPGEMM_ALG1 first as it is fastest for small matrices
+    try {
+        // Memory estimate for Alg1
+        sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                          &zero_val, c_descr, spgemm_descr,
+                                          spgemm_alg, buffer1_size, nullptr);
+        buffer1.resize_and_reset(buffer1_size);
+        sparselib::spgemm_work_estimation(
+            handle, &one_val, a_descr, b_descr, &zero_val, c_descr,
+            spgemm_descr, spgemm_alg, buffer1_size, buffer1.get_data());
+        sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                                  c_descr, spgemm_descr, spgemm_alg,
+                                  buffer1.get_data(), buffer2_size, nullptr);
+        // compute spgemm
+        buffer2.resize_and_reset(buffer2_size);
+        sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                                  c_descr, spgemm_descr, spgemm_alg,
+                                  buffer1.get_data(), buffer2_size,
+                                  buffer2.get_data());
+    }
+
+    catch (const CusparseError& cse) {
+        // If estimated buffer size is too large and CUDA > 12.0,  fall back to
+        // ALG2
+#if CUDA_VERSION >= 12000
+        spgemm_alg = CUSPARSE_SPGEMM_ALG2;
+        // Memory estimate for Alg2/Alg3
+        sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                          &zero_val, c_descr, spgemm_descr,
+                                          spgemm_alg, buffer1_size, nullptr);
+        buffer1.resize_and_reset(buffer1_size);
+        sparselib::spgemm_work_estimation(
+            handle, &one_val, a_descr, b_descr, &zero_val, c_descr,
+            spgemm_descr, spgemm_alg, buffer1_size, buffer1.get_data());
+        size_type buffer3_size{};
+        sparselib::spgemm_estimate_memory(
+            handle, &one_val, a_descr, b_descr, &zero_val, c_descr,
+            spgemm_descr, spgemm_alg, 1.0f, buffer3_size, nullptr, nullptr);
+        array<char> buffer3{exec, buffer3_size};
+        sparselib::spgemm_estimate_memory(handle, &one_val, a_descr, b_descr,
+                                          &zero_val, c_descr, spgemm_descr,
+                                          spgemm_alg, 1.0f, buffer3_size,
+                                          buffer3.get_data(), &buffer2_size);
+        buffer2.resize_and_reset(buffer2_size);
+        // compute spgemm
+        sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                                  c_descr, spgemm_descr, spgemm_alg,
+                                  buffer1.get_data(), buffer2_size,
+                                  buffer2.get_data());
+#else  // CUDA_VERSION < 12000
+        throw(cse);
+#endif
+    }
 
     // write result to temporary storage
     auto c_tmp_nnz = sparselib::sparse_matrix_nnz(c_descr);
@@ -2700,7 +2777,7 @@ void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
                                 c_tmp_vals_array.get_data());
 
     sparselib::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val,
-                           c_descr, spgemm_descr);
+                           c_descr, spgemm_descr, spgemm_alg);
 
     sparselib::destroy(c_descr);
     sparselib::destroy(b_descr);
diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp
index 4be00b88aaf..fe58716a02c 100644
--- a/cuda/base/cusparse_bindings.hpp
+++ b/cuda/base/cusparse_bindings.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -208,13 +208,31 @@ void spgemm_work_estimation(cusparseHandle_t handle, const ValueType* alpha,
                             cusparseSpMatDescr_t b_descr, const ValueType* beta,
                             cusparseSpMatDescr_t c_descr,
                             cusparseSpGEMMDescr_t spgemm_descr,
+                            cusparseSpGEMMAlg_t spgemm_alg,
                             size_type& buffer1_size, void* buffer1)
 {
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_workEstimation(
         handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
         CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta,
-        c_descr, cuda_data_type<ValueType>(), CUSPARSE_SPGEMM_DEFAULT,
-        spgemm_descr, &buffer1_size, buffer1));
+        c_descr, cuda_data_type<ValueType>(), spgemm_alg, spgemm_descr,
+        &buffer1_size, buffer1));
+}
+
+template <typename ValueType>
+void spgemm_estimate_memory(cusparseHandle_t handle, const ValueType* alpha,
+                            cusparseSpMatDescr_t a_descr,
+                            cusparseSpMatDescr_t b_descr, const ValueType* beta,
+                            cusparseSpMatDescr_t c_descr,
+                            cusparseSpGEMMDescr_t spgemm_descr,
+                            cusparseSpGEMMAlg_t spgemm_alg,
+                            float chunk_fraction, size_type& buffer3_size,
+                            void* buffer3, size_type* buffer2_size)
+{
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_estimateMemory(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta,
+        c_descr, cuda_data_type<ValueType>(), spgemm_alg, spgemm_descr,
+        chunk_fraction, &buffer3_size, buffer3, buffer2_size));
 }
 
 
@@ -222,14 +240,15 @@ template <typename ValueType>
 void spgemm_compute(cusparseHandle_t handle, const ValueType* alpha,
                     cusparseSpMatDescr_t a_descr, cusparseSpMatDescr_t b_descr,
                     const ValueType* beta, cusparseSpMatDescr_t c_descr,
-                    cusparseSpGEMMDescr_t spgemm_descr, void* buffer1,
+                    cusparseSpGEMMDescr_t spgemm_descr,
+                    cusparseSpGEMMAlg_t spgemm_alg, void* buffer1,
                     size_type& buffer2_size, void* buffer2)
 {
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_compute(
         handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
         CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta,
-        c_descr, cuda_data_type<ValueType>(), CUSPARSE_SPGEMM_DEFAULT,
-        spgemm_descr, &buffer2_size, buffer2));
+        c_descr, cuda_data_type<ValueType>(), spgemm_alg, spgemm_descr,
+        &buffer2_size, buffer2));
 }
 
 
@@ -237,13 +256,13 @@ template <typename ValueType>
 void spgemm_copy(cusparseHandle_t handle, const ValueType* alpha,
                  cusparseSpMatDescr_t a_descr, cusparseSpMatDescr_t b_descr,
                  const ValueType* beta, cusparseSpMatDescr_t c_descr,
-                 cusparseSpGEMMDescr_t spgemm_descr)
+                 cusparseSpGEMMDescr_t spgemm_descr,
+                 cusparseSpGEMMAlg_t spgemm_alg)
 {
-    GKO_ASSERT_NO_CUSPARSE_ERRORS(
-        cusparseSpGEMM_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                            CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr,
-                            b_descr, beta, c_descr, cuda_data_type<ValueType>(),
-                            CUSPARSE_SPGEMM_DEFAULT, spgemm_descr));
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_copy(
+        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta,
+        c_descr, cuda_data_type<ValueType>(), spgemm_alg, spgemm_descr));
 }
 
 

From 8c9ddb3f78f8d61102fd05e9dc71afcc79143210 Mon Sep 17 00:00:00 2001
From: Ilya Zilberter <izilberter@txcorp.com>
Date: Thu, 20 Feb 2025 09:42:25 -0700
Subject: [PATCH 2/3] Fall back to cuSPARSE SpGEMM ALG2 only on
 CUSPARSE_STATUS_INSUFFICIENT_RESOURCES

---
 .../cuda_hip/matrix/csr_kernels.template.cpp  | 104 ++++++++++--------
 cuda/base/exception.cpp                       |   3 +-
 2 files changed, 60 insertions(+), 47 deletions(-)

diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
index 8aa8d644482..ed68a30898f 100644
--- a/common/cuda_hip/matrix/csr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -2534,28 +2534,35 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
         // If estimated buffer size is too large and CUDA > 12.0,  fall back to
         // ALG2
 #if CUDA_VERSION >= 12000
-        spgemm_alg = CUSPARSE_SPGEMM_ALG2;
-        // Memory estimate for Alg2/Alg3
-        sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr,
-                                          &beta, c_descr, spgemm_descr,
-                                          spgemm_alg, buffer1_size, nullptr);
-        buffer1.resize_and_reset(buffer1_size);
-        sparselib::spgemm_work_estimation(
-            handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
-            spgemm_alg, buffer1_size, buffer1.get_data());
-        size_type buffer3_size{};
-        sparselib::spgemm_estimate_memory(
-            handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
-            spgemm_alg, 1.0f, buffer3_size, nullptr, nullptr);
-        array<char> buffer3{exec, buffer3_size};
-        sparselib::spgemm_estimate_memory(
-            handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
-            spgemm_alg, 1.0f, buffer3_size, buffer3.get_data(), &buffer2_size);
-        buffer2.resize_and_reset(buffer2_size);
-        // compute spgemm
-        sparselib::spgemm_compute(
-            handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
-            spgemm_alg, buffer1.get_data(), buffer2_size, buffer2.get_data());
+        const char* error_code = "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES";
+        if (strstr(cse.what(), error_code)) {
+            spgemm_alg = CUSPARSE_SPGEMM_ALG2;
+            // Memory estimate for Alg2/Alg3
+            sparselib::spgemm_work_estimation(
+                handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
+                spgemm_alg, buffer1_size, nullptr);
+            buffer1.resize_and_reset(buffer1_size);
+            sparselib::spgemm_work_estimation(
+                handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
+                spgemm_alg, buffer1_size, buffer1.get_data());
+            size_type buffer3_size{};
+            sparselib::spgemm_estimate_memory(
+                handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
+                spgemm_alg, 1.0f, buffer3_size, nullptr, nullptr);
+            array<char> buffer3{exec, buffer3_size};
+            sparselib::spgemm_estimate_memory(
+                handle, &alpha, a_descr, b_descr, &beta, c_descr, spgemm_descr,
+                spgemm_alg, 1.0f, buffer3_size, buffer3.get_data(),
+                &buffer2_size);
+            buffer2.resize_and_reset(buffer2_size);
+            // compute spgemm
+            sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta,
+                                      c_descr, spgemm_descr, spgemm_alg,
+                                      buffer1.get_data(), buffer2_size,
+                                      buffer2.get_data());
+        } else {
+            throw(cse);
+        }
 #else  // CUDA_VERSION < 12000
         throw(cse);
 #endif
@@ -2738,30 +2745,35 @@ void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
         // If estimated buffer size is too large and CUDA > 12.0,  fall back to
         // ALG2
 #if CUDA_VERSION >= 12000
-        spgemm_alg = CUSPARSE_SPGEMM_ALG2;
-        // Memory estimate for Alg2/Alg3
-        sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
-                                          &zero_val, c_descr, spgemm_descr,
-                                          spgemm_alg, buffer1_size, nullptr);
-        buffer1.resize_and_reset(buffer1_size);
-        sparselib::spgemm_work_estimation(
-            handle, &one_val, a_descr, b_descr, &zero_val, c_descr,
-            spgemm_descr, spgemm_alg, buffer1_size, buffer1.get_data());
-        size_type buffer3_size{};
-        sparselib::spgemm_estimate_memory(
-            handle, &one_val, a_descr, b_descr, &zero_val, c_descr,
-            spgemm_descr, spgemm_alg, 1.0f, buffer3_size, nullptr, nullptr);
-        array<char> buffer3{exec, buffer3_size};
-        sparselib::spgemm_estimate_memory(handle, &one_val, a_descr, b_descr,
-                                          &zero_val, c_descr, spgemm_descr,
-                                          spgemm_alg, 1.0f, buffer3_size,
-                                          buffer3.get_data(), &buffer2_size);
-        buffer2.resize_and_reset(buffer2_size);
-        // compute spgemm
-        sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
-                                  c_descr, spgemm_descr, spgemm_alg,
-                                  buffer1.get_data(), buffer2_size,
-                                  buffer2.get_data());
+        const char* error_code = "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES";
+        if (strstr(cse.what(), error_code)) {
+            spgemm_alg = CUSPARSE_SPGEMM_ALG2;
+            // Memory estimate for Alg2/Alg3
+            sparselib::spgemm_work_estimation(
+                handle, &one_val, a_descr, b_descr, &zero_val, c_descr,
+                spgemm_descr, spgemm_alg, buffer1_size, nullptr);
+            buffer1.resize_and_reset(buffer1_size);
+            sparselib::spgemm_work_estimation(
+                handle, &one_val, a_descr, b_descr, &zero_val, c_descr,
+                spgemm_descr, spgemm_alg, buffer1_size, buffer1.get_data());
+            size_type buffer3_size{};
+            sparselib::spgemm_estimate_memory(
+                handle, &one_val, a_descr, b_descr, &zero_val, c_descr,
+                spgemm_descr, spgemm_alg, 1.0f, buffer3_size, nullptr, nullptr);
+            array<char> buffer3{exec, buffer3_size};
+            sparselib::spgemm_estimate_memory(
+                handle, &one_val, a_descr, b_descr, &zero_val, c_descr,
+                spgemm_descr, spgemm_alg, 1.0f, buffer3_size,
+                buffer3.get_data(), &buffer2_size);
+            buffer2.resize_and_reset(buffer2_size);
+            // compute spgemm
+            sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr,
+                                      &zero_val, c_descr, spgemm_descr,
+                                      spgemm_alg, buffer1.get_data(),
+                                      buffer2_size, buffer2.get_data());
+        } else {
+            throw(cse);
+        }
 #else  // CUDA_VERSION < 12000
         throw(cse);
 #endif
diff --git a/cuda/base/exception.cpp b/cuda/base/exception.cpp
index 7bb7fae5bd5..1f7d90113a4 100644
--- a/cuda/base/exception.cpp
+++ b/cuda/base/exception.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -6,6 +6,7 @@
 
 #include <string>
 
+#include <cuda.h>
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cufft.h>

From f4d20f626b7259c26d277c8902355bbd269b803c Mon Sep 17 00:00:00 2001
From: Ilya Zilberter <izilberter@txcorp.com>
Date: Thu, 20 Feb 2025 15:19:47 -0700
Subject: [PATCH 3/3] Add accessor for CusparseError error code

Use this to check for CUSPARSE_STATUS_INSUFFICIENT_RESOURCES
when falling back to SpGEMM ALG2, replacing the substring match
on the exception's what() message.
---
 common/cuda_hip/matrix/csr_kernels.template.cpp |  6 ++----
 contributors.txt                                |  1 +
 include/ginkgo/core/base/exception.hpp          | 11 +++++++++--
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
index ed68a30898f..906be6ff3b5 100644
--- a/common/cuda_hip/matrix/csr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -2534,8 +2534,7 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
         // If estimated buffer size is too large and CUDA > 12.0,  fall back to
         // ALG2
 #if CUDA_VERSION >= 12000
-        const char* error_code = "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES";
-        if (strstr(cse.what(), error_code)) {
+        if (cse.get_error_code() == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) {
             spgemm_alg = CUSPARSE_SPGEMM_ALG2;
             // Memory estimate for Alg2/Alg3
             sparselib::spgemm_work_estimation(
@@ -2745,8 +2744,7 @@ void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
         // If estimated buffer size is too large and CUDA > 12.0,  fall back to
         // ALG2
 #if CUDA_VERSION >= 12000
-        const char* error_code = "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES";
-        if (strstr(cse.what(), error_code)) {
+        if (cse.get_error_code() == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) {
             spgemm_alg = CUSPARSE_SPGEMM_ALG2;
             // Memory estimate for Alg2/Alg3
             sparselib::spgemm_work_estimation(
diff --git a/contributors.txt b/contributors.txt
index aec120d93dd..2d9be655046 100644
--- a/contributors.txt
+++ b/contributors.txt
@@ -25,3 +25,4 @@ Olenik Gregor <go@hpsim.de> HPSim
 Ribizel Tobias <mail@upsj.de> Karlsruhe Institute of Technology
 Riemer Lukas <lksriemer@gmail.com> Karlsruhe Institute of Technology
 Tsai Yuhsiang <yhmtsai@gmail.com> National Taiwan University
+Zilberter Ilya <izilberter@txcorp.com> Tech-X Corporation
diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp
index febc5e17034..c60834c23dc 100644
--- a/include/ginkgo/core/base/exception.hpp
+++ b/include/ginkgo/core/base/exception.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -250,11 +250,18 @@ class CusparseError : public Error {
      */
     CusparseError(const std::string& file, int line, const std::string& func,
                   int64 error_code)
-        : Error(file, line, func + ": " + get_error(error_code))
+        : Error(file, line, func + ": " + get_error(error_code)),
+          err_code(error_code)
     {}
 
+    /**
+     * Returns the error code
+     */
+    int64 get_error_code() const noexcept { return err_code; }
+
 private:
     static std::string get_error(int64 error_code);
+    const int64 err_code;
 };