Update mkl_lapack::batch_error handling in getrf_batch / getri_batch (#2458)

vlad-perevezentsev · web-flow · commit 56ccf864ccd6 · 2025-05-19T23:36:13.000+02:00
This PR suggests improvements to the handling of
`oneapi::mkl::lapack::batch_error` in `oneapi::mkl::lapack::getrf_batch`
and `oneapi::mkl::lapack::getri_batch`

OneMKL batched functions throw a single
`oneapi::mkl::lapack::batch_error` that covers all failed matrices
(reported via `ids()`); it is interpreted as a computation_error.
`dev_info[...]` is now set to 1 for each failed matrix to mark it as
singular.

This change makes `dpnp.linalg.inv()` raise `LinAlgError` consistently
for both non-batched and batched singular matrices, and allows
unskipping the singular-matrix tests for `dpnp.linalg.inv()`,
`dpnp.linalg.det()` and `dpnp.linalg.slogdet()`.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -47,6 +47,7 @@ This release achieves 100% compliance with Python Array API specification (revis
 * Updated `conda create` commands build and install instructions of `Quick start guide` to avoid a compilation error [#2395](https://github.com/IntelPython/dpnp/pull/2395)
 * Added handling of empty string passed to a test env variable defining data type scope as a `False` value [#2415](https://github.com/IntelPython/dpnp/pull/2415)
 * Resolved build issues on non-Intel targets in `dpnp.i0` and `dpnp.kaiser` [#2439](https://github.com/IntelPython/dpnp/pull/2439)
+* Ensure consistency in the `dpnp.linalg.LinAlgError` exception raised on singular input matrices for both non-batched and batched cases in `dpnp.linalg.inv` [#2458](https://github.com/IntelPython/dpnp/pull/2458)
 
 
 ## [0.17.0] - 02/26/2025
diff --git a/dpnp/backend/extensions/lapack/getrf_batch.cpp b/dpnp/backend/extensions/lapack/getrf_batch.cpp
@@ -110,22 +110,24 @@ static sycl::event getrf_batch_impl(sycl::queue &exec_q,
         // Get the indices of matrices within the batch that encountered an
         // error
         auto error_matrices_ids = be.ids();
-        // Get the indices of the first zero diagonal elements of these matrices
-        auto error_info = be.exceptions();
 
         auto error_matrices_ids_size = error_matrices_ids.size();
         auto dev_info_size = static_cast<std::size_t>(py::len(dev_info));
-        if (error_matrices_ids_size != dev_info_size) {
-            throw py::value_error("The size of `dev_info` must be equal to " +
+        if (error_matrices_ids_size > dev_info_size) {
+            throw py::value_error("The size of `dev_info` must be greater than"
+                                  " or equal to " +
                                   std::to_string(error_matrices_ids_size) +
                                   ", but currently it is " +
                                   std::to_string(dev_info_size) + ".");
         }
 
+        // OneMKL batched functions throw a single `batch_error`
+        // instead of per-matrix exceptions or an info array.
+        // This is interpreted as a computation_error (singular matrix),
+        // consistent with non-batched LAPACK behavior.
+        // Set dev_info[...] to any positive value for each failed index.
         for (size_t i = 0; i < error_matrices_ids.size(); ++i) {
-            // Assign the index of the first zero diagonal element in each
-            // error matrix to the corresponding index in 'dev_info'
-            dev_info[error_matrices_ids[i]] = error_info[i];
+            dev_info[error_matrices_ids[i]] = 1;
         }
     } catch (mkl_lapack::exception const &e) {
         is_exception_caught = true;
diff --git a/dpnp/backend/extensions/lapack/getri_batch.cpp b/dpnp/backend/extensions/lapack/getri_batch.cpp
@@ -108,22 +108,24 @@ static sycl::event getri_batch_impl(sycl::queue &exec_q,
         // Get the indices of matrices within the batch that encountered an
         // error
         auto error_matrices_ids = be.ids();
-        // Get the indices of the first zero diagonal elements of these matrices
-        auto error_info = be.exceptions();
 
         auto error_matrices_ids_size = error_matrices_ids.size();
         auto dev_info_size = static_cast<std::size_t>(py::len(dev_info));
-        if (error_matrices_ids_size != dev_info_size) {
-            throw py::value_error("The size of `dev_info` must be equal to " +
+        if (error_matrices_ids_size > dev_info_size) {
+            throw py::value_error("The size of `dev_info` must be greater than"
+                                  " or equal to " +
                                   std::to_string(error_matrices_ids_size) +
                                   ", but currently it is " +
                                   std::to_string(dev_info_size) + ".");
         }
 
+        // OneMKL batched functions throw a single `batch_error`
+        // instead of per-matrix exceptions or an info array.
+        // This is interpreted as a computation_error (singular matrix),
+        // consistent with non-batched LAPACK behavior.
+        // Set dev_info[...] to any positive value for each failed index.
         for (size_t i = 0; i < error_matrices_ids.size(); ++i) {
-            // Assign the index of the first zero diagonal element in each
-            // error matrix to the corresponding index in 'dev_info'
-            dev_info[error_matrices_ids[i]] = error_info[i];
+            dev_info[error_matrices_ids[i]] = 1;
         }
     } catch (mkl_lapack::exception const &e) {
         is_exception_caught = true;
diff --git a/dpnp/tests/helper.py b/dpnp/tests/helper.py
@@ -443,3 +443,20 @@ def is_win_platform():
 
 def numpy_version():
     return numpy.lib.NumpyVersion(numpy.__version__)
+
+
+def requires_intel_mkl_version(version):
+    """
+    Check if Intel MKL is used and its version is greater than or
+    equal to the specified one.
+
+    The check is based on MKL backend name stored in Build Dependencies
+    and only applies if Intel NumPy is detected.
+    The version is extracted from the BLAS section of NumPy's build
+    information and compared to the given version string.
+    """
+    if not is_intel_numpy():
+        return False
+
+    build_deps = numpy.show_config(mode="dicts")["Build Dependencies"]
+    return build_deps["blas"]["version"] >= version
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
@@ -24,9 +24,8 @@
     has_support_aspect64,
     is_cpu_device,
     is_cuda_device,
-    is_gpu_device,
-    is_win_platform,
     numpy_version,
+    requires_intel_mkl_version,
 )
 from .third_party.cupy import testing
 
@@ -334,11 +333,13 @@ def test_nan(self, p):
         # while OneMKL returns nans
         if is_cuda_device() and p in [-dpnp.inf, -1, 1, dpnp.inf, "fro"]:
             pytest.skip("Different behavior on CUDA")
-        elif (
-            is_gpu_device()
-            and is_win_platform()
-            and p in [-dpnp.inf, -1, 1, dpnp.inf, "fro"]
-        ):
+        elif requires_intel_mkl_version("2025.2") and p in [
+            -dpnp.inf,
+            -1,
+            1,
+            dpnp.inf,
+            "fro",
+        ]:
             pytest.skip("SAT-7966")
         a = generate_random_numpy_array((2, 2, 2, 2))
         a[0, 0] = 0
@@ -460,10 +461,6 @@ def test_det_singular_matrix(self, matrix):
 
         assert_allclose(result, expected)
 
-    # TODO: remove skipif when MKLD-13852 is resolved
-    # _getrf_batch does not raise an error with singular matrices.
-    # Skip running on cpu because dpnp uses _getrf_batch only on cpu.
-    @pytest.mark.skipif(is_cpu_device(), reason="MKLD-13852")
     def test_det_singular_matrix_3D(self):
         a_np = numpy.array(
             [[[1, 2], [3, 4]], [[1, 2], [1, 2]], [[1, 3], [3, 1]]]
@@ -1761,9 +1758,10 @@ def test_inv_singular_matrix(self, matrix):
         assert_raises(numpy.linalg.LinAlgError, numpy.linalg.inv, a_np)
         assert_raises(dpnp.linalg.LinAlgError, dpnp.linalg.inv, a_dp)
 
-    # TODO: remove skip when MKLD-13852 is resolved
-    # _getrf_batch does not raise an error with singular matrices.
-    @pytest.mark.skip("MKLD-13852")
+    # TODO: remove skipif when Intel MKL 2025.2 is released
+    @pytest.mark.skipif(
+        not requires_intel_mkl_version("2025.2"), reason="mkl<2025.2"
+    )
     def test_inv_singular_matrix_3D(self):
         a_np = numpy.array(
             [[[1, 2], [3, 4]], [[1, 2], [1, 2]], [[1, 3], [3, 1]]]
@@ -2785,6 +2783,13 @@ def test_slogdet_strides(self):
         assert_allclose(sign_result, sign_expected)
         assert_allclose(logdet_result, logdet_expected)
 
+    # TODO: remove skipif when Intel MKL 2025.2 is released
+    # Skip running on CPU because dpnp uses _getrf_batch only on CPU
+    # for dpnp.linalg.det/slogdet.
+    @pytest.mark.skipif(
+        is_cpu_device() and not requires_intel_mkl_version("2025.2"),
+        reason="mkl<2025.2",
+    )
     @pytest.mark.parametrize(
         "matrix",
         [
@@ -2815,10 +2820,13 @@ def test_slogdet_singular_matrix(self, matrix):
         assert_allclose(sign_result, sign_expected)
         assert_allclose(logdet_result, logdet_expected)
 
-    # TODO: remove skipif when MKLD-13852 is resolved
-    # _getrf_batch does not raise an error with singular matrices.
-    # Skip running on cpu because dpnp uses _getrf_batch only on cpu.
-    @pytest.mark.skipif(is_cpu_device(), reason="MKLD-13852")
+    # TODO: remove skipif when Intel MKL 2025.2 is released
+    # Skip running on CPU because dpnp uses _getrf_batch only on CPU
+    # for dpnp.linalg.det/slogdet.
+    @pytest.mark.skipif(
+        is_cpu_device() and not requires_intel_mkl_version("2025.2"),
+        reason="mkl<2025.2",
+    )
     def test_slogdet_singular_matrix_3D(self):
         a_np = numpy.array(
             [[[1, 2], [3, 4]], [[1, 2], [1, 2]], [[1, 3], [3, 1]]]
diff --git a/dpnp/tests/third_party/cupy/linalg_tests/test_norms.py b/dpnp/tests/third_party/cupy/linalg_tests/test_norms.py
@@ -170,10 +170,6 @@ def test_det_zero_dim(self, dtype):
             with pytest.raises(xp.linalg.LinAlgError):
                 xp.linalg.det(a)
 
-    # TODO: remove skipif when MKLD-13852 is resolved
-    # _getrf_batch does not raise an error with singular matrices.
-    # Skip running on cpu because dpnp uses _getrf_batch only on cpu.
-    @pytest.mark.skipif(is_cpu_device(), reason="MKLD-13852")
     @testing.for_float_dtypes(no_float16=True)
     @testing.numpy_cupy_allclose(rtol=1e-3, atol=1e-4)
     def test_det_singular(self, xp, dtype):
diff --git a/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py b/dpnp/tests/third_party/cupy/linalg_tests/test_solve.py
@@ -7,6 +7,7 @@
 from dpnp.tests.helper import (
     assert_dtype_allclose,
     has_support_aspect64,
+    requires_intel_mkl_version,
 )
 from dpnp.tests.third_party.cupy import testing
 from dpnp.tests.third_party.cupy.testing import _condition
@@ -213,9 +214,10 @@ def test_inv(self, dtype):
             ):
                 xp.linalg.inv(a)
 
-    # TODO: remove skip when MKLD-13852 is resolved
-    # _getrf_batch does not raise an error with singular matrices.
-    @pytest.mark.skip("MKLD-13852")
+    # TODO: remove skipif when Intel MKL 2025.2 is released
+    @pytest.mark.skipif(
+        not requires_intel_mkl_version("2025.2"), reason="mkl<2025.2"
+    )
     @testing.for_dtypes("ifdFD")
     def test_batched_inv(self, dtype):
         for xp in (numpy, cupy):