[CUSOLVER] Improve the dispatch for LAPACK routines (#1806)

amontoison · web-flow · commit f3156e3db439 · 2023-03-16T21:16:14.000+01:00
diff --git a/lib/cusolver/dense.jl b/lib/cusolver/dense.jl
@@ -1,7 +1,7 @@
 
 using LinearAlgebra
 using LinearAlgebra: BlasInt, checksquare
-using LinearAlgebra.LAPACK: chkargsok, chklapackerror
+using LinearAlgebra.LAPACK: chkargsok, chklapackerror, chktrans, chkside, chkdiag
 
 using ..CUBLAS: unsafe_batch
 
@@ -18,10 +18,11 @@ for (bname, fname,elty) in ((:cusolverDnSpotrf_bufferSize, :cusolverDnSpotrf, :F
                             (:cusolverDnCpotrf_bufferSize, :cusolverDnCpotrf, :ComplexF32),
                             (:cusolverDnZpotrf_bufferSize, :cusolverDnZpotrf, :ComplexF64))
     @eval begin
-        function LinearAlgebra.LAPACK.potrf!(uplo::Char,
+        function potrf!(uplo::Char,
                         A::StridedCuMatrix{$elty})
-            n       = checksquare(A)
-            lda     = max(1, stride(A, 2))
+            VERSION >= v"1.8" && LinearAlgebra.BLAS.chkuplo(uplo)
+            n = checksquare(A)
+            lda = max(1, stride(A, 2))
 
             function bufferSize()
                 out = Ref{Cint}(0)
@@ -49,9 +50,10 @@ for (fname,elty) in ((:cusolverDnSpotrs, :Float32),
                      (:cusolverDnCpotrs, :ComplexF32),
                      (:cusolverDnZpotrs, :ComplexF64))
     @eval begin
-        function LinearAlgebra.LAPACK.potrs!(uplo::Char,
+        function potrs!(uplo::Char,
                         A::StridedCuMatrix{$elty},
                         B::StridedCuVecOrMat{$elty})
+            VERSION >= v"1.8" && LinearAlgebra.BLAS.chkuplo(uplo)
             n = checksquare(A)
             if size(B, 1) != n
                 throw(DimensionMismatch("first dimension of B, $(size(B,1)), must match second dimension of A, $n"))
@@ -72,24 +74,16 @@ for (fname,elty) in ((:cusolverDnSpotrs, :Float32),
     end
 end
 
-## potri
-"""
-    potri!(uplo::Char, A::CuMatrix)
-
-!!! note
-
-    `potri!` requires CUDA >= 10.1
-"""
-LinearAlgebra.LAPACK.potri!(uplo::Char, A::CuMatrix)
 for (bname, fname,elty) in ((:cusolverDnSpotri_bufferSize, :cusolverDnSpotri, :Float32),
-                     (:cusolverDnDpotri_bufferSize, :cusolverDnDpotri, :Float64),
-                     (:cusolverDnCpotri_bufferSize, :cusolverDnCpotri, :ComplexF32),
-                     (:cusolverDnZpotri_bufferSize, :cusolverDnZpotri, :ComplexF64))
-@eval begin
-    function LinearAlgebra.LAPACK.potri!(uplo::Char,
-                    A::StridedCuMatrix{$elty})
-            n       = checksquare(A)
-            lda     = max(1, stride(A, 2))
+                            (:cusolverDnDpotri_bufferSize, :cusolverDnDpotri, :Float64),
+                            (:cusolverDnCpotri_bufferSize, :cusolverDnCpotri, :ComplexF32),
+                            (:cusolverDnZpotri_bufferSize, :cusolverDnZpotri, :ComplexF64))
+    @eval begin
+        function potri!(uplo::Char,
+                        A::StridedCuMatrix{$elty})
+            VERSION >= v"1.8" && LinearAlgebra.BLAS.chkuplo(uplo)
+            n = checksquare(A)
+            lda = max(1, stride(A, 2))
 
             function bufferSize()
                 out = Ref{Cint}(0)
@@ -118,26 +112,26 @@ for (bname, fname,elty) in ((:cusolverDnSgetrf_bufferSize, :cusolverDnSgetrf, :F
                             (:cusolverDnZgetrf_bufferSize, :cusolverDnZgetrf, :ComplexF64))
     @eval begin
         function getrf!(A::StridedCuMatrix{$elty})
-            m,n     = size(A)
-            lda     = max(1, stride(A, 2))
+            m,n = size(A)
+            lda = max(1, stride(A, 2))
 
             function bufferSize()
                 out = Ref{Cint}(0)
                 $bname(dense_handle(), m, n, A, lda, out)
                 return out[]
             end
 
-            devipiv = CuArray{Cint}(undef, min(m,n))
+            ipiv = CuArray{Cint}(undef, min(m,n))
             devinfo = CuArray{Cint}(undef, 1)
             with_workspace($elty, bufferSize) do buffer
-                $fname(dense_handle(), m, n, A, lda, buffer, devipiv, devinfo)
+                $fname(dense_handle(), m, n, A, lda, buffer, ipiv, devinfo)
             end
 
             info = @allowscalar devinfo[1]
             unsafe_free!(devinfo)
             chkargsok(BlasInt(info))
 
-            A, devipiv, info
+            A, ipiv, info
         end
     end
 end
@@ -149,8 +143,8 @@ for (bname, fname,elty) in ((:cusolverDnSgeqrf_bufferSize, :cusolverDnSgeqrf, :F
                             (:cusolverDnZgeqrf_bufferSize, :cusolverDnZgeqrf, :ComplexF64))
     @eval begin
         function geqrf!(A::StridedCuMatrix{$elty})
-            m, n    = size(A)
-            lda     = max(1, stride(A, 2))
+            m, n = size(A)
+            lda  = max(1, stride(A, 2))
 
             function bufferSize()
                 out = Ref{Cint}(0)
@@ -181,7 +175,8 @@ for (bname, fname,elty) in ((:cusolverDnSsytrf_bufferSize, :cusolverDnSsytrf, :F
     @eval begin
         function sytrf!(uplo::Char,
                         A::StridedCuMatrix{$elty})
-            n      = checksquare(A)
+            VERSION >= v"1.8" && LinearAlgebra.BLAS.chkuplo(uplo)
+            n = checksquare(A)
             lda = max(1, stride(A, 2))
 
             function bufferSize()
@@ -190,17 +185,17 @@ for (bname, fname,elty) in ((:cusolverDnSsytrf_bufferSize, :cusolverDnSsytrf, :F
                 return out[]
             end
 
-            devipiv = CuArray{Cint}(undef, n)
+            ipiv = CuArray{Cint}(undef, n)
             devinfo = CuArray{Cint}(undef, 1)
             with_workspace($elty, bufferSize) do buffer
-                $fname(dense_handle(), uplo, n, A, lda, devipiv, buffer, length(buffer), devinfo)
+                $fname(dense_handle(), uplo, n, A, lda, ipiv, buffer, length(buffer), devinfo)
             end
 
             info = @allowscalar devinfo[1]
             unsafe_free!(devinfo)
             chkargsok(BlasInt(info))
 
-            A, devipiv, info
+            A, ipiv, info
         end
     end
 end
@@ -215,6 +210,11 @@ for (fname,elty) in ((:cusolverDnSgetrs, :Float32),
                         A::StridedCuMatrix{$elty},
                         ipiv::CuVector{Cint},
                         B::StridedCuVecOrMat{$elty})
+
+            # Support transa = 'C' for real matrices
+            trans = $elty <: Real && trans == 'C' ? 'T' : trans
+
+            chktrans(trans)
             n = size(A, 1)
             if size(A, 2) != n
                 throw(DimensionMismatch("LU factored matrix A must be square!"))
@@ -232,6 +232,7 @@ for (fname,elty) in ((:cusolverDnSgetrs, :Float32),
             info = @allowscalar devinfo[1]
             unsafe_free!(devinfo)
             chkargsok(BlasInt(info))
+
             B
         end
     end
@@ -248,6 +249,12 @@ for (bname, fname, elty) in ((:cusolverDnSormqr_bufferSize, :cusolverDnSormqr, :
                         A::CuMatrix{$elty},
                         tau::CuVector{$elty},
                         C::CuVecOrMat{$elty})
+
+            # Support transa = 'C' for real matrices
+            trans = $elty <: Real && trans == 'C' ? 'T' : trans
+
+            chkside(side)
+            chktrans(trans)
             m,n = ndims(C) == 2 ? size(C) : (size(C, 1), 1)
             mA  = size(A, 1)
             k   = length(tau)
@@ -330,8 +337,8 @@ for (bname, fname, elty, relty) in ((:cusolverDnSgebrd_bufferSize, :cusolverDnSg
                                     (:cusolverDnZgebrd_bufferSize, :cusolverDnZgebrd, :ComplexF64, :Float64))
     @eval begin
         function gebrd!(A::StridedCuMatrix{$elty})
-            m, n    = size(A)
-            lda     = max(1, stride(A, 2))
+            m, n = size(A)
+            lda  = max(1, stride(A, 2))
 
             function bufferSize()
                 out = Ref{Cint}(0)
@@ -367,7 +374,7 @@ for (bname, fname, elty, relty) in ((:cusolverDnSgesvd_bufferSize, :cusolverDnSg
         function gesvd!(jobu::Char,
                         jobvt::Char,
                         A::StridedCuMatrix{$elty})
-            m, n    = size(A)
+            m, n = size(A)
             if m < n
                 throw(ArgumentError("CUSOLVER's gesvd currently requires m >= n"))
             end
@@ -484,6 +491,7 @@ for (jname, bname, fname, elty, relty) in ((:syevd!, :cusolverDnSsyevd_bufferSiz
         function $jname(jobz::Char,
                         uplo::Char,
                         A::StridedCuMatrix{$elty})
+            VERSION >= v"1.8" && LinearAlgebra.BLAS.chkuplo(uplo)
             n       = checksquare(A)
             lda     = max(1, stride(A, 2))
             W       = CuArray{$relty}(undef, n)
@@ -523,6 +531,7 @@ for (jname, bname, fname, elty, relty) in ((:sygvd!, :cusolverDnSsygvd_bufferSiz
                         uplo::Char,
                         A::StridedCuMatrix{$elty},
                         B::StridedCuMatrix{$elty})
+            VERSION >= v"1.8" && LinearAlgebra.BLAS.chkuplo(uplo)
             nA, nB  = checksquare(A, B)
             if nB != nA
                 throw(DimensionMismatch("Dimensions of A ($nA, $nA) and B ($nB, $nB) must match!"))
@@ -569,6 +578,7 @@ for (jname, bname, fname, elty, relty) in ((:sygvj!, :cusolverDnSsygvj_bufferSiz
                         B::StridedCuMatrix{$elty};
                         tol::$relty=eps($relty),
                         max_sweeps::Int=100)
+            VERSION >= v"1.8" && LinearAlgebra.BLAS.chkuplo(uplo)
             nA, nB  = checksquare(A, B)
             if nB != nA
                 throw(DimensionMismatch("Dimensions of A ($nA, $nA) and B ($nB, $nB) must match!"))
@@ -613,8 +623,7 @@ end
 for (jname, bname, fname, elty, relty) in ((:syevjBatched!, :cusolverDnSsyevjBatched_bufferSize, :cusolverDnSsyevjBatched, :Float32, :Float32),
                                            (:syevjBatched!, :cusolverDnDsyevjBatched_bufferSize, :cusolverDnDsyevjBatched, :Float64, :Float64),
                                            (:heevjBatched!, :cusolverDnCheevjBatched_bufferSize, :cusolverDnCheevjBatched, :ComplexF32, :Float32),
-                                           (:heevjBatched!, :cusolverDnZheevjBatched_bufferSize, :cusolverDnZheevjBatched, :ComplexF64, :Float64)
-                                           )
+                                           (:heevjBatched!, :cusolverDnZheevjBatched_bufferSize, :cusolverDnZheevjBatched, :ComplexF64, :Float64))
     @eval begin
         function $jname(jobz::Char,
                         uplo::Char,
@@ -623,6 +632,7 @@ for (jname, bname, fname, elty, relty) in ((:syevjBatched!, :cusolverDnSsyevjBat
                         max_sweeps::Int=100)
 
             # Set up information for the solver arguments
+            VERSION >= v"1.8" && LinearAlgebra.BLAS.chkuplo(uplo)
             n       = checksquare(A)
             lda     = max(1, stride(A, 2))
             batchSize = size(A,3)
@@ -667,19 +677,19 @@ for (jname, bname, fname, elty, relty) in ((:syevjBatched!, :cusolverDnSsyevjBat
     end
 end
 
-for (jname, fname, elty) in ((:potrsBatched!, :cusolverDnSpotrsBatched, :Float32),
-                             (:potrsBatched!, :cusolverDnDpotrsBatched, :Float64),
-                             (:potrsBatched!, :cusolverDnCpotrsBatched, :ComplexF32),
-                             (:potrsBatched!, :cusolverDnZpotrsBatched, :ComplexF64)
-                             )
+for (fname, elty) in ((:cusolverDnSpotrsBatched, :Float32),
+                      (:cusolverDnDpotrsBatched, :Float64),
+                      (:cusolverDnCpotrsBatched, :ComplexF32),
+                      (:cusolverDnZpotrsBatched, :ComplexF64))
     @eval begin
-        function $jname(uplo::Char,
-                        A::Vector{<:StridedCuMatrix{$elty}},
-                        B::Vector{<:StridedCuVecOrMat{$elty}})
+        function potrsBatched!(uplo::Char,
+                               A::Vector{<:StridedCuMatrix{$elty}},
+                               B::Vector{<:StridedCuVecOrMat{$elty}})
             if length(A) != length(B)
                 throw(DimensionMismatch(""))
             end
             # Set up information for the solver arguments
+            VERSION >= v"1.8" && LinearAlgebra.BLAS.chkuplo(uplo)
             n = checksquare(A[1])
             if size(B[1], 1) != n
                 throw(DimensionMismatch("first dimension of B[i], $(size(B[1],1)), must match second dimension of A, $n"))
@@ -710,15 +720,15 @@ for (jname, fname, elty) in ((:potrsBatched!, :cusolverDnSpotrsBatched, :Float32
     end
 end
 
-for (jname, fname, elty) in ((:potrfBatched!, :cusolverDnSpotrfBatched, :Float32),
-                             (:potrfBatched!, :cusolverDnDpotrfBatched, :Float64),
-                             (:potrfBatched!, :cusolverDnCpotrfBatched, :ComplexF32),
-                             (:potrfBatched!, :cusolverDnZpotrfBatched, :ComplexF64)
-                             )
+for (fname, elty) in ((:cusolverDnSpotrfBatched, :Float32),
+                      (:cusolverDnDpotrfBatched, :Float64),
+                      (:cusolverDnCpotrfBatched, :ComplexF32),
+                      (:cusolverDnZpotrfBatched, :ComplexF64))
     @eval begin
-        function $jname(uplo::Char, A::Vector{<:StridedCuMatrix{$elty}})
+        function potrfBatched!(uplo::Char, A::Vector{<:StridedCuMatrix{$elty}})
 
             # Set up information for the solver arguments
+            VERSION >= v"1.8" && LinearAlgebra.BLAS.chkuplo(uplo)
             n = checksquare(A[1])
             lda = max(1, stride(A[1], 2))
             batchSize = length(A)
@@ -745,3 +755,32 @@ for (jname, fname, elty) in ((:potrfBatched!, :cusolverDnSpotrfBatched, :Float32
         end
     end
 end
+
+# LAPACK
+for elty in (:Float32, :Float64, :ComplexF32, :ComplexF64)
+    @eval begin
+        LinearAlgebra.LAPACK.potrf!(uplo::Char, A::StridedCuMatrix{$elty}) = CUSOLVER.potrf!(uplo, A)
+        LinearAlgebra.LAPACK.potrs!(uplo::Char, A::StridedCuMatrix{$elty}, B::StridedCuVecOrMat{$elty}) = CUSOLVER.potrs!(uplo, A, B)
+        LinearAlgebra.LAPACK.potri!(uplo::Char, A::StridedCuMatrix{$elty}) = CUSOLVER.potri!(uplo, A)
+        LinearAlgebra.LAPACK.getrf!(A::StridedCuMatrix{$elty}) = CUSOLVER.getrf!(A)
+        LinearAlgebra.LAPACK.geqrf!(A::StridedCuMatrix{$elty}) = CUSOLVER.geqrf!(A)
+        LinearAlgebra.LAPACK.sytrf!(uplo::Char, A::StridedCuMatrix{$elty}) = sytrf!(uplo, A)
+        LinearAlgebra.LAPACK.getrs!(trans::Char, A::StridedCuMatrix{$elty}, ipiv::CuVector{Cint}, B::StridedCuVecOrMat{$elty}) = CUSOLVER.getrs!(trans, A, ipiv, B)
+        LinearAlgebra.LAPACK.ormqr!(side::Char, trans::Char, A::CuMatrix{$elty}, tau::CuVector{$elty}, C::CuVecOrMat{$elty}) = CUSOLVER.ormqr!(side, trans, A, tau, C)
+        LinearAlgebra.LAPACK.orgqr!(A::StridedCuMatrix{$elty}, tau::CuVector{$elty}) = CUSOLVER.orgqr!(A, tau)
+        LinearAlgebra.LAPACK.gebrd!(A::StridedCuMatrix{$elty}) = CUSOLVER.gebrd!(A)
+        LinearAlgebra.LAPACK.gesvd!(jobu::Char, jobvt::Char, A::StridedCuMatrix{$elty}) = CUSOLVER.gesvd!(jobu, jobvt, A)
+    end
+end
+
+for elty in (:Float32, :Float64)
+    @eval begin
+        LinearAlgebra.LAPACK.sygvd!(itype::Int, jobz::Char, uplo::Char, A::StridedCuMatrix{$elty}, B::StridedCuMatrix{$elty}) = CUSOLVER.sygvd!(itype, jobz, uplo, A, B)
+    end
+end
+
+for elty in (:ComplexF32, :ComplexF64)
+    @eval begin
+        LinearAlgebra.LAPACK.sygvd!(itype::Int, jobz::Char, uplo::Char, A::StridedCuMatrix{$elty}, B::StridedCuMatrix{$elty}) = CUSOLVER.hegvd!(itype, jobz, uplo, A, B)
+    end
+end
diff --git a/lib/cusolver/linalg.jl b/lib/cusolver/linalg.jl
@@ -2,7 +2,7 @@
 
 using LinearAlgebra
 using LinearAlgebra: BlasComplex, BlasFloat, BlasReal
-using ..CUBLAS: CublasFloat, trsm!
+using ..CUBLAS: CublasFloat
 
 function copy_cublasfloat(As...)
     eltypes = eltype.(As)
@@ -53,7 +53,7 @@ function Base.:\(_A::CuMatOrAdj, _B::CuOrAdj)
     else
         # QR decomposition
         F, tau = CUSOLVER.geqrf!(A)  # A = QR
-        CUSOLVER.ormqr!('L', T <: Real ? 'T' : 'C', F, tau, B)
+        CUSOLVER.ormqr!('L', 'C', F, tau, B)
         if B isa CuVector{T}
             X = B[1:m]
             CUBLAS.trsv!('U', 'N', 'N', view(F,1:m,1:m), X)