Skip to content

Expand eigen() and add eig[vals,vecs]() #2787

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: master
Choose a base branch
from

Conversation

matteosecli
Copy link

  • Expand LinearAlgebra.eigen() to handle non-symmetric matrices via recent Xgeev
  • Add LinearAlgebra.eigvals() and LinearAlgebra.eigvecs()

- Expand LinearAlgebra.eigen() to handle non-symmetric matrices via recent `Xgeev`
- Add LinearAlgebra.eigvals() and LinearAlgebra.eigvecs()
Copy link
Contributor

github-actions bot commented May 26, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.
diff --git a/test/libraries/cusolver/dense.jl b/test/libraries/cusolver/dense.jl
index ecc9078ed..71ad9ce83 100644
--- a/test/libraries/cusolver/dense.jl
+++ b/test/libraries/cusolver/dense.jl
@@ -333,31 +333,31 @@ sorteig!(λ::AbstractVector, sortby::Union{Function, Nothing} = eigsortby) = sor
         @testset "geev!" begin
             local d_W, d_V
 
-            A              = rand(elty,m,m)
-            d_A            = CuArray(A)
-            Eig            = eigen(A)
-            d_eig          = eigen(d_A)
+            A = rand(elty, m, m)
+            d_A = CuArray(A)
+            Eig = eigen(A)
+            d_eig = eigen(d_A)
             sorteig!(d_eig.values, d_eig.vectors)
             @test Eig.values ≈ collect(d_eig.values)
-            h_V            = collect(d_eig.vectors)
-            h_V⁻¹          = inv(h_V)
-            @test abs.(h_V⁻¹*Eig.vectors) ≈ I
+            h_V = collect(d_eig.vectors)
+            h_V⁻¹ = inv(h_V)
+            @test abs.(h_V⁻¹ * Eig.vectors) ≈ I
 
-            A              = rand(elty,m,m)
-            d_A            = CuArray(A)
-            W              = eigvals(A)
-            d_W            = eigvals(d_A)
+            A = rand(elty, m, m)
+            d_A = CuArray(A)
+            W = eigvals(A)
+            d_W = eigvals(d_A)
             sorteig!(d_W)
-            @test W        ≈ collect(d_W)
+            @test W ≈ collect(d_W)
 
-            A              = rand(elty,m,m)
-            d_A            = CuArray(A)
-            V              = eigvecs(A)
-            d_W            = eigvals(d_A)
-            d_V            = eigvecs(d_A)
+            A = rand(elty, m, m)
+            d_A = CuArray(A)
+            V = eigvecs(A)
+            d_W = eigvals(d_A)
+            d_V = eigvecs(d_A)
             sorteig!(d_W, d_V)
-            V⁻¹            = inv(V)
-            @test abs.(V⁻¹*collect(d_V)) ≈ I
+            V⁻¹ = inv(V)
+            @test abs.(V⁻¹ * collect(d_V)) ≈ I
         end
     end
 
@@ -402,7 +402,7 @@ sorteig!(λ::AbstractVector, sortby::Union{Function, Nothing} = eigsortby) = sor
         d_A            = CuArray(A)
         Eig            = eigen(LinearAlgebra.Hermitian(A))
         if CUSOLVER.version() >= v"11.7.1"
-            d_eig          = eigen(d_A)
+            d_eig = eigen(d_A)
             sorteig!(d_eig.values, d_eig.vectors)
             @test Eig.values ≈ collect(d_eig.values)
         end
@@ -418,42 +418,42 @@ sorteig!(λ::AbstractVector, sortby::Union{Function, Nothing} = eigsortby) = sor
             @test abs.(Eig.vectors'*h_V) ≈ I
         end
 
-        A              = rand(elty,m,m)
-        A             += A'
-        d_A            = CuArray(A)
-        W              = eigvals(LinearAlgebra.Hermitian(A))
+        A = rand(elty, m, m)
+        A += A'
+        d_A = CuArray(A)
+        W = eigvals(LinearAlgebra.Hermitian(A))
         if CUSOLVER.version() >= v"11.7.1"
-            d_W            = eigvals(d_A)
+            d_W = eigvals(d_A)
             sorteig!(d_W)
-            @test W        ≈ collect(d_W)
+            @test W ≈ collect(d_W)
         end
-        d_W            = eigvals(LinearAlgebra.Hermitian(d_A))
-        @test W        ≈ collect(d_W)
+        d_W = eigvals(LinearAlgebra.Hermitian(d_A))
+        @test W ≈ collect(d_W)
         if elty <: Real
-            W              = eigvals(LinearAlgebra.Symmetric(A))
-            d_W            = eigvals(LinearAlgebra.Symmetric(d_A))
-            @test W        ≈ collect(d_W)
+            W = eigvals(LinearAlgebra.Symmetric(A))
+            d_W = eigvals(LinearAlgebra.Symmetric(d_A))
+            @test W ≈ collect(d_W)
         end
 
-        A              = rand(elty,m,m)
-        A             += A'
-        d_A            = CuArray(A)
-        V              = eigvecs(LinearAlgebra.Hermitian(A))
+        A = rand(elty, m, m)
+        A += A'
+        d_A = CuArray(A)
+        V = eigvecs(LinearAlgebra.Hermitian(A))
         if CUSOLVER.version() >= v"11.7.1"
-            d_W            = eigvals(d_A)
-            d_V            = eigvecs(d_A)
+            d_W = eigvals(d_A)
+            d_V = eigvecs(d_A)
             sorteig!(d_W, d_V)
-            h_V            = collect(d_V)
-            @test abs.(V'*h_V) ≈ I
+            h_V = collect(d_V)
+            @test abs.(V' * h_V) ≈ I
         end
-        d_V            = eigvecs(LinearAlgebra.Hermitian(d_A))
-        h_V            = collect(d_V)
-        @test abs.(V'*h_V) ≈ I
+        d_V = eigvecs(LinearAlgebra.Hermitian(d_A))
+        h_V = collect(d_V)
+        @test abs.(V' * h_V) ≈ I
         if elty <: Real
-            V              = eigvecs(LinearAlgebra.Symmetric(A))
-            d_V            = eigvecs(LinearAlgebra.Symmetric(d_A))
-            h_V            = collect(d_V)
-            @test abs.(V'*h_V) ≈ I
+            V = eigvecs(LinearAlgebra.Symmetric(A))
+            d_V = eigvecs(LinearAlgebra.Symmetric(d_A))
+            h_V = collect(d_V)
+            @test abs.(V' * h_V) ≈ I
         end
     end
 

@maleadt
Copy link
Member

maleadt commented May 27, 2025

Thanks. Can you add some tests?

@maleadt maleadt added enhancement New feature or request needs tests Tests are requested. cuda libraries Stuff about CUDA library wrappers. labels May 27, 2025
@maleadt maleadt requested a review from kshyatt May 27, 2025 11:21
Copy link
Contributor

@github-actions github-actions bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA.jl Benchmarks

Benchmark suite Current: 3d0f36d Previous: 1004fd9 Ratio
latency/precompile 43031373202 ns 43347615330 ns 0.99
latency/ttfp 7173263952 ns 7144432561 ns 1.00
latency/import 3423781003 ns 3422583362 ns 1.00
integration/volumerhs 9605952 ns 9621539.5 ns 1.00
integration/byval/slices=1 147112 ns 147204 ns 1.00
integration/byval/slices=3 425992 ns 426126.5 ns 1.00
integration/byval/reference 145268 ns 145306 ns 1.00
integration/byval/slices=2 286683.5 ns 286683 ns 1.00
integration/cudadevrt 103590 ns 103645 ns 1.00
kernel/indexing 14248 ns 14297 ns 1.00
kernel/indexing_checked 14914 ns 14899 ns 1.00
kernel/occupancy 707.7708333333334 ns 742.9652777777778 ns 0.95
kernel/launch 2215.1111111111113 ns 2214.8888888888887 ns 1.00
kernel/rand 15562 ns 18344.5 ns 0.85
array/reverse/1d 19675 ns 19882 ns 0.99
array/reverse/2d 23676 ns 23850 ns 0.99
array/reverse/1d_inplace 10240 ns 10508 ns 0.97
array/reverse/2d_inplace 11832 ns 12080 ns 0.98
array/copy 21336 ns 21206 ns 1.01
array/iteration/findall/int 158355 ns 159710.5 ns 0.99
array/iteration/findall/bool 139132.5 ns 139788 ns 1.00
array/iteration/findfirst/int 162389.5 ns 163426.5 ns 0.99
array/iteration/findfirst/bool 163669.5 ns 164898.5 ns 0.99
array/iteration/scalar 71620 ns 73632 ns 0.97
array/iteration/logical 215127 ns 219076 ns 0.98
array/iteration/findmin/1d 46710 ns 47341 ns 0.99
array/iteration/findmin/2d 97023 ns 97310 ns 1.00
array/reductions/reduce/1d 36311.5 ns 36812 ns 0.99
array/reductions/reduce/2d 42188 ns 41111 ns 1.03
array/reductions/mapreduce/1d 34148.5 ns 35000.5 ns 0.98
array/reductions/mapreduce/2d 47730.5 ns 41081 ns 1.16
array/broadcast 21165 ns 21375 ns 0.99
array/copyto!/gpu_to_gpu 11256 ns 12932 ns 0.87
array/copyto!/cpu_to_gpu 214394 ns 217571 ns 0.99
array/copyto!/gpu_to_cpu 285576 ns 286553 ns 1.00
array/accumulate/1d 109412 ns 109198 ns 1.00
array/accumulate/2d 80595.5 ns 81190 ns 0.99
array/construct 1250.4 ns 1248.9 ns 1.00
array/random/randn/Float32 43357 ns 44976.5 ns 0.96
array/random/randn!/Float32 25041 ns 25430 ns 0.98
array/random/rand!/Int64 27195 ns 27147 ns 1.00
array/random/rand!/Float32 8701 ns 8919.333333333334 ns 0.98
array/random/rand/Int64 30059 ns 30142 ns 1.00
array/random/rand/Float32 13115 ns 13307 ns 0.99
array/permutedims/4d 61299 ns 61252 ns 1.00
array/permutedims/2d 55151 ns 55576 ns 0.99
array/permutedims/3d 56054 ns 56240 ns 1.00
array/sorting/1d 2761352 ns 2778491 ns 0.99
array/sorting/by 3368586.5 ns 3369601 ns 1.00
array/sorting/2d 1086296.5 ns 1087382 ns 1.00
cuda/synchronization/stream/auto 1040.5 ns 1018.7 ns 1.02
cuda/synchronization/stream/nonblocking 7264 ns 8223 ns 0.88
cuda/synchronization/stream/blocking 803.7444444444444 ns 800.9072164948453 ns 1.00
cuda/synchronization/context/auto 1193.6 ns 1159.8 ns 1.03
cuda/synchronization/context/nonblocking 7195.4 ns 7478.9 ns 0.96
cuda/synchronization/context/blocking 929.7777777777778 ns 897.4426229508197 ns 1.04

This comment was automatically generated by workflow using github-action-benchmark.

@matteosecli
Copy link
Author

matteosecli commented May 27, 2025

Thanks. Can you add some tests?

I've added some, though most fail due to the fact that LinearAlgebra.eig*() for nonsymmetric standard arrays return sorted eigenpairs/values (see LinearAlgebra.jl/src/eigen.jl#L128-L140), while the existing and new overloads for CUDA arrays do not.

Should we return ordered eigenpairs, too? Or should we just order them in the tests alone? Reordering could impact performance.

@matteosecli
Copy link
Author

matteosecli commented May 27, 2025

I've simplified the eigvecs functions a bit and fixed the symmetric eigvals functions. Now all the tests for symmetric routines pass.

Unfortunately, it seems that the output of Xgeev needs to be reordered to match LAPACK's output. I've added a quick sorteig! function adapted from LinearAlgebra in order for the tests to pass, but I would avoid reordering tbh and just use that function for testing purposes.

As for geev testing, stuff like LAPACK.geev!('N','V', CuArray(A)) (which is instead tested for e.g. syevd) won't work as there is no explicit overload for that method in the code (perhaps due to missing typed routines?). I've commented out that testing section for now, but tbh I'd just remove it if that's ok for you.

Finally, I still cannot get the two @test abs.(h_V⁻¹*Eig.vectors) ≈ I tests to work for geev, any idea? From a quick look it seems like Eig.vectors*h_V⁻¹ is almost an identity matrix, although with a not-so-great precision.

@matteosecli
Copy link
Author

It turns out I accidentally overlooked the final basis transformation necessary for real geev routines to return proper eigenvectors; it should be fixed now.

The only points left are whether to do eigenvalues/vectors sorting or not, and what to do with the commented out geev test.

@maleadt
Copy link
Member

maleadt commented May 28, 2025

Lacking familiarity, some superficial thoughts.

whether to do eigenvalues/vectors sorting or not

Given that sorting is relatively expensive, and IIUC does not practically matter if only for convenience/reproducibility, I'd only do this in the tests.

what to do with the commented out geev test

What functionality is missing? We typically do not "overload" LAPACK methods like LAPACK.geev!. Why doesn't CUSOLVER.Xgeev! work?

@matteosecli
Copy link
Author

Given that sorting is relatively expensive, and IIUC does not practically matter if only for convenience/reproducibility, I'd only do this in the tests.

Great, I'll do that.

What functionality is missing? We typically do not "overload" LAPACK methods like LAPACK.geev!. Why doesn't CUSOLVER.Xgeev! work?

The geev equivalent of these definitions:

CUDA.jl/lib/cusolver/dense.jl

Lines 1053 to 1063 in 8b6a2a0

for elty in (:Float32, :Float64)
@eval begin
LAPACK.syevd!(jobz::Char, uplo::Char, A::StridedCuMatrix{$elty}) = syevd!(jobz, uplo, A)
end
end
for elty in (:ComplexF32, :ComplexF64)
@eval begin
LAPACK.syevd!(jobz::Char, uplo::Char, A::StridedCuMatrix{$elty}) = heevd!(jobz, uplo, A)
end
end

which in turn use these definitions:
for (jname, bname, fname, elty, relty) in ((:syevd!, :cusolverDnSsyevd_bufferSize, :cusolverDnSsyevd, :Float32, :Float32),
(:syevd!, :cusolverDnDsyevd_bufferSize, :cusolverDnDsyevd, :Float64, :Float64),
(:heevd!, :cusolverDnCheevd_bufferSize, :cusolverDnCheevd, :ComplexF32, :Float32),
(:heevd!, :cusolverDnZheevd_bufferSize, :cusolverDnZheevd, :ComplexF64, :Float64))
@eval begin
function $jname(jobz::Char,
uplo::Char,
A::StridedCuMatrix{$elty})
chkuplo(uplo)
n = checksquare(A)
lda = max(1, stride(A, 2))
W = CuArray{$relty}(undef, n)
dh = dense_handle()
function bufferSize()
out = Ref{Cint}(0)
$bname(dh, jobz, uplo, n, A, lda, W, out)
return out[] * sizeof($elty)
end
with_workspace(dh.workspace_gpu, bufferSize) do buffer
$fname(dh, jobz, uplo, n, A, lda, W,
buffer, sizeof(buffer) ÷ sizeof($elty), dh.info)
end
info = @allowscalar dh.info[1]
chkargsok(BlasInt(info))
if jobz == 'N'
return W
elseif jobz == 'V'
return W, A
end
end
end
end

is missing.

The definitions above use a "legacy" API, with type-dependent function names (e.g. *Ssyevd, *Dsyevd, etc), while CUDA offers also a "generic" API (see docs) in which types are passed as parameters and there is just one function name (e.g. *Xsyevd).

The generic API functions are translated in dense_generic.jl, e.g.:

# Xsyevd
function Xsyevd!(jobz::Char, uplo::Char, A::StridedCuMatrix{T}) where {T <: BlasFloat}
chkuplo(uplo)
n = checksquare(A)
R = real(T)
lda = max(1, stride(A, 2))
W = CuVector{R}(undef, n)
params = CuSolverParameters()
dh = dense_handle()
function bufferSize()
out_cpu = Ref{Csize_t}(0)
out_gpu = Ref{Csize_t}(0)
cusolverDnXsyevd_bufferSize(dh, params, jobz, uplo, n,
T, A, lda, R, W, T, out_gpu, out_cpu)
out_gpu[], out_cpu[]
end
with_workspaces(dh.workspace_gpu, dh.workspace_cpu, bufferSize()...) do buffer_gpu, buffer_cpu
cusolverDnXsyevd(dh, params, jobz, uplo, n, T, A,
lda, R, W, T, buffer_gpu, sizeof(buffer_gpu),
buffer_cpu, sizeof(buffer_cpu), dh.info)
end
flag = @allowscalar dh.info[1]
chkargsok(flag |> BlasInt)
if jobz == 'N'
return W
elseif jobz == 'V'
return W, A
end
end

So, while the symmetric diagonalization routines have both the "legacy" and "generic" API, the non-symmetric diagonalization routine has just the "generic" API Xgeev.

This results in a somewhat unbalanced situation in the existing codebase. Calling e.g. LAPACK.syev!(..., CuArray(A)) for a symmetric matrix actually forwards to CUSOLVER, while calling LAPACK.geev!(..., CuArray(A)) for a generic matrix does not, and produces an error.

@maleadt
Copy link
Member

maleadt commented May 28, 2025

The geev equivalent of these definitions:

CUDA.jl/lib/cusolver/dense.jl

Lines 1053 to 1063 in 8b6a2a0

for elty in (:Float32, :Float64)
@eval begin
LAPACK.syevd!(jobz::Char, uplo::Char, A::StridedCuMatrix{$elty}) = syevd!(jobz, uplo, A)
end
end
for elty in (:ComplexF32, :ComplexF64)
@eval begin
LAPACK.syevd!(jobz::Char, uplo::Char, A::StridedCuMatrix{$elty}) = heevd!(jobz, uplo, A)
end
end

It's unfortunate that we (still) have these (again). The Base LAPACK interface isn't really meant to be extended; IME it's better to implement interfaces at a higher level (e.g. LinearAlgebra) and dispatch to methods specific to CUDA libraries. There are also often incompatibilities between the CPU and GPU LAPACK interfaces that prevent this kind of code reuse.
That said, things are working, so it's not really a priority to change all that.

@matteosecli
Copy link
Author

It's unfortunate that we (still) have these (again). The Base LAPACK interface isn't really meant to be extended; IME it's better to implement interfaces at a higher level (e.g. LinearAlgebra) and dispatch to methods specific to CUDA libraries. There are also often incompatibilities between the CPU and GPU LAPACK interfaces that prevent this kind of code reuse. That said, things are working, so it's not really a priority to change all that.

Sounds great, I'll then remove the commented out tests.

As a side note, I've tried to adhere to the whitespace style of current tests, but runic is not happy; do you really want me to reformat as runic says?

@maleadt
Copy link
Member

maleadt commented May 28, 2025

As a side note, I've tried to adhere to the whitespace style of current tests, but runic is not happy; do you really want me to reformat as runic says?

No need.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
cuda libraries Stuff about CUDA library wrappers. enhancement New feature or request needs tests Tests are requested.
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants