Skip to content

Commit 0e9ce69

Browse files
authored
Merge pull request #328 from denius/qr-wopivoting
Add QR decomposition
2 parents 400fd9c + 4712daf commit 0e9ce69

File tree

4 files changed

+352
-26
lines changed

4 files changed

+352
-26
lines changed

perf/bench-qr.txt

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
Size(1,1)
2+
Compilation time: 0.386616 seconds (111.68 k allocations: 5.811 MiB)
3+
2.649 ns (0 allocations: 0 bytes)
4+
778.696 ns (21 allocations: 1.13 KiB)
5+
Size(2,2)
6+
Compilation time: 0.024005 seconds (19.69 k allocations: 1.002 MiB)
7+
10.429 ns (0 allocations: 0 bytes)
8+
1.602 μs (21 allocations: 1.22 KiB)
9+
Size(3,3)
10+
Compilation time: 0.037816 seconds (38.85 k allocations: 1.885 MiB)
11+
33.545 ns (0 allocations: 0 bytes)
12+
2.508 μs (21 allocations: 1.50 KiB)
13+
Size(4,4)
14+
Compilation time: 0.061655 seconds (75.39 k allocations: 3.573 MiB)
15+
64.099 ns (0 allocations: 0 bytes)
16+
3.352 μs (21 allocations: 1.78 KiB)
17+
Size(5,5)
18+
Compilation time: 0.110812 seconds (139.52 k allocations: 6.376 MiB)
19+
118.805 ns (0 allocations: 0 bytes)
20+
4.359 μs (21 allocations: 2.53 KiB)
21+
Size(6,6)
22+
Compilation time: 0.176334 seconds (247.47 k allocations: 10.854 MiB)
23+
159.423 ns (0 allocations: 0 bytes)
24+
5.320 μs (21 allocations: 2.91 KiB)
25+
Size(7,7)
26+
Compilation time: 0.281787 seconds (414.78 k allocations: 17.502 MiB)
27+
279.791 ns (0 allocations: 0 bytes)
28+
6.324 μs (21 allocations: 3.75 KiB)
29+
Size(8,8)
30+
Compilation time: 0.418656 seconds (663.01 k allocations: 27.114 MiB, 0.67% gc time)
31+
295.032 ns (0 allocations: 0 bytes)
32+
7.336 μs (21 allocations: 4.22 KiB)
33+
Size(9,9)
34+
Compilation time: 0.659570 seconds (1.02 M allocations: 40.792 MiB, 0.97% gc time)
35+
522.869 ns (0 allocations: 0 bytes)
36+
8.599 μs (21 allocations: 4.88 KiB)
37+
Size(10,10)
38+
Compilation time: 0.968919 seconds (1.53 M allocations: 59.968 MiB, 1.11% gc time)
39+
639.133 ns (0 allocations: 0 bytes)
40+
9.947 μs (21 allocations: 5.81 KiB)
41+
Size(11,11)
42+
Compilation time: 1.398025 seconds (2.22 M allocations: 85.519 MiB, 1.10% gc time)
43+
968.560 ns (0 allocations: 0 bytes)
44+
11.189 μs (21 allocations: 6.94 KiB)
45+
Size(12,12)
46+
Compilation time: 1.910943 seconds (3.15 M allocations: 119.594 MiB, 1.06% gc time)
47+
1.023 μs (0 allocations: 0 bytes)
48+
12.336 μs (21 allocations: 7.88 KiB)
49+
Size(13,13)
50+
Compilation time: 2.775988 seconds (4.39 M allocations: 164.094 MiB, 1.09% gc time)
51+
1.668 μs (0 allocations: 0 bytes)
52+
13.682 μs (21 allocations: 9.28 KiB)
53+
Size(14,14)
54+
Compilation time: 3.626697 seconds (6.01 M allocations: 221.593 MiB, 1.07% gc time)
55+
2.127 μs (0 allocations: 0 bytes)
56+
15.404 μs (21 allocations: 11.16 KiB)
57+
Size(15,15)
58+
Compilation time: 4.943009 seconds (8.09 M allocations: 294.704 MiB, 2.50% gc time)
59+
2.994 μs (0 allocations: 0 bytes)
60+
17.252 μs (21 allocations: 12.47 KiB)
61+
Size(16,16)
62+
Compilation time: 6.496340 seconds (10.99 M allocations: 390.435 MiB, 2.03% gc time)
63+
4.306 μs (0 allocations: 0 bytes)
64+
18.581 μs (21 allocations: 13.31 KiB)
65+
Size(17,17)
66+
Compilation time: 8.676325 seconds (14.40 M allocations: 506.984 MiB, 1.78% gc time)
67+
5.601 μs (0 allocations: 0 bytes)
68+
20.941 μs (21 allocations: 15.19 KiB)
69+
Size(18,18)
70+
Compilation time: 11.178198 seconds (18.62 M allocations: 649.376 MiB, 1.67% gc time)
71+
9.696 μs (0 allocations: 0 bytes)
72+
22.942 μs (21 allocations: 16.69 KiB)
73+
Size(19,19)
74+
Compilation time: 14.343331 seconds (23.80 M allocations: 823.311 MiB, 1.45% gc time)
75+
12.400 μs (0 allocations: 0 bytes)
76+
24.977 μs (21 allocations: 18.56 KiB)
77+
Size(20,20)
78+
Compilation time: 19.905053 seconds (30.08 M allocations: 1.009 GiB, 1.86% gc time)
79+
14.505 μs (0 allocations: 0 bytes)
80+
26.370 μs (21 allocations: 20.06 KiB)
81+
Size(21,21)
82+
Compilation time: 24.492438 seconds (37.63 M allocations: 1.254 GiB, 2.07% gc time)
83+
17.629 μs (0 allocations: 0 bytes)
84+
28.935 μs (21 allocations: 22.31 KiB)
85+
Size(22,22)
86+
Compilation time: 30.472894 seconds (46.62 M allocations: 1.546 GiB, 1.97% gc time)
87+
19.947 μs (0 allocations: 0 bytes)
88+
31.394 μs (21 allocations: 24.19 KiB)

perf/benchmark-qr.jl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
using StaticArrays
2+
using BenchmarkTools, Compat
3+
4+
a = m = 0
5+
for K = 1:22
6+
a = rand(SMatrix{K,K,Float64,K*K})
7+
m = Matrix(a)
8+
print("Size($K,$K)\n Compilation time:")
9+
@time qr(a)
10+
@btime qr($a)
11+
@btime qr($m)
12+
end

src/qr.jl

Lines changed: 201 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,207 @@
1-
_thin_must_hold(thin) =
1+
_thin_must_hold(thin) =
22
thin || throw(ArgumentError("For the sake of type stability, `thin = true` must hold."))
33
import Base.qr
44

5-
function qr(A::StaticMatrix, pivot::Type{Val{true}}; thin::Bool=true)
5+
6+
"""
7+
qr(A::StaticMatrix, pivot=Val{false}; thin=true) -> Q, R, [p]
8+
9+
Compute the QR factorization of `A` such that `A = Q*R` or `A[:,p] = Q*R`, see [`qr`](@ref).
10+
This function does not support `thin=false` keyword option due to type inference instability.
11+
To use this option call `qr(A, pivot, Val{false})` instead.
12+
"""
13+
@inline function qr(A::StaticMatrix, pivot::Union{Type{Val{false}}, Type{Val{true}}} = Val{false}; thin::Bool=true)
614
_thin_must_hold(thin)
7-
Q0, R0, p0 = Base.qr(Matrix(A), pivot)
8-
T = arithmetic_closure(eltype(A))
9-
QT = similar_type(A, T, Size(diagsize(A), diagsize(A)))
10-
RT = similar_type(A, T)
11-
PT = similar_type(A, Int, Size(Size(A)[2]))
12-
QT(Q0), RT(R0), PT(p0)
15+
return _qr(Size(A), A, pivot, Val{true})
1316
end
14-
function qr(A::StaticMatrix, pivot::Type{Val{false}}; thin::Bool=true)
15-
_thin_must_hold(thin)
16-
Q0, R0 = Base.qr(Matrix(A), pivot)
17-
T = arithmetic_closure(eltype(A))
18-
QT = similar_type(A, T, Size(diagsize(A), diagsize(A)))
19-
RT = similar_type(A, T)
20-
QT(Q0), RT(R0)
17+
18+
19+
@inline qr(A::StaticMatrix, pivot::Union{Type{Val{false}}, Type{Val{true}}}, thin::Union{Type{Val{false}}, Type{Val{true}}}) = _qr(Size(A), A, pivot, thin)
20+
21+
22+
@generated function _qr(::Size{sA}, A::StaticMatrix{<:Any, <:Any, TA}, pivot::Union{Type{Val{false}}, Type{Val{true}}} = Val{false}, thin::Union{Type{Val{false}}, Type{Val{true}}} = Val{true}) where {sA, TA}
23+
24+
isthin = thin == Type{Val{true}}
25+
26+
SizeQ = Size( sA[1], isthin ? diagsize(Size(A)) : sA[1] )
27+
SizeR = Size( diagsize(Size(A)), sA[2] )
28+
29+
if pivot == Type{Val{true}}
30+
return quote
31+
@_inline_meta
32+
Q0, R0, p0 = Base.qr(Matrix(A), pivot, thin=$isthin)
33+
T = arithmetic_closure(TA)
34+
return similar_type(A, T, $(SizeQ))(Q0),
35+
similar_type(A, T, $(SizeR))(R0),
36+
similar_type(A, Int, $(Size(sA[2])))(p0)
37+
end
38+
else
39+
if (sA[1]*sA[1] + sA[1]*sA[2])÷2 * diagsize(Size(A)) < 17*17*17
40+
return quote
41+
@_inline_meta
42+
return qr_unrolled(Size(A), A, pivot, thin)
43+
end
44+
else
45+
return quote
46+
@_inline_meta
47+
Q0, R0 = Base.qr(Matrix(A), pivot, thin=$isthin)
48+
T = arithmetic_closure(TA)
49+
return similar_type(A, T, $(SizeQ))(Q0),
50+
similar_type(A, T, $(SizeR))(R0)
51+
end
52+
end
53+
end
2154
end
55+
56+
57+
# Compute the QR decomposition of `A` such that `A = Q*R`
58+
# by Householder reflections without pivoting.
59+
#
60+
# `thin=true` (reduced) method will produce `Q` and `R` in truncated form,
61+
# in the case of `thin=false` Q is full, but R is still reduced, see [`qr`](@ref).
62+
#
63+
# For original source code see below.
64+
@generated function qr_unrolled(::Size{sA}, A::StaticMatrix{<:Any, <:Any, TA}, pivot::Type{Val{false}}, thin::Union{Type{Val{false}}, Type{Val{true}}} = Val{true}) where {sA, TA}
65+
m, n = sA[1], sA[2]
66+
67+
Q = [Symbol("Q_$(i)_$(j)") for i = 1:m, j = 1:m]
68+
R = [Symbol("R_$(i)_$(j)") for i = 1:m, j = 1:n]
69+
70+
initQ = [:($(Q[i, j]) = $(i == j ? one : zero)(T)) for i = 1:m, j = 1:m] # Q .= eye(A)
71+
initR = [:($(R[i, j]) = T(A[$i, $j])) for i = 1:m, j = 1:n] # R .= A
72+
73+
code = quote end
74+
for k = 1:min(m - 1 + !(TA<:Real), n)
75+
#x = view(R, k:m, k)
76+
#τk = reflector!(x)
77+
push!(code.args, :(ξ1 = $(R[k, k])))
78+
ex = :(normu = abs2(ξ1))
79+
for i = k+1:m
80+
ex = :($ex + abs2($(R[i, k])))
81+
end
82+
push!(code.args, :(normu = sqrt($ex)))
83+
push!(code.args, :(ν = copysign(normu, real(ξ1))))
84+
push!(code.args, :(ξ1 += ν))
85+
push!(code.args, :(invξ1 = ξ1 == zero(T) ? zero(T) : inv(ξ1)))
86+
push!(code.args, :($(R[k, k]) = -ν))
87+
for i = k+1:m
88+
push!(code.args, :($(R[i, k]) *= invξ1))
89+
end
90+
push!(code.args, :(τk = ν == zero(T) ? zero(T) : ξ1/ν))
91+
92+
#reflectorApply!(x, τk, view(R, k:m, k+1:n))
93+
for j = k+1:n
94+
ex = :($(R[k, j]))
95+
for i = k+1:m
96+
ex = :($ex + $(R[i, k])'*$(R[i, j]))
97+
end
98+
push!(code.args, :(vRj = τk'*$ex))
99+
push!(code.args, :($(R[k, j]) -= vRj))
100+
for i = k+1:m
101+
push!(code.args, :($(R[i, j]) -= $(R[i, k])*vRj))
102+
end
103+
end
104+
105+
#reflectorApplyRight!(x, τk, view(Q, 1:m, k:m))
106+
for i = 1:m
107+
ex = :($(Q[i, k]))
108+
for j = k+1:m
109+
ex = :($ex + $(Q[i, j])*$(R[j, k]))
110+
end
111+
push!(code.args, :(Qiv = $ex*τk))
112+
push!(code.args, :($(Q[i, k]) -= Qiv))
113+
for j = k+1:m
114+
push!(code.args, :($(Q[i, j]) -= Qiv*$(R[j, k])'))
115+
end
116+
end
117+
118+
for i = k+1:m
119+
push!(code.args, :($(R[i, k]) = zero(T)))
120+
end
121+
end
122+
123+
# truncate Q and R sizes in LAPACK consilient way
124+
if thin == Type{Val{true}}
125+
mQ, nQ = m, min(m, n)
126+
else
127+
mQ, nQ = m, m
128+
end
129+
mR, nR = min(m, n), n
130+
131+
return quote
132+
@_inline_meta
133+
T = arithmetic_closure(TA)
134+
@inbounds $(Expr(:block, initQ...))
135+
@inbounds $(Expr(:block, initR...))
136+
@inbounds $code
137+
@inbounds return similar_type(A, T, $(Size(mQ, nQ)))( tuple($(Q[1:mQ, 1:nQ]...)) ),
138+
similar_type(A, T, $(Size(mR, nR)))( tuple($(R[1:mR, 1:nR]...)) )
139+
end
140+
141+
end
142+
143+
144+
## Source for @generated qr_unrolled() function above.
145+
## Derived from base/linalg/qr.jl
146+
## thin=true version of QR
147+
#function qr_unrolled(A::StaticMatrix{<:Any, <:Any, TA}) where {TA}
148+
# m, n = size(A)
149+
# T = arithmetic_closure(TA)
150+
# Q = eye(MMatrix{m,m,T,m*m})
151+
# R = MMatrix{m,n,T,m*n}(A)
152+
# for k = 1:min(m - 1 + !(TA<:Real), n)
153+
# #x = view(R, k:m, k)
154+
# #τk = reflector!(x)
155+
# ξ1 = R[k, k]
156+
# normu = abs2(ξ1)
157+
# for i = k+1:m
158+
# normu += abs2(R[i, k])
159+
# end
160+
# normu = sqrt(normu)
161+
# ν = copysign(normu, real(ξ1))
162+
# ξ1 += ν
163+
# invξ1 = ξ1 == zero(T) ? zero(T) : inv(ξ1)
164+
# R[k, k] = -ν
165+
# for i = k+1:m
166+
# R[i, k] *= invξ1
167+
# end
168+
# τk = ν == zero(T) ? zero(T) : ξ1/ν
169+
#
170+
# #reflectorApply!(x, τk, view(R, k:m, k+1:n))
171+
# for j = k+1:n
172+
# vRj = R[k, j]
173+
# for i = k+1:m
174+
# vRj += R[i, k]'*R[i, j]
175+
# end
176+
# vRj = τk'*vRj
177+
# R[k, j] -= vRj
178+
# for i = k+1:m
179+
# R[i, j] -= R[i, k]*vRj
180+
# end
181+
# end
182+
#
183+
# #reflectorApplyRight!(x, τk, view(Q, 1:m, k:m))
184+
# for i = 1:m
185+
# Qiv = Q[i, k]
186+
# for j = k+1:m
187+
# Qiv += Q[i, j]*R[j, k]
188+
# end
189+
# Qiv = Qiv*τk
190+
# Q[i, k] -= Qiv
191+
# for j = k+1:m
192+
# Q[i, j] -= Qiv*R[j, k]'
193+
# end
194+
# end
195+
#
196+
# for i = k+1:m
197+
# R[i, k] = zero(T)
198+
# end
199+
#
200+
# end
201+
# if m > n
202+
# return (similar_type(A, T, Size(m, n))(Q[1:m,1:n]), similar_type(A, T, Size(n, n))(R[1:n,1:n]))
203+
# else
204+
# return (similar_type(A, T, Size(m, m))(Q), similar_type(A, T, Size(n, n))(R))
205+
# end
206+
#end
207+

0 commit comments

Comments
 (0)