Skip to content

Commit 95b6e84

Browse files
Updates Givens rotations with preciser algorithms
1 parent 2904d87 commit 95b6e84

File tree

6 files changed

+251
-117
lines changed

6 files changed

+251
-117
lines changed

BLAS/SRC/crotg.f90

Lines changed: 72 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
!> \brief \b CROTG
1+
!> \brief \b CROTG generates a Givens rotation with real cosine and complex sine.
22
!
33
! =========== DOCUMENTATION ===========
44
!
@@ -24,12 +24,12 @@
2424
!> = 1 if x = 0
2525
!> c = |a| / sqrt(|a|**2 + |b|**2)
2626
!> s = sgn(a) * conjg(b) / sqrt(|a|**2 + |b|**2)
27-
!> When a and b are real and r /= 0, the formulas simplify to
2827
!> r = sgn(a)*sqrt(|a|**2 + |b|**2)
28+
!> When a and b are real and r /= 0, the formulas simplify to
2929
!> c = a / r
3030
!> s = b / r
31-
!> the same as in CROTG when |a| > |b|. When |b| >= |a|, the
32-
!> sign of c and s will be different from those computed by CROTG
31+
!> the same as in SROTG when |a| > |b|. When |b| >= |a|, the
32+
!> sign of c and s will be different from those computed by SROTG
3333
!> if the signs of a and b are not the same.
3434
!>
3535
!> \endverbatim
@@ -65,20 +65,19 @@
6565
! Authors:
6666
! ========
6767
!
68-
!> \author Edward Anderson, Lockheed Martin
68+
!> \author Weslley Pereira, University of Colorado Denver, USA
6969
!
70-
!> \par Contributors:
71-
! ==================
72-
!>
73-
!> Weslley Pereira, University of Colorado Denver, USA
70+
!> \date December 2021
7471
!
75-
!> \ingroup single_blas_level1
72+
!> \ingroup OTHERauxiliary
7673
!
7774
!> \par Further Details:
7875
! =====================
7976
!>
8077
!> \verbatim
8178
!>
79+
!> Based on the algorithm from
80+
!>
8281
!> Anderson E. (2017)
8382
!> Algorithm 978: Safe Scaling in the Level 1 BLAS
8483
!> ACM Trans Math Softw 44:1--28
@@ -108,21 +107,14 @@ subroutine CROTG( a, b, c, s )
108107
1-minexponent(0._wp), &
109108
maxexponent(0._wp)-1 &
110109
)
111-
real(wp), parameter :: rtmin = sqrt( real(radix(0._wp),wp)**max( &
112-
minexponent(0._wp)-1, &
113-
1-maxexponent(0._wp) &
114-
) / epsilon(0._wp) )
115-
real(wp), parameter :: rtmax = sqrt( real(radix(0._wp),wp)**max( &
116-
1-minexponent(0._wp), &
117-
maxexponent(0._wp)-1 &
118-
) * epsilon(0._wp) )
110+
real(wp), parameter :: rtmin = sqrt( safmin )
119111
! ..
120112
! .. Scalar Arguments ..
121113
real(wp) :: c
122114
complex(wp) :: a, b, s
123115
! ..
124116
! .. Local Scalars ..
125-
real(wp) :: d, f1, f2, g1, g2, h2, u, v, w
117+
real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmax
126118
complex(wp) :: f, fs, g, gs, r, t
127119
! ..
128120
! .. Intrinsic Functions ..
@@ -145,6 +137,7 @@ subroutine CROTG( a, b, c, s )
145137
else if( f == czero ) then
146138
c = zero
147139
g1 = max( abs(real(g)), abs(aimag(g)) )
140+
rtmax = sqrt( safmax/2 )
148141
if( g1 > rtmin .and. g1 < rtmax ) then
149142
!
150143
! Use unscaled algorithm
@@ -165,6 +158,7 @@ subroutine CROTG( a, b, c, s )
165158
else
166159
f1 = max( abs(real(f)), abs(aimag(f)) )
167160
g1 = max( abs(real(g)), abs(aimag(g)) )
161+
rtmax = sqrt( safmax/4 )
168162
if( f1 > rtmin .and. f1 < rtmax .and. &
169163
g1 > rtmin .and. g1 < rtmax ) then
170164
!
@@ -173,14 +167,36 @@ subroutine CROTG( a, b, c, s )
173167
f2 = ABSSQ( f )
174168
g2 = ABSSQ( g )
175169
h2 = f2 + g2
176-
if( f2 > rtmin .and. h2 < rtmax ) then
177-
d = sqrt( f2*h2 )
170+
! safmin <= f2 <= h2 <= safmax
171+
if( f2 >= h2 * safmin ) then
172+
! safmin <= f2/h2 <= 1, and h2/f2 is finite
173+
c = sqrt( f2 / h2 )
174+
r = f / c
175+
rtmax = rtmax * 2
176+
if( f2 > rtmin .and. h2 < rtmax ) then
177+
! safmin <= sqrt( f2*h2 ) <= safmax
178+
s = conjg( g ) * ( f / sqrt( f2*h2 ) )
179+
else
180+
s = conjg( g ) * ( r / h2 )
181+
end if
178182
else
179-
d = sqrt( f2 )*sqrt( h2 )
183+
! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow.
184+
! Moreover,
185+
! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax,
186+
! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax).
187+
! Also,
188+
! g2 >> f2, which means that h2 = g2.
189+
d = sqrt( f2 * h2 )
190+
c = f2 / d
191+
if( c >= safmin ) then
192+
r = f / c
193+
else
194+
! f2 / sqrt(f2 * h2) < safmin, then
195+
! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax
196+
r = f * ( h2 / d )
197+
end if
198+
s = conjg( g ) * ( f / d )
180199
end if
181-
c = f2 / d
182-
s = conjg( g )*( f / d )
183-
r = f*( h2 / d )
184200
else
185201
!
186202
! Use scaled algorithm
@@ -207,14 +223,39 @@ subroutine CROTG( a, b, c, s )
207223
f2 = ABSSQ( fs )
208224
h2 = f2 + g2
209225
end if
210-
if( f2 > rtmin .and. h2 < rtmax ) then
211-
d = sqrt( f2*h2 )
226+
! safmin <= f2 <= h2 <= safmax
227+
if( f2 >= h2 * safmin ) then
228+
! safmin <= f2/h2 <= 1, and h2/f2 is finite
229+
c = sqrt( f2 / h2 )
230+
r = fs / c
231+
rtmax = rtmax * 2
232+
if( f2 > rtmin .and. h2 < rtmax ) then
233+
! safmin <= sqrt( f2*h2 ) <= safmax
234+
s = conjg( gs ) * ( fs / sqrt( f2*h2 ) )
235+
else
236+
s = conjg( gs ) * ( r / h2 )
237+
end if
212238
else
213-
d = sqrt( f2 )*sqrt( h2 )
239+
! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow.
240+
! Moreover,
241+
! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax,
242+
! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax).
243+
! Also,
244+
! g2 >> f2, which means that h2 = g2.
245+
d = sqrt( f2 * h2 )
246+
c = f2 / d
247+
if( c >= safmin ) then
248+
r = fs / c
249+
else
250+
! f2 / sqrt(f2 * h2) < safmin, then
251+
! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax
252+
r = fs * ( h2 / d )
253+
end if
254+
s = conjg( gs ) * ( fs / d )
214255
end if
215-
c = ( f2 / d )*w
216-
s = conjg( gs )*( fs / d )
217-
r = ( fs*( h2 / d ) )*u
256+
! Rescale c and r
257+
c = c * w
258+
r = r * u
218259
end if
219260
end if
220261
a = r

BLAS/SRC/zrotg.f90

Lines changed: 72 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
!> \brief \b ZROTG
1+
!> \brief \b ZROTG generates a Givens rotation with real cosine and complex sine.
22
!
33
! =========== DOCUMENTATION ===========
44
!
@@ -24,12 +24,12 @@
2424
!> = 1 if x = 0
2525
!> c = |a| / sqrt(|a|**2 + |b|**2)
2626
!> s = sgn(a) * conjg(b) / sqrt(|a|**2 + |b|**2)
27-
!> When a and b are real and r /= 0, the formulas simplify to
2827
!> r = sgn(a)*sqrt(|a|**2 + |b|**2)
28+
!> When a and b are real and r /= 0, the formulas simplify to
2929
!> c = a / r
3030
!> s = b / r
31-
!> the same as in ZROTG when |a| > |b|. When |b| >= |a|, the
32-
!> sign of c and s will be different from those computed by ZROTG
31+
!> the same as in DROTG when |a| > |b|. When |b| >= |a|, the
32+
!> sign of c and s will be different from those computed by DROTG
3333
!> if the signs of a and b are not the same.
3434
!>
3535
!> \endverbatim
@@ -65,20 +65,19 @@
6565
! Authors:
6666
! ========
6767
!
68-
!> \author Edward Anderson, Lockheed Martin
68+
!> \author Weslley Pereira, University of Colorado Denver, USA
6969
!
70-
!> \par Contributors:
71-
! ==================
72-
!>
73-
!> Weslley Pereira, University of Colorado Denver, USA
70+
!> \date December 2021
7471
!
75-
!> \ingroup single_blas_level1
72+
!> \ingroup OTHERauxiliary
7673
!
7774
!> \par Further Details:
7875
! =====================
7976
!>
8077
!> \verbatim
8178
!>
79+
!> Based on the algorithm from
80+
!>
8281
!> Anderson E. (2017)
8382
!> Algorithm 978: Safe Scaling in the Level 1 BLAS
8483
!> ACM Trans Math Softw 44:1--28
@@ -108,21 +107,14 @@ subroutine ZROTG( a, b, c, s )
108107
1-minexponent(0._wp), &
109108
maxexponent(0._wp)-1 &
110109
)
111-
real(wp), parameter :: rtmin = sqrt( real(radix(0._wp),wp)**max( &
112-
minexponent(0._wp)-1, &
113-
1-maxexponent(0._wp) &
114-
) / epsilon(0._wp) )
115-
real(wp), parameter :: rtmax = sqrt( real(radix(0._wp),wp)**max( &
116-
1-minexponent(0._wp), &
117-
maxexponent(0._wp)-1 &
118-
) * epsilon(0._wp) )
110+
real(wp), parameter :: rtmin = sqrt( safmin )
119111
! ..
120112
! .. Scalar Arguments ..
121113
real(wp) :: c
122114
complex(wp) :: a, b, s
123115
! ..
124116
! .. Local Scalars ..
125-
real(wp) :: d, f1, f2, g1, g2, h2, u, v, w
117+
real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmax
126118
complex(wp) :: f, fs, g, gs, r, t
127119
! ..
128120
! .. Intrinsic Functions ..
@@ -145,6 +137,7 @@ subroutine ZROTG( a, b, c, s )
145137
else if( f == czero ) then
146138
c = zero
147139
g1 = max( abs(real(g)), abs(aimag(g)) )
140+
rtmax = sqrt( safmax/2 )
148141
if( g1 > rtmin .and. g1 < rtmax ) then
149142
!
150143
! Use unscaled algorithm
@@ -165,6 +158,7 @@ subroutine ZROTG( a, b, c, s )
165158
else
166159
f1 = max( abs(real(f)), abs(aimag(f)) )
167160
g1 = max( abs(real(g)), abs(aimag(g)) )
161+
rtmax = sqrt( safmax/4 )
168162
if( f1 > rtmin .and. f1 < rtmax .and. &
169163
g1 > rtmin .and. g1 < rtmax ) then
170164
!
@@ -173,14 +167,36 @@ subroutine ZROTG( a, b, c, s )
173167
f2 = ABSSQ( f )
174168
g2 = ABSSQ( g )
175169
h2 = f2 + g2
176-
if( f2 > rtmin .and. h2 < rtmax ) then
177-
d = sqrt( f2*h2 )
170+
! safmin <= f2 <= h2 <= safmax
171+
if( f2 >= h2 * safmin ) then
172+
! safmin <= f2/h2 <= 1, and h2/f2 is finite
173+
c = sqrt( f2 / h2 )
174+
r = f / c
175+
rtmax = rtmax * 2
176+
if( f2 > rtmin .and. h2 < rtmax ) then
177+
! safmin <= sqrt( f2*h2 ) <= safmax
178+
s = conjg( g ) * ( f / sqrt( f2*h2 ) )
179+
else
180+
s = conjg( g ) * ( r / h2 )
181+
end if
178182
else
179-
d = sqrt( f2 )*sqrt( h2 )
183+
! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow.
184+
! Moreover,
185+
! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax,
186+
! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax).
187+
! Also,
188+
! g2 >> f2, which means that h2 = g2.
189+
d = sqrt( f2 * h2 )
190+
c = f2 / d
191+
if( c >= safmin ) then
192+
r = f / c
193+
else
194+
! f2 / sqrt(f2 * h2) < safmin, then
195+
! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax
196+
r = f * ( h2 / d )
197+
end if
198+
s = conjg( g ) * ( f / d )
180199
end if
181-
c = f2 / d
182-
s = conjg( g )*( f / d )
183-
r = f*( h2 / d )
184200
else
185201
!
186202
! Use scaled algorithm
@@ -207,14 +223,39 @@ subroutine ZROTG( a, b, c, s )
207223
f2 = ABSSQ( fs )
208224
h2 = f2 + g2
209225
end if
210-
if( f2 > rtmin .and. h2 < rtmax ) then
211-
d = sqrt( f2*h2 )
226+
! safmin <= f2 <= h2 <= safmax
227+
if( f2 >= h2 * safmin ) then
228+
! safmin <= f2/h2 <= 1, and h2/f2 is finite
229+
c = sqrt( f2 / h2 )
230+
r = fs / c
231+
rtmax = rtmax * 2
232+
if( f2 > rtmin .and. h2 < rtmax ) then
233+
! safmin <= sqrt( f2*h2 ) <= safmax
234+
s = conjg( gs ) * ( fs / sqrt( f2*h2 ) )
235+
else
236+
s = conjg( gs ) * ( r / h2 )
237+
end if
212238
else
213-
d = sqrt( f2 )*sqrt( h2 )
239+
! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow.
240+
! Moreover,
241+
! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax,
242+
! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax).
243+
! Also,
244+
! g2 >> f2, which means that h2 = g2.
245+
d = sqrt( f2 * h2 )
246+
c = f2 / d
247+
if( c >= safmin ) then
248+
r = fs / c
249+
else
250+
! f2 / sqrt(f2 * h2) < safmin, then
251+
! h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax
252+
r = fs * ( h2 / d )
253+
end if
254+
s = conjg( gs ) * ( fs / d )
214255
end if
215-
c = ( f2 / d )*w
216-
s = conjg( gs )*( fs / d )
217-
r = ( fs*( h2 / d ) )*u
256+
! Rescale c and r
257+
c = c * w
258+
r = r * u
218259
end if
219260
end if
220261
a = r

0 commit comments

Comments
 (0)