Skip to content

Commit 7d90a67

Browse files
authored
Merge pull request #681 from angsch/laqr5
Improve FMA usage in laqr5
2 parents f40d220 + bd8f99b commit 7d90a67

File tree

4 files changed

+223
-174
lines changed

4 files changed

+223
-174
lines changed

SRC/claqr5.f

Lines changed: 52 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
279279
PARAMETER ( RZERO = 0.0e0, RONE = 1.0e0 )
280280
* ..
281281
* .. Local Scalars ..
282-
COMPLEX ALPHA, BETA, CDUM, REFSUM
282+
COMPLEX ALPHA, BETA, CDUM, REFSUM, T1, T2, T3
283283
REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL,
284284
$ SMLNUM, TST1, TST2, ULP
285285
INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN,
@@ -424,12 +424,12 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
424424
* ==== Perform update from right within
425425
* . computational window. ====
426426
*
427+
T1 = V( 1, M22 )
428+
T2 = T1*CONJG( V( 2, M22 ) )
427429
DO 30 J = JTOP, MIN( KBOT, K+3 )
428-
REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )*
429-
$ H( J, K+2 ) )
430-
H( J, K+1 ) = H( J, K+1 ) - REFSUM
431-
H( J, K+2 ) = H( J, K+2 ) -
432-
$ REFSUM*CONJG( V( 2, M22 ) )
430+
REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 )
431+
H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1
432+
H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2
433433
30 CONTINUE
434434
*
435435
* ==== Perform update from left within
@@ -442,12 +442,13 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
442442
ELSE
443443
JBOT = KBOT
444444
END IF
445+
T1 = CONJG( V( 1, M22 ) )
446+
T2 = T1*V( 2, M22 )
445447
DO 40 J = K+1, JBOT
446-
REFSUM = CONJG( V( 1, M22 ) )*
447-
$ ( H( K+1, J )+CONJG( V( 2, M22 ) )*
448-
$ H( K+2, J ) )
449-
H( K+1, J ) = H( K+1, J ) - REFSUM
450-
H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 )
448+
REFSUM = H( K+1, J ) +
449+
$ CONJG( V( 2, M22 ) )*H( K+2, J )
450+
H( K+1, J ) = H( K+1, J ) - REFSUM*T1
451+
H( K+2, J ) = H( K+2, J ) - REFSUM*T2
451452
40 CONTINUE
452453
*
453454
* ==== The following convergence test requires that
@@ -610,25 +611,28 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
610611
* . deflation check. We still delay most of the
611612
* . updates from the left for efficiency. ====
612613
*
614+
T1 = V( 1, M )
615+
T2 = T1*CONJG( V( 2, M ) )
616+
T3 = T1*CONJG( V( 3, M ) )
613617
DO 70 J = JTOP, MIN( KBOT, K+3 )
614-
REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )*
615-
$ H( J, K+2 )+V( 3, M )*H( J, K+3 ) )
616-
H( J, K+1 ) = H( J, K+1 ) - REFSUM
617-
H( J, K+2 ) = H( J, K+2 ) -
618-
$ REFSUM*CONJG( V( 2, M ) )
619-
H( J, K+3 ) = H( J, K+3 ) -
620-
$ REFSUM*CONJG( V( 3, M ) )
618+
REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 )
619+
$ + V( 3, M )*H( J, K+3 )
620+
H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1
621+
H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2
622+
H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3
621623
70 CONTINUE
622624
*
623625
* ==== Perform update from left for subsequent
624626
* . column. ====
625627
*
626-
REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 )
627-
$ +CONJG( V( 2, M ) )*H( K+2, K+1 )
628-
$ +CONJG( V( 3, M ) )*H( K+3, K+1 ) )
629-
H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM
630-
H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M )
631-
H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M )
628+
T1 = CONJG( V( 1, M ) )
629+
T2 = T1*V( 2, M )
630+
T3 = T1*V( 3, M )
631+
REFSUM = H( K+1, K+1 ) + CONJG( V( 2, M ) )*H( K+2, K+1 )
632+
$ + CONJG( V( 3, M ) )*H( K+3, K+1 )
633+
H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1
634+
H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2
635+
H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3
632636
*
633637
* ==== The following convergence test requires that
634638
* . the tradition small-compared-to-nearby-diagonals
@@ -688,13 +692,15 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
688692
*
689693
DO 100 M = MBOT, MTOP, -1
690694
K = KRCOL + 2*( M-1 )
695+
T1 = CONJG( V( 1, M ) )
696+
T2 = T1*V( 2, M )
697+
T3 = T1*V( 3, M )
691698
DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT
692-
REFSUM = CONJG( V( 1, M ) )*
693-
$ ( H( K+1, J )+CONJG( V( 2, M ) )*
694-
$ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) )
695-
H( K+1, J ) = H( K+1, J ) - REFSUM
696-
H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M )
697-
H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M )
699+
REFSUM = H( K+1, J ) + CONJG( V( 2, M ) )*
700+
$ H( K+2, J ) + CONJG( V( 3, M ) )*H( K+3, J )
701+
H( K+1, J ) = H( K+1, J ) - REFSUM*T1
702+
H( K+2, J ) = H( K+2, J ) - REFSUM*T2
703+
H( K+3, J ) = H( K+3, J ) - REFSUM*T3
698704
90 CONTINUE
699705
100 CONTINUE
700706
*
@@ -712,14 +718,15 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
712718
I2 = MAX( 1, KTOP-INCOL )
713719
I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 )
714720
I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 )
721+
T1 = V( 1, M )
722+
T2 = T1*CONJG( V( 2, M ) )
723+
T3 = T1*CONJG( V( 3, M ) )
715724
DO 110 J = I2, I4
716-
REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )*
717-
$ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) )
718-
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM
719-
U( J, KMS+2 ) = U( J, KMS+2 ) -
720-
$ REFSUM*CONJG( V( 2, M ) )
721-
U( J, KMS+3 ) = U( J, KMS+3 ) -
722-
$ REFSUM*CONJG( V( 3, M ) )
725+
REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 )
726+
$ + V( 3, M )*U( J, KMS+3 )
727+
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1
728+
U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2
729+
U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3
723730
110 CONTINUE
724731
120 CONTINUE
725732
ELSE IF( WANTZ ) THEN
@@ -730,14 +737,15 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
730737
*
731738
DO 140 M = MBOT, MTOP, -1
732739
K = KRCOL + 2*( M-1 )
740+
T1 = V( 1, M )
741+
T2 = T1*CONJG( V( 2, M ) )
742+
T3 = T1*CONJG( V( 3, M ) )
733743
DO 130 J = ILOZ, IHIZ
734-
REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )*
735-
$ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) )
736-
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM
737-
Z( J, K+2 ) = Z( J, K+2 ) -
738-
$ REFSUM*CONJG( V( 2, M ) )
739-
Z( J, K+3 ) = Z( J, K+3 ) -
740-
$ REFSUM*CONJG( V( 3, M ) )
744+
REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 )
745+
$ + V( 3, M )*Z( J, K+3 )
746+
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1
747+
Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2
748+
Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3
741749
130 CONTINUE
742750
140 CONTINUE
743751
END IF

SRC/dlaqr5.f

Lines changed: 59 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -286,8 +286,8 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
286286
* ..
287287
* .. Local Scalars ..
288288
DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM,
289-
$ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2,
290-
$ ULP
289+
$ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2,
290+
$ T3, TST1, TST2, ULP
291291
INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN,
292292
$ JROW, JTOP, K, K1, KDU, KMS, KRCOL,
293293
$ M, M22, MBOT, MTOP, NBMPS, NDCOL,
@@ -447,11 +447,12 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
447447
* ==== Perform update from right within
448448
* . computational window. ====
449449
*
450+
T1 = V( 1, M22 )
451+
T2 = T1*V( 2, M22 )
450452
DO 30 J = JTOP, MIN( KBOT, K+3 )
451-
REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )*
452-
$ H( J, K+2 ) )
453-
H( J, K+1 ) = H( J, K+1 ) - REFSUM
454-
H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 )
453+
REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 )
454+
H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1
455+
H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2
455456
30 CONTINUE
456457
*
457458
* ==== Perform update from left within
@@ -464,11 +465,12 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
464465
ELSE
465466
JBOT = KBOT
466467
END IF
468+
T1 = V( 1, M22 )
469+
T2 = T1*V( 2, M22 )
467470
DO 40 J = K+1, JBOT
468-
REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )*
469-
$ H( K+2, J ) )
470-
H( K+1, J ) = H( K+1, J ) - REFSUM
471-
H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 )
471+
REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J )
472+
H( K+1, J ) = H( K+1, J ) - REFSUM*T1
473+
H( K+2, J ) = H( K+2, J ) - REFSUM*T2
472474
40 CONTINUE
473475
*
474476
* ==== The following convergence test requires that
@@ -522,18 +524,20 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
522524
*
523525
IF( ACCUM ) THEN
524526
KMS = K - INCOL
527+
T1 = V( 1, M22 )
528+
T2 = T1*V( 2, M22 )
525529
DO 50 J = MAX( 1, KTOP-INCOL ), KDU
526-
REFSUM = V( 1, M22 )*( U( J, KMS+1 )+
527-
$ V( 2, M22 )*U( J, KMS+2 ) )
528-
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM
529-
U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 )
530+
REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 )
531+
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1
532+
U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2
530533
50 CONTINUE
531534
ELSE IF( WANTZ ) THEN
535+
T1 = V( 1, M22 )
536+
T2 = T1*V( 2, M22 )
532537
DO 60 J = ILOZ, IHIZ
533-
REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )*
534-
$ Z( J, K+2 ) )
535-
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM
536-
Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 )
538+
REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 )
539+
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1
540+
Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2
537541
60 CONTINUE
538542
END IF
539543
END IF
@@ -631,22 +635,25 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
631635
* . deflation check. We still delay most of the
632636
* . updates from the left for efficiency. ====
633637
*
638+
T1 = V( 1, M )
639+
T2 = T1*V( 2, M )
640+
T3 = T1*V( 3, M )
634641
DO 70 J = JTOP, MIN( KBOT, K+3 )
635-
REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )*
636-
$ H( J, K+2 )+V( 3, M )*H( J, K+3 ) )
637-
H( J, K+1 ) = H( J, K+1 ) - REFSUM
638-
H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M )
639-
H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M )
642+
REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 )
643+
$ + V( 3, M )*H( J, K+3 )
644+
H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1
645+
H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2
646+
H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3
640647
70 CONTINUE
641648
*
642649
* ==== Perform update from left for subsequent
643650
* . column. ====
644651
*
645-
REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )*
646-
$ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) )
647-
H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM
648-
H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M )
649-
H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M )
652+
REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 )
653+
$ + V( 3, M )*H( K+3, K+1 )
654+
H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1
655+
H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2
656+
H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3
650657
*
651658
* ==== The following convergence test requires that
652659
* . the tradition small-compared-to-nearby-diagonals
@@ -706,12 +713,15 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
706713
*
707714
DO 100 M = MBOT, MTOP, -1
708715
K = KRCOL + 2*( M-1 )
716+
T1 = V( 1, M )
717+
T2 = T1*V( 2, M )
718+
T3 = T1*V( 3, M )
709719
DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT
710-
REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )*
711-
$ H( K+2, J )+V( 3, M )*H( K+3, J ) )
712-
H( K+1, J ) = H( K+1, J ) - REFSUM
713-
H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M )
714-
H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M )
720+
REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J )
721+
$ + V( 3, M )*H( K+3, J )
722+
H( K+1, J ) = H( K+1, J ) - REFSUM*T1
723+
H( K+2, J ) = H( K+2, J ) - REFSUM*T2
724+
H( K+3, J ) = H( K+3, J ) - REFSUM*T3
715725
90 CONTINUE
716726
100 CONTINUE
717727
*
@@ -729,12 +739,15 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
729739
I2 = MAX( 1, KTOP-INCOL )
730740
I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 )
731741
I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 )
742+
T1 = V( 1, M )
743+
T2 = T1*V( 2, M )
744+
T3 = T1*V( 3, M )
732745
DO 110 J = I2, I4
733-
REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )*
734-
$ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) )
735-
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM
736-
U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M )
737-
U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M )
746+
REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 )
747+
$ + V( 3, M )*U( J, KMS+3 )
748+
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1
749+
U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2
750+
U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3
738751
110 CONTINUE
739752
120 CONTINUE
740753
ELSE IF( WANTZ ) THEN
@@ -745,12 +758,15 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
745758
*
746759
DO 140 M = MBOT, MTOP, -1
747760
K = KRCOL + 2*( M-1 )
761+
T1 = V( 1, M )
762+
T2 = T1*V( 2, M )
763+
T3 = T1*V( 3, M )
748764
DO 130 J = ILOZ, IHIZ
749-
REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )*
750-
$ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) )
751-
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM
752-
Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M )
753-
Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M )
765+
REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 )
766+
$ + V( 3, M )*Z( J, K+3 )
767+
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1
768+
Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2
769+
Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3
754770
130 CONTINUE
755771
140 CONTINUE
756772
END IF

0 commit comments

Comments
 (0)