Skip to content

Commit b8f3605

Browse files
authored
Merge pull request #23 from xianyi/develop
rebase
2 parents: fbb8949 + b36018b; commit: b8f3605

18 files changed: +566 / -28 lines changed

CONTRIBUTORS.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -179,3 +179,4 @@ In chronological order:
179179
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
180180
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
181181
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
182+
* [2020-01-07] optimize AVX2 SGEMM and STRMM

kernel/x86_64/KERNEL.HASWELL

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -31,11 +31,11 @@ DAXPYKERNEL = daxpy.c
3131
CAXPYKERNEL = caxpy.c
3232
ZAXPYKERNEL = zaxpy.c
3333

34-
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
35-
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
34+
STRMMKERNEL = sgemm_kernel_8x4_haswell.c
35+
SGEMMKERNEL = sgemm_kernel_8x4_haswell.c
3636
SGEMM_BETA = sgemm_beta_skylakex.c
37-
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
38-
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
37+
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
38+
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
3939
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
4040
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
4141
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)

kernel/x86_64/KERNEL.SKYLAKEX

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
include $(KERNELDIR)/KERNEL.HASWELL
22

33
SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c
4-
4+
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
55
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
66
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
77
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c

kernel/x86_64/KERNEL.ZEN

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -30,10 +30,10 @@ DAXPYKERNEL = daxpy.c
3030
CAXPYKERNEL = caxpy.c
3131
ZAXPYKERNEL = zaxpy.c
3232

33-
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
34-
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
35-
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
36-
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
33+
STRMMKERNEL = sgemm_kernel_8x4_haswell.c
34+
SGEMMKERNEL = sgemm_kernel_8x4_haswell.c
35+
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
36+
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
3737
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
3838
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
3939
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)

kernel/x86_64/sgemm_kernel_8x4_haswell.c

Lines changed: 490 additions & 0 deletions
Large diffs are not rendered by default.

lapack-netlib/LAPACKE/src/lapacke_cheev_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,11 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo,
7878
info = info - 1;
7979
}
8080
/* Transpose output matrices */
81-
LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
81+
if ( jobz == 'V') {
82+
LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
83+
} else {
84+
LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
85+
}
8286
/* Release memory and exit */
8387
LAPACKE_free( a_t );
8488
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,11 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo,
7979
info = info - 1;
8080
}
8181
/* Transpose output matrices */
82-
LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
82+
if ( jobz == 'V') {
83+
LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
84+
} else {
85+
LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
86+
}
8387
/* Release memory and exit */
8488
LAPACKE_free( a_t );
8589
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -79,8 +79,11 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo,
7979
info = info - 1;
8080
}
8181
/* Transpose output matrices */
82-
LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
83-
82+
if ( jobz == 'V') {
83+
LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
84+
} else {
85+
LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
86+
}
8487
/* Release memory and exit */
8588
LAPACKE_free( a_t );
8689
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,11 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo,
7272
info = info - 1;
7373
}
7474
/* Transpose output matrices */
75-
LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
75+
if ( jobz == 'V') {
76+
LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
77+
} else {
78+
LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
79+
}
7680
/* Release memory and exit */
7781
LAPACKE_free( a_t );
7882
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,11 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo,
7676
info = info - 1;
7777
}
7878
/* Transpose output matrices */
79-
LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
79+
if ( jobz == 'V') {
80+
LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
81+
} else {
82+
LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
83+
}
8084
/* Release memory and exit */
8185
LAPACKE_free( a_t );
8286
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,11 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo,
7676
info = info - 1;
7777
}
7878
/* Transpose output matrices */
79-
LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
79+
if ( jobz == 'V') {
80+
LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
81+
} else {
82+
LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
83+
}
8084
/* Release memory and exit */
8185
LAPACKE_free( a_t );
8286
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,11 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, char jobz, char uplo,
7272
info = info - 1;
7373
}
7474
/* Transpose output matrices */
75-
LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
75+
if ( jobz == 'V') {
76+
LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
77+
} else {
78+
LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
79+
}
7680
/* Release memory and exit */
7781
LAPACKE_free( a_t );
7882
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,11 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo,
7676
info = info - 1;
7777
}
7878
/* Transpose output matrices */
79-
LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
79+
if ( jobz == 'V') {
80+
LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
81+
} else {
82+
LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
83+
}
8084
/* Release memory and exit */
8185
LAPACKE_free( a_t );
8286
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,11 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo,
7676
info = info - 1;
7777
}
7878
/* Transpose output matrices */
79-
LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
79+
if ( jobz == 'V') {
80+
LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
81+
} else {
82+
LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
83+
}
8084
/* Release memory and exit */
8185
LAPACKE_free( a_t );
8286
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_zheev_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,11 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo,
7878
info = info - 1;
7979
}
8080
/* Transpose output matrices */
81-
LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
81+
if ( jobz == 'V') {
82+
LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
83+
} else {
84+
LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
85+
}
8286
/* Release memory and exit */
8387
LAPACKE_free( a_t );
8488
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,11 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo,
7979
info = info - 1;
8080
}
8181
/* Transpose output matrices */
82-
LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
82+
if ( jobz == 'V') {
83+
LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
84+
} else {
85+
LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
86+
}
8387
/* Release memory and exit */
8488
LAPACKE_free( a_t );
8589
exit_level_0:

lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,11 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo,
7979
info = info - 1;
8080
}
8181
/* Transpose output matrices */
82-
LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
82+
if ( jobz == 'V') {
83+
LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
84+
} else {
85+
LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
86+
}
8387
/* Release memory and exit */
8488
LAPACKE_free( a_t );
8589
exit_level_0:

param.h

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -625,7 +625,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
625625

626626
#else
627627

628-
#define SGEMM_DEFAULT_UNROLL_M 16
628+
#define SGEMM_DEFAULT_UNROLL_M 8
629629
#define DGEMM_DEFAULT_UNROLL_M 4
630630
#define QGEMM_DEFAULT_UNROLL_M 2
631631
#define CGEMM_DEFAULT_UNROLL_M 8
@@ -666,7 +666,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
666666

667667
#else
668668

669-
#define SGEMM_DEFAULT_P 768
669+
#define SGEMM_DEFAULT_P 320
670670
#define DGEMM_DEFAULT_P 512
671671
#define CGEMM_DEFAULT_P 256
672672
#define ZGEMM_DEFAULT_P 192
@@ -675,7 +675,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
675675
#define SGEMM_DEFAULT_Q 320
676676
#define DGEMM_DEFAULT_Q 128
677677
#else
678-
#define SGEMM_DEFAULT_Q 384
678+
#define SGEMM_DEFAULT_Q 320
679679
#define DGEMM_DEFAULT_Q 256
680680
#endif
681681
#define CGEMM_DEFAULT_Q 256
@@ -1528,7 +1528,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15281528

15291529
#else
15301530

1531-
#define SGEMM_DEFAULT_UNROLL_M 16
1531+
#define SGEMM_DEFAULT_UNROLL_M 8
15321532
#define DGEMM_DEFAULT_UNROLL_M 4
15331533
#define QGEMM_DEFAULT_UNROLL_M 2
15341534
#define CGEMM_DEFAULT_UNROLL_M 8
@@ -1569,7 +1569,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15691569

15701570
#else
15711571

1572-
#define SGEMM_DEFAULT_P 768
1572+
#define SGEMM_DEFAULT_P 320
15731573
#define DGEMM_DEFAULT_P 512
15741574
#define CGEMM_DEFAULT_P 256
15751575
#define ZGEMM_DEFAULT_P 192
@@ -1578,7 +1578,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15781578
#define SGEMM_DEFAULT_Q 320
15791579
#define DGEMM_DEFAULT_Q 128
15801580
#else
1581-
#define SGEMM_DEFAULT_Q 384
1581+
#define SGEMM_DEFAULT_Q 320
15821582
#define DGEMM_DEFAULT_Q 256
15831583
#endif
15841584
#define CGEMM_DEFAULT_Q 256

0 commit comments

Comments
 (0)