Skip to content

Commit 26478eb

Browse files
authored
Merge pull request #2345 from wjc404/develop
Optimize AVX2 CGEMM
2 parents 0ae49d2 + eeecd62 commit 26478eb

File tree

5 files changed

+308
-9
lines changed

5 files changed

+308
-9
lines changed

CONTRIBUTORS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,3 +171,10 @@ In chronological order:
171171
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
172172
* [2019-03-14] power9 dgemm/dtrmm kernel
173173
* [2019-04-29] power9 sgemm/strmm kernel
174+
175+
* Jiachen Wang <https://github.com/wjc404>
176+
* [2019-07-29] optimize AVX2 DGEMM
177+
* [2019-10-20] AVX512 DGEMM kernel (4x8)
178+
* [2019-11-06] optimize AVX512 SGEMM
179+
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
180+
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM

kernel/x86_64/KERNEL.HASWELL

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
5656
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
5757

5858
CTRMMKERNEL = cgemm_kernel_8x2_haswell.S
59-
CGEMMKERNEL = cgemm_kernel_8x2_haswell.S
59+
CGEMMKERNEL = cgemm_kernel_8x2_haswell.c
6060
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
6161
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
6262
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c

kernel/x86_64/KERNEL.ZEN

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
5353
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
5454

5555
CTRMMKERNEL = cgemm_kernel_8x2_haswell.S
56-
CGEMMKERNEL = cgemm_kernel_8x2_haswell.S
56+
CGEMMKERNEL = cgemm_kernel_8x2_haswell.c
5757
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
5858
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
5959
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@@ -64,7 +64,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
6464
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
6565

6666
ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S
67-
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S
67+
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c
6868
ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
6969
ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c
7070
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
kernel/x86_64/cgemm_kernel_8x2_haswell.c

Lines changed: 292 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,292 @@
1+
#include "common.h"
2+
#include <stdint.h>
3+
4+
/* recommended settings: GEMM_P = 256, GEMM_Q = 256 */
5+
6+
/*
 * Select the conjugation flags from the two-letter variant macro the file is
 * built with.  A_CONJ is 1 exactly when the first letter is R or C, and
 * B_CONJ is 1 exactly when the second letter is R or C; N and T leave the
 * corresponding flag at 0.
 * NOTE(review): the letters presumably follow the usual BLAS kernel naming
 * (N/T = plain, R/C = conjugated) -- confirm against the build system.
 * The asm macros below only ever test A_CONJ == B_CONJ and A_CONJ == 0.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
7+
#define A_CONJ 0
8+
#define B_CONJ 0
9+
#endif
10+
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
11+
#define A_CONJ 1
12+
#define B_CONJ 0
13+
#endif
14+
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
15+
#define A_CONJ 0
16+
#define B_CONJ 1
17+
#endif
18+
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
19+
#define A_CONJ 1
20+
#define B_CONJ 1
21+
#endif
22+
23+
/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */
24+
/* r11 = m, r12 = k << 4, r13 = k, r14 = b_head, r15 = temp */
25+
26+
/* m=8, ymm 0-3 temp, ymm 4-15 acc */
27+
#if A_CONJ == B_CONJ
28+
#define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
29+
#define acc_m8n1_con(ua,la,b1,uc,lc) "vfmaddsub231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
30+
#else
31+
#define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
32+
#define acc_m8n1_con(ua,la,b1,uc,lc) "vfmsubadd231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
33+
#endif
34+
/* expanded accumulators for m8n1 and m8n2 */
35+
#define KERNEL_k1m8n1 \
36+
"vbroadcastsd (%1),%%ymm0; addq $8,%1;"\
37+
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" acc_m4n1_exp(1,2,0,4,5)\
38+
"vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2;" acc_m4n1_exp(1,2,0,6,7)\
39+
"addq $64,%0;"
40+
#define KERNEL_k1m8n2 \
41+
"vbroadcastsd (%1),%%ymm0; vbroadcastsd 8(%1),%%ymm1; addq $16,%1;"\
42+
"vmovsldup (%0),%%ymm2; vmovshdup (%0),%%ymm3;" acc_m4n1_exp(2,3,0,4,5) acc_m4n1_exp(2,3,1,8,9)\
43+
"vmovsldup 32(%0),%%ymm2; vmovshdup 32(%0),%%ymm3;" acc_m4n1_exp(2,3,0,6,7) acc_m4n1_exp(2,3,1,10,11)\
44+
"addq $64,%0;"
45+
/* contracted accumulators for m8n4 and m8n6 */
46+
#define acc_m8n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \
47+
"vbroadcastss "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m8n1_con(ua,la,2,luc,llc)\
48+
"vbroadcastss "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m8n1_con(ua,la,3,ruc,rlc)
49+
#define KERNEL_1_k1m8n4 \
50+
"vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
51+
acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1)
52+
#define KERNEL_2_k1m8n4 \
53+
"vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\
54+
acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1)
55+
#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2)
56+
#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2)
57+
#define KERNEL_k1m8n4 KERNEL_1_k1m8n4 KERNEL_2_k1m8n4 "addq $16,%1;"
58+
#define KERNEL_k1m8n6 KERNEL_1_k1m8n6 KERNEL_2_k1m8n6 "addq $16,%1;"
59+
#define zero_4ymm(no1,no2,no3,no4) \
60+
"vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\
61+
"vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";"
62+
/* initialization and storage macros */
63+
#define INIT_m8n1 zero_4ymm(4,5,6,7)
64+
#define INIT_m8n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
65+
#define INIT_m8n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
66+
#define INIT_m8n6 INIT_m8n4 zero_4ymm(12,13,14,15)
67+
#if A_CONJ == B_CONJ
68+
#define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";"
69+
#else
70+
#define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";"
71+
#endif
72+
#if A_CONJ == 0
73+
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
74+
"vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213ps "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\
75+
"vfmsubadd231ps %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovups %%ymm"#c","#off"("#__VA_ARGS__");"
76+
#else
77+
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
78+
"vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213ps "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\
79+
"vfmaddsub231ps %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovups %%ymm"#tmp","#off"("#__VA_ARGS__");"
80+
#endif
81+
#define save_init_m8 "movq %2,%3; addq $64,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;"
82+
#define SAVE_m8n1 save_init_m8 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3)
83+
#define SAVE_m8n2 SAVE_m8n1\
84+
cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1)
85+
#define SAVE_m8n4 save_init_m8\
86+
save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\
87+
save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1)
88+
#define SAVE_m8n6 SAVE_m8n4 "leaq (%3,%4,2),%3;"\
89+
save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1)
90+
/*
 * COMPUTE_m8(ndim): asm text for one 8-row strip of C across ndim columns.
 *
 *  setup      b_ptr (%1) is reloaded from r14, the accumulators are zeroed by
 *             INIT_m8n<ndim>, c_tmp (%3) = c_ptr (%2), k_counter (%5) = r13 (K).
 *             K == 0 jumps straight to <ndim>8883; K < 10 jumps to <ndim>8882.
 *  <ndim>8881 main loop: four KERNEL_k1m8n<ndim> steps per pass.  %5 starts at
 *             10 and is bumped by 4 while it stays <= r13, so between 6 and 9
 *             k-steps are always left for the tail loop.  Each pass also issues
 *             prefetches through c_tmp, through b_ptr (+ r12, + 2*r12) and
 *             through b_pref (%8), which is advanced by 16 bytes per pass.
 *             r15 is the c_tmp step: 84, or ldc (%4) when (%5 & 12) == 0.
 *  after loop c_tmp is reset to c_ptr and "negq; leaq 10(r13,%5)" converts %5
 *             into the number of remaining k-steps (K + 10 - %5); alpha (%6)
 *             is prefetched.
 *  <ndim>8882 tail loop: one k-step per pass, prefetching at c_tmp and
 *             advancing c_tmp by ldc each pass.
 *  <ndim>8883 prefetch at r14 (start of the B panel), then SAVE_m8n<ndim>.
 *
 * Labels are built by prepending ndim to 8881/8882/8883.
 */
#define COMPUTE_m8(ndim) \
91+
"movq %%r14,%1;" INIT_m8n##ndim "movq %2,%3; movq %%r13,%5;"\
92+
"testq %5,%5; jz "#ndim"8883f; cmpq $10,%5; jb "#ndim"8882f;"\
93+
"movq $10,%5; movq $84,%%r15;"\
94+
#ndim"8881:\n\t"\
95+
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
96+
"prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
97+
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
98+
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
99+
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
100+
"addq $4,%5; cmpq %5,%%r13; jnb "#ndim"8881b;"\
101+
"movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 7(%6);"\
102+
#ndim"8882:\n\t"\
103+
"prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
104+
KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\
105+
#ndim"8883:\n\t"\
106+
"prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim
107+
108+
/* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
109+
#define KERNEL_k1m4n1 \
110+
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
111+
"vbroadcastsd (%1),%%ymm0;" acc_m4n1_exp(1,2,0,4,5) "addq $8,%1;"
112+
#define acc_m4n2_exp(c1l,c1r,c2l,c2r,...) \
113+
"vbroadcastsd ("#__VA_ARGS__"),%%ymm2;" acc_m4n1_exp(0,1,2,c1l,c1r)\
114+
"vbroadcastsd 8("#__VA_ARGS__"),%%ymm3;" acc_m4n1_exp(0,1,3,c2l,c2r)
115+
#define KERNEL_h_k1m4n2 \
116+
"vmovsldup (%0),%%ymm0; vmovshdup (%0),%%ymm1; addq $32,%0;" acc_m4n2_exp(4,5,6,7,%1)
117+
#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 acc_m4n2_exp(8,9,10,11,%1,%%r12,1)
118+
#define KERNEL_h_k1m4n6 KERNEL_h_k1m4n4 acc_m4n2_exp(12,13,14,15,%1,%%r12,2)
119+
#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
120+
#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
121+
#define KERNEL_k1m4n6 KERNEL_h_k1m4n6 "addq $16,%1;"
122+
#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
123+
#define INIT_m4n2 zero_4ymm(4,5,6,7)
124+
#define INIT_m4n4 INIT_m4n2 zero_4ymm(8,9,10,11)
125+
#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15)
126+
#define save_init_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;"
127+
#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3)
128+
#define SAVE_m4n2 SAVE_m4n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1)
129+
#define SAVE_m4n4 SAVE_m4n2 "leaq (%3,%4,2),%3;"\
130+
cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1)
131+
#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\
132+
cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1)
133+
/*
 * COMPUTE_m4(ndim): asm text for one 4-row strip of C across ndim columns.
 * Reloads b_ptr (%1) from r14, zeroes the accumulators (INIT_m4n<ndim>) and
 * sets k_counter (%5) = r13 (K).  If K is zero the loop is skipped; otherwise
 * KERNEL_k1m4n<ndim> runs once per k-step until %5 reaches zero.
 * SAVE_m4n<ndim> always runs afterwards.  No prefetching is done here.
 * Labels are built by prepending ndim to 4441/4442.
 */
#define COMPUTE_m4(ndim) \
134+
"movq %%r14,%1;" INIT_m4n##ndim "movq %%r13,%5;"\
135+
"testq %5,%5; jz "#ndim"4442f;"\
136+
#ndim"4441:\n\t"\
137+
KERNEL_k1m4n##ndim\
138+
"decq %5; jnz "#ndim"4441b;"\
139+
#ndim"4442:\n\t"\
140+
SAVE_m4n##ndim
141+
142+
/* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */
143+
#if A_CONJ == B_CONJ
144+
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
145+
#else
146+
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
147+
#endif
148+
#define KERNEL_h_k1m2n1 \
149+
"vmovsldup (%0),%%xmm0; vmovshdup (%0),%%xmm1; addq $16,%0;"\
150+
"vmovddup (%1),%%xmm2;" acc_m2n1_exp(0,1,2,4,5)
151+
#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1\
152+
"vmovddup 8(%1),%%xmm3;" acc_m2n1_exp(0,1,3,6,7)
153+
#define acc_m2n2_exp(c1,c2,c3,c4,...)\
154+
"vmovddup ("#__VA_ARGS__"),%%xmm2;" acc_m2n1_exp(0,1,2,c1,c2)\
155+
"vmovddup 8("#__VA_ARGS__"),%%xmm3;" acc_m2n1_exp(0,1,3,c3,c4)
156+
#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1)
157+
#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2)
158+
#define KERNEL_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;"
159+
#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
160+
#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
161+
#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $16,%1;"
162+
#define zero_2xmm(no1,no2) "vpxor %%xmm"#no1",%%xmm"#no1",%%xmm"#no1"; vpxor %%xmm"#no2",%%xmm"#no2",%%xmm"#no2";"
163+
#define INIT_m2n1 zero_2xmm(4,5)
164+
#define INIT_m2n2 INIT_m2n1 zero_2xmm(6,7)
165+
#define INIT_m2n4 INIT_m2n2 zero_2xmm(8,9) zero_2xmm(10,11)
166+
#define INIT_m2n6 INIT_m2n4 zero_2xmm(12,13) zero_2xmm(14,15)
167+
#if A_CONJ == B_CONJ
168+
#define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";"
169+
#else
170+
#define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";"
171+
#endif
172+
#if A_CONJ == 0
173+
#define save_1xmm(c,tmp,alpr,alpi) \
174+
"vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213ps (%3),%%xmm"#alpr",%%xmm"#c";"\
175+
"vfmsubadd231ps %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovups %%xmm"#c",(%3); addq %4,%3;"
176+
#else
177+
#define save_1xmm(c,tmp,alpr,alpi) \
178+
"vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213ps (%3),%%xmm"#alpi",%%xmm"#tmp";"\
179+
"vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovups %%xmm"#tmp",(%3); addq %4,%3;"
180+
#endif
181+
#define save_init_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
182+
#define SAVE_m2n1 save_init_m2 cont_expxmmacc(4,5,4) save_1xmm(4,2,0,1)
183+
/* m=2, n=2 store: column 0 via SAVE_m2n1, then contract xmm6/xmm7 and store column 1.
 * The m=2 accumulators are xmm-wide (INIT_m2n*, acc_m2n1_exp), so the xmm contraction
 * cont_expxmmacc is used here, exactly as SAVE_m2n1 does, instead of the ymm-wide cont_expacc. */
#define SAVE_m2n2 SAVE_m2n1 cont_expxmmacc(6,7,6) save_1xmm(6,3,0,1)
184+
/* m=2, n=4 store: columns 0-1 via SAVE_m2n2, then columns 2 and 3 from xmm8/xmm9 and xmm10/xmm11.
 * The m=2 accumulators are xmm-wide, so the xmm contraction cont_expxmmacc is used
 * (matching SAVE_m2n1) rather than the ymm-wide cont_expacc. */
#define SAVE_m2n4 SAVE_m2n2 cont_expxmmacc(8,9,8) save_1xmm(8,2,0,1) cont_expxmmacc(10,11,10) save_1xmm(10,3,0,1)
185+
/* m=2, n=6 store: columns 0-3 via SAVE_m2n4, then columns 4 and 5 from xmm12/xmm13 and xmm14/xmm15.
 * The m=2 accumulators are xmm-wide, so the xmm contraction cont_expxmmacc is used
 * (matching SAVE_m2n1) rather than the ymm-wide cont_expacc. */
#define SAVE_m2n6 SAVE_m2n4 cont_expxmmacc(12,13,12) save_1xmm(12,2,0,1) cont_expxmmacc(14,15,14) save_1xmm(14,3,0,1)
186+
/*
 * COMPUTE_m2(ndim): asm text for one 2-row strip of C across ndim columns.
 * Reloads b_ptr (%1) from r14, zeroes the xmm accumulators (INIT_m2n<ndim>)
 * and sets k_counter (%5) = r13 (K).  If K is zero the loop is skipped;
 * otherwise KERNEL_k1m2n<ndim> runs once per k-step until %5 reaches zero.
 * SAVE_m2n<ndim> always runs afterwards.
 * Labels are built by prepending ndim to 2221/2222.
 */
#define COMPUTE_m2(ndim) \
187+
"movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\
188+
"testq %5,%5; jz "#ndim"2222f;"\
189+
#ndim"2221:\n\t"\
190+
KERNEL_k1m2n##ndim\
191+
"decq %5; jnz "#ndim"2221b;"\
192+
#ndim"2222:\n\t"\
193+
SAVE_m2n##ndim
194+
195+
/* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */
196+
#if A_CONJ == B_CONJ
197+
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
198+
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
199+
#else
200+
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
201+
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfnmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
202+
#endif
203+
#define KERNEL_k1m1n1 \
204+
"vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
205+
"vmovsd (%1),%%xmm2; addq $8,%1;" acc_m1n1_exp(0,1,2,4,5)
206+
#define KERNEL_h_k1m1n2 \
207+
"vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
208+
"vmovups (%1),%%xmm2;" acc_m1n2_exp(0,1,2,4,5)
209+
#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovups (%1,%%r12,1),%%xmm2;" acc_m1n2_exp(0,1,2,6,7)
210+
#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovups (%1,%%r12,2),%%xmm2;" acc_m1n2_exp(0,1,2,8,9)
211+
#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;"
212+
#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
213+
#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $16,%1;"
214+
#define INIT_m1n1 zero_2xmm(4,5)
215+
#define INIT_m1n2 zero_2xmm(4,5)
216+
#define INIT_m1n4 INIT_m1n2 zero_2xmm(6,7)
217+
#define INIT_m1n6 INIT_m1n4 zero_2xmm(8,9)
218+
#if A_CONJ == 0
219+
#define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
220+
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c";"\
221+
"vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c"; vmovsd %%xmm"#c",(%3);"
222+
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
223+
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
224+
"vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c"; vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c";"\
225+
"vmovsd %%xmm"#c",(%3); vmovhpd %%xmm"#c",(%3,%4,1); leaq (%3,%4,2),%3;"
226+
#else
227+
#define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
228+
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1";"\
229+
"vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1"; vmovsd %%xmm"#tmp1",(%3);"
230+
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
231+
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
232+
"vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1"; vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1";"\
233+
"vmovsd %%xmm"#tmp1",(%3); vmovhpd %%xmm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;"
234+
#endif
235+
#define save_init_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
236+
#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,3,0,1)
237+
#define SAVE_m1n2 save_init_m1 cont_expxmmacc(4,5,4) save_m1n2(4,2,3,0,1)
238+
#define SAVE_m1n4 SAVE_m1n2 cont_expxmmacc(6,7,6) save_m1n2(6,2,3,0,1)
239+
#define SAVE_m1n6 SAVE_m1n4 cont_expxmmacc(8,9,8) save_m1n2(8,2,3,0,1)
240+
/*
 * COMPUTE_m1(ndim): asm text for a single remaining row of C across ndim columns.
 * Reloads b_ptr (%1) from r14, zeroes the xmm accumulators (INIT_m1n<ndim>)
 * and sets k_counter (%5) = r13 (K).  If K is zero the loop is skipped;
 * otherwise KERNEL_k1m1n<ndim> runs once per k-step until %5 reaches zero.
 * SAVE_m1n<ndim> always runs afterwards.
 * Labels are built by prepending ndim to 1111/1112.
 */
#define COMPUTE_m1(ndim) \
241+
"movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\
242+
"testq %5,%5; jz "#ndim"1112f;"\
243+
#ndim"1111:\n\t"\
244+
KERNEL_k1m1n##ndim\
245+
"decq %5; jnz "#ndim"1111b;"\
246+
#ndim"1112:\n\t"\
247+
SAVE_m1n##ndim
248+
249+
/*
 * COMPUTE(ndim): process one panel of ndim columns for all M rows.
 *
 * Expects the locals a_ptr, b_ptr, c_ptr, c_tmp, ldc_in_bytes, K, alp, M,
 * b_pref and LDC to be in scope at the expansion site.
 *
 *  - b_pref is pointed at b_ptr + ndim*K*2 floats, i.e. the position b_ptr
 *    itself is moved to once this panel is finished.
 *  - The asm block stashes b_ptr in r14, K in r13, K << 4 in r12 and M in r11,
 *    then walks the rows: COMPUTE_m8 repeatedly while at least 8 rows remain,
 *    followed by at most one COMPUTE_m4, one COMPUTE_m2 and one COMPUTE_m1.
 *    Before leaving it restores b_ptr, K and M from r14/r13/r11 and issues
 *    vzeroupper.
 *  - a_ptr and c_ptr are left advanced by the asm (the kernel macros step %0,
 *    the save_init_* macros step %2), so afterwards a_ptr is rewound by
 *    M*K*2 floats, b_ptr is moved forward by ndim*K*2 floats, and c_ptr is
 *    moved by (ndim*LDC - M)*2 floats to the top of the next column panel.
 *
 * All nine operands are read-write ("+r"); r11-r15 and xmm0-xmm15 are
 * declared clobbered together with "cc" and "memory".
 * Labels are built by prepending ndim to 9991..9995.
 */
#define COMPUTE(ndim) {\
250+
b_pref = b_ptr + ndim * K *2;\
251+
__asm__ __volatile__ (\
252+
"movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $4,%%r12; movq %7,%%r11;"\
253+
"cmpq $8,%7; jb "#ndim"9992f;"\
254+
#ndim"9991:\n\t"\
255+
COMPUTE_m8(ndim)\
256+
"subq $8,%7; cmpq $8,%7; jnb "#ndim"9991b;"\
257+
#ndim"9992:\n\t"\
258+
"cmpq $4,%7; jb "#ndim"9993f;"\
259+
COMPUTE_m4(ndim) "subq $4,%7;"\
260+
#ndim"9993:\n\t"\
261+
"cmpq $2,%7; jb "#ndim"9994f;"\
262+
COMPUTE_m2(ndim) "subq $2,%7;"\
263+
#ndim"9994:\n\t"\
264+
"testq %7,%7; jz "#ndim"9995f;"\
265+
COMPUTE_m1(ndim)\
266+
#ndim"9995:\n\t"\
267+
"movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\
268+
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\
269+
::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\
270+
"xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
271+
a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
272+
}
273+
274+
/*
 * Single-precision complex GEMM kernel entry point.
 *
 * m, n, k       problem dimensions handed to the COMPUTE macro as M, n_count, K.
 * alphar/alphai real and imaginary part of the scaling factor.
 * A, B          input panels, walked linearly by the asm kernels.
 *               NOTE(review): presumably already packed by the copy routines
 *               named in the KERNEL files -- confirm against the caller.
 * C, LDC        output matrix and its leading dimension; the save_* macros
 *               read C, combine it with the accumulators and alpha, and write
 *               it back.
 *
 * Always returns 0.
 */
int __attribute__ ((noinline))
275+
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
276+
{
277+
/* Early out: an empty dimension or alpha == 0+0i leaves C untouched. */
if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0;
278+
/* Column stride of C in bytes: LDC entries of two floats each. */
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2;
279+
#if A_CONJ == B_CONJ
280+
/* alpha is handed to the asm negated in this configuration.
 * NOTE(review): presumably this compensates for the sign produced by
 * cont_expacc / save_1ymm when A_CONJ == B_CONJ -- confirm. */
float const_val[2] = {-alphar, -alphai};
281+
#else
282+
float const_val[2] = {alphar, alphai};
283+
#endif
284+
int64_t M = (int64_t)m, K = (int64_t)k;
285+
BLASLONG n_count = n;
286+
/* Working pointers updated by COMPUTE; alp points at the {re, im} pair above. */
float *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B;
287+
/* Consume the columns in panels of 6, then 4, then 2, then a final single column. */
for(;n_count>5;n_count-=6) COMPUTE(6)
288+
for(;n_count>3;n_count-=4) COMPUTE(4)
289+
for(;n_count>1;n_count-=2) COMPUTE(2)
290+
if(n_count>0) COMPUTE(1)
291+
return 0;
292+
}

param.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -668,8 +668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
668668

669669
#define SGEMM_DEFAULT_P 768
670670
#define DGEMM_DEFAULT_P 512
671-
#define CGEMM_DEFAULT_P 384
672-
#define ZGEMM_DEFAULT_P 256
671+
#define CGEMM_DEFAULT_P 256
672+
#define ZGEMM_DEFAULT_P 192
673673

674674
#ifdef WINDOWS_ABI
675675
#define SGEMM_DEFAULT_Q 320
@@ -678,8 +678,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
678678
#define SGEMM_DEFAULT_Q 384
679679
#define DGEMM_DEFAULT_Q 256
680680
#endif
681-
#define CGEMM_DEFAULT_Q 192
682-
#define ZGEMM_DEFAULT_Q 128
681+
#define CGEMM_DEFAULT_Q 256
682+
#define ZGEMM_DEFAULT_Q 192
683683

684684
#define SGEMM_DEFAULT_R sgemm_r
685685
#define DGEMM_DEFAULT_R 13824
@@ -1571,7 +1571,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15711571

15721572
#define SGEMM_DEFAULT_P 768
15731573
#define DGEMM_DEFAULT_P 512
1574-
#define CGEMM_DEFAULT_P 384
1574+
#define CGEMM_DEFAULT_P 256
15751575
#define ZGEMM_DEFAULT_P 192
15761576

15771577
#ifdef WINDOWS_ABI
@@ -1581,7 +1581,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15811581
#define SGEMM_DEFAULT_Q 384
15821582
#define DGEMM_DEFAULT_Q 256
15831583
#endif
1584-
#define CGEMM_DEFAULT_Q 192
1584+
#define CGEMM_DEFAULT_Q 256
15851585
#define ZGEMM_DEFAULT_Q 192
15861586

15871587
#define SGEMM_DEFAULT_R sgemm_r

0 commit comments

Comments
 (0)