Commit c815b8f — Merge pull request #2323 from wjc404/develop
"some optimizations of AVX512 DGEMM"
2 parents: a4c3668 + e20709e

File tree: 2 files changed (+83 −91 lines)

kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c

Lines changed: 80 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
#include <stdint.h>
33
#include <immintrin.h>
44

5-
//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
5+
//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
6+
67
/* row-major c_block */
78
#define INNER_KERNEL_k1m1n8 \
89
"prefetcht0 384(%1);"\
@@ -13,18 +14,6 @@
1314
INNER_KERNEL_k1m1n8\
1415
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
1516

16-
#define INNER_KERNEL_k1m4n8 \
17-
INNER_KERNEL_k1m2n8\
18-
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
19-
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
20-
21-
#define INNER_KERNEL_k1m8n8 \
22-
INNER_KERNEL_k1m4n8\
23-
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
24-
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
25-
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
26-
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
27-
2817
#define INNER_KERNEL_k1m1n16 \
2918
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\
3019
"vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\
@@ -34,18 +23,6 @@
3423
INNER_KERNEL_k1m1n16\
3524
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
3625

37-
#define INNER_KERNEL_k1m4n16 \
38-
INNER_KERNEL_k1m2n16\
39-
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
40-
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
41-
42-
#define INNER_KERNEL_k1m8n16 \
43-
INNER_KERNEL_k1m4n16\
44-
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
45-
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
46-
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
47-
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
48-
4926
#define INNER_KERNEL_k1m1n24 \
5027
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\
5128
"vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\
@@ -55,18 +32,48 @@
5532
INNER_KERNEL_k1m1n24\
5633
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
5734

35+
/* row-major z-partition c_block */
36+
#define INNER_KERNEL_k1m4n8 \
37+
"vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\
38+
"vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\
39+
"vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;"
40+
41+
#define INNER_KERNEL_k1m4n16 \
42+
INNER_KERNEL_k1m4n8\
43+
"vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\
44+
"vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;"
45+
5846
#define INNER_KERNEL_k1m4n24 \
59-
INNER_KERNEL_k1m2n24\
60-
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
61-
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
47+
INNER_KERNEL_k1m4n16\
48+
"vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\
49+
"vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;"
6250

63-
#define INNER_KERNEL_k1m8n24 \
64-
INNER_KERNEL_k1m4n24\
65-
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
66-
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
67-
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
68-
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
51+
#define INNER_KERNEL_k1m8n8 \
52+
"vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\
53+
"vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\
54+
"prefetcht0 128(%1);"\
55+
"vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\
56+
"vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\
57+
"vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\
58+
"vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;"
59+
60+
#define INNER_KERNEL_k1m8n16 \
61+
INNER_KERNEL_k1m8n8\
62+
"prefetcht0 128(%1,%%r12,2);"\
63+
"vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\
64+
"vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\
65+
"vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\
66+
"vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;"
6967

68+
#define INNER_KERNEL_k1m8n24 \
69+
INNER_KERNEL_k1m8n16\
70+
"prefetcht0 128(%1,%%r12,4);"\
71+
"vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\
72+
"vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\
73+
"vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\
74+
"vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;"
75+
76+
/* micro kernels */
7077
#define INNER_KERNELm1(nn) \
7178
"cmpq $1,%2;jb "#nn"3f;"\
7279
#nn"4:\n\t"\
@@ -84,26 +91,28 @@
8491
#define INNER_KERNELm4(nn) \
8592
"cmpq $1,%2;jb "#nn"00f;"\
8693
#nn"01:\n\t"\
87-
INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
94+
INNER_KERNEL_k1m4n##nn "addq $64,%1;"\
8895
"decq %2;cmpq $1,%2;jnb "#nn"01b;"\
8996
#nn"00:\n\t"
9097

9198
/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */
9299
#define INNER_KERNELm8(nn) \
93-
"movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
100+
"movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\
94101
#nn"008:\n\t"\
95-
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
96-
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
102+
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
103+
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
104+
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
97105
"prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
98-
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
99-
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
100-
"prefetcht1 (%11); addq $16,%11;"\
101-
"subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
106+
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
107+
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
108+
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
109+
"prefetcht1 (%11); addq $32,%11;"\
110+
"subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\
102111
"movq %3,%10;"\
103112
#nn"001:\n\t"\
104113
"cmpq $1,%2;jb "#nn"000f;"\
105114
"prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
106-
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
115+
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
107116
"decq %2;jmp "#nn"001b;"\
108117
""#nn"000:\n\t"
109118

@@ -207,24 +216,19 @@
207216
INNER_STORE_m1n8(%%zmm13,8)
208217

209218
#define INNER_TRANS_4x8(c1,c2,c3,c4) \
210-
"vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
211-
"vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
212-
"vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
213-
"vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
214-
"vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
215-
"vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
219+
"vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\
220+
"vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\
221+
"vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\
222+
"vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\
223+
224+
#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \
225+
"vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\
226+
"vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\
227+
"vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\
228+
"vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};"
216229

217230
#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
218-
INNER_TRANS_4x8(c1,c2,c3,c4)\
219-
INNER_TRANS_4x8(c5,c6,c7,c8)\
220-
"vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
221-
"vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\
222-
"vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
223-
"vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\
224-
"vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
225-
"vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
226-
"vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
227-
"vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
231+
INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8)
228232

229233
//%7 for k01(input) only when m=4
230234
#define INNER_STORE_4x8(c1,c2,c3,c4) \
@@ -250,41 +254,29 @@
250254
INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
251255

252256
#define INNER_SAVE_m4n16 \
253-
"movq %3,%10;"\
254-
INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
255-
INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
256-
INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
257-
INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
257+
INNER_SAVE_m4n8\
258+
INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
259+
INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
258260

259261
#define INNER_SAVE_m4n24 \
260-
"movq %3,%10;"\
261-
INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
262-
INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
263-
INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
264-
INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
265-
INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
266-
INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
262+
INNER_SAVE_m4n16\
263+
INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\
264+
INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
267265

268266
#define INNER_SAVE_m8n8 \
269267
"movq %3,%10;"\
270268
INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
271269
INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
272270

273271
#define INNER_SAVE_m8n16 \
274-
"movq %3,%10;"\
275-
INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
276-
INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
277-
INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
278-
INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
272+
INNER_SAVE_m8n8\
273+
INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\
274+
INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)
279275

280276
#define INNER_SAVE_m8n24 \
281-
"movq %3,%10;"\
282-
INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
283-
INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
284-
INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
285-
INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
286-
INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
287-
INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
277+
INNER_SAVE_m8n16\
278+
INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\
279+
INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)
288280

289281
#define COMPUTE_n8 {\
290282
b_pref = packed_b_pointer + 8 * K;\
@@ -327,7 +319,7 @@
327319
"shlq $3,%4;addq %4,%3;shrq $3,%4;"\
328320
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
329321
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
330-
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
322+
::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
331323
a_block_pointer -= M * K;\
332324
}
333325
#define COMPUTE_n16 {\
@@ -372,7 +364,7 @@
372364
"leaq (%1,%%r12,4),%1;"\
373365
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
374366
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
375-
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
367+
::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
376368
"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
377369
a_block_pointer -= M * K;\
378370
}
@@ -417,9 +409,9 @@
417409
"shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
418410
"leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
419411
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
420-
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
421-
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
422-
"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
412+
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\
413+
"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\
414+
"zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
423415
a_block_pointer -= M * K;\
424416
}
425417
static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8

param.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1691,16 +1691,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
16911691
#else
16921692

16931693
#define SGEMM_DEFAULT_P 768
1694-
#define DGEMM_DEFAULT_P 512
1694+
#define DGEMM_DEFAULT_P 384
16951695
#define CGEMM_DEFAULT_P 384
16961696
#define ZGEMM_DEFAULT_P 256
16971697

16981698
#ifdef WINDOWS_ABI
16991699
#define SGEMM_DEFAULT_Q 192
1700-
#define DGEMM_DEFAULT_Q 128
1700+
#define DGEMM_DEFAULT_Q 168
17011701
#else
17021702
#define SGEMM_DEFAULT_Q 192
1703-
#define DGEMM_DEFAULT_Q 128
1703+
#define DGEMM_DEFAULT_Q 168
17041704
#endif
17051705
#define CGEMM_DEFAULT_Q 192
17061706
#define ZGEMM_DEFAULT_Q 128

Comments (0)