Commit eb1e9c8

some optimizations

1 parent f95989c commit eb1e9c8

1 file changed: 75 additions, 85 deletions

kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
@@ -2,7 +2,8 @@
 #include <stdint.h>
 #include <immintrin.h>
 
-//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
+//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
+
 /* row-major c_block */
 #define INNER_KERNEL_k1m1n8 \
 "prefetcht0 384(%1);"\
@@ -13,18 +14,6 @@
 INNER_KERNEL_k1m1n8\
 "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
 
-#define INNER_KERNEL_k1m4n8 \
-INNER_KERNEL_k1m2n8\
-"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
-"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
-
-#define INNER_KERNEL_k1m8n8 \
-INNER_KERNEL_k1m4n8\
-"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
-"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
-"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
-"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
-
 #define INNER_KERNEL_k1m1n16 \
 "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\
 "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\
@@ -34,18 +23,6 @@
 INNER_KERNEL_k1m1n16\
 "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
 
-#define INNER_KERNEL_k1m4n16 \
-INNER_KERNEL_k1m2n16\
-"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
-"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
-
-#define INNER_KERNEL_k1m8n16 \
-INNER_KERNEL_k1m4n16\
-"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
-"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
-"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
-"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
-
 #define INNER_KERNEL_k1m1n24 \
 "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\
 "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\
@@ -55,18 +32,48 @@
 INNER_KERNEL_k1m1n24\
 "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
 
+/* row-major z-partition c_block */
+#define INNER_KERNEL_k1m4n8 \
+"vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\
+"vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\
+"vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;"
+
+#define INNER_KERNEL_k1m4n16 \
+INNER_KERNEL_k1m4n8\
+"vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\
+"vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;"
+
 #define INNER_KERNEL_k1m4n24 \
-INNER_KERNEL_k1m2n24\
-"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
-"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
+INNER_KERNEL_k1m4n16\
+"vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\
+"vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;"
 
-#define INNER_KERNEL_k1m8n24 \
-INNER_KERNEL_k1m4n24\
-"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
-"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
-"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
-"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
+#define INNER_KERNEL_k1m8n8 \
+"vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\
+"vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\
+"prefetcht0 128(%1);"\
+"vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\
+"vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\
+"vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\
+"vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;"
+
+#define INNER_KERNEL_k1m8n16 \
+INNER_KERNEL_k1m8n8\
+"prefetcht0 128(%1,%%r12,2);"\
+"vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\
+"vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\
+"vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\
+"vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;"
 
+#define INNER_KERNEL_k1m8n24 \
+INNER_KERNEL_k1m8n16\
+"prefetcht0 128(%1,%%r12,4);"\
+"vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\
+"vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\
+"vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\
+"vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;"
+
+/* micro kernels */
 #define INNER_KERNELm1(nn) \
 "cmpq $1,%2;jb "#nn"3f;"\
 #nn"4:\n\t"\
@@ -84,26 +91,26 @@
 #define INNER_KERNELm4(nn) \
 "cmpq $1,%2;jb "#nn"00f;"\
 #nn"01:\n\t"\
-INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
+INNER_KERNEL_k1m4n##nn "addq $64,%1;"\
 "decq %2;cmpq $1,%2;jnb "#nn"01b;"\
 #nn"00:\n\t"
 
 /* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */
 #define INNER_KERNELm8(nn) \
 "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
 #nn"008:\n\t"\
-INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
+INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
 "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
-INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
+INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
 "prefetcht1 (%11); addq $16,%11;"\
 "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
 "movq %3,%10;"\
 #nn"001:\n\t"\
 "cmpq $1,%2;jb "#nn"000f;"\
 "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
-INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
 "decq %2;jmp "#nn"001b;"\
 ""#nn"000:\n\t"
 
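The "addq $32,%0" to "addq $64,%1" edits in this hunk follow from the kernel change above: the z-partition kernels now advance the A pointer themselves, so the loop wrappers advance B by one 8-double row per k-iteration instead. A trivial C model of the new bookkeeping (names illustrative):

/* Per k-iteration pointer movement in the m4/m8 loops (model only). */
static void km_loop_model(const double **a, const double **b, long k)
{
    while (k-- > 0) {
        *a += 4;  /* done inside the kernel macro: "addq $32,%0" */
        *b += 8;  /* done by the loop wrapper:     "addq $64,%1" */
    }
}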

@@ -207,24 +214,19 @@
 INNER_STORE_m1n8(%%zmm13,8)
 
 #define INNER_TRANS_4x8(c1,c2,c3,c4) \
-"vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
-"vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
-"vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
-"vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
-"vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
-"vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
+"vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\
+"vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\
+"vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\
+"vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\
+
+#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \
+"vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\
+"vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\
+"vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\
+"vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};"
 
 #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
-INNER_TRANS_4x8(c1,c2,c3,c4)\
-INNER_TRANS_4x8(c5,c6,c7,c8)\
-"vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
-"vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\
-"vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
-"vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\
-"vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
-"vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
-"vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
-"vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
+INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8)
 
 //%7 for k01(input) only when m=4
 #define INNER_STORE_4x8(c1,c2,c3,c4) \
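The transpose rework above replaces the element-wise shuffles with INNER_TRANS_f128_4x4, which treats four zmm registers as a 4x4 grid of 128-bit (2-double) blocks and transposes at block granularity via vshuff64x2 plus masked vblendmpd; INNER_TRANS_8x8 then reduces to two such block transposes. A plain-C reference model of a 4x4 block transpose follows (hypothetical helper, for illustration only; the asm keeps everything in registers and resolves lane order through the masked blends):

/* c[i] holds 8 doubles = four 2-double (128-bit) blocks. Transposing at
   block granularity swaps block (i,j) with block (j,i); the two doubles
   inside each block stay put. */
static void trans_f128_4x4_ref(double c[4][8])
{
    for (int i = 0; i < 4; i++)
        for (int j = i + 1; j < 4; j++)
            for (int e = 0; e < 2; e++) {
                double t = c[i][2 * j + e];
                c[i][2 * j + e] = c[j][2 * i + e];
                c[j][2 * i + e] = t;
            }
}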
@@ -250,41 +252,29 @@
 INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
 
 #define INNER_SAVE_m4n16 \
-"movq %3,%10;"\
-INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
-INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
-INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
-INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
+INNER_SAVE_m4n8\
+INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
+INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
 
 #define INNER_SAVE_m4n24 \
-"movq %3,%10;"\
-INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
-INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
-INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
-INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
-INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
-INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
+INNER_SAVE_m4n16\
+INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\
+INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
 
 #define INNER_SAVE_m8n8 \
 "movq %3,%10;"\
 INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
 INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
 
 #define INNER_SAVE_m8n16 \
-"movq %3,%10;"\
-INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
-INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
-INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
-INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
+INNER_SAVE_m8n8\
+INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\
+INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)
 
 #define INNER_SAVE_m8n24 \
-"movq %3,%10;"\
-INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
-INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
-INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
-INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
-INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
-INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
+INNER_SAVE_m8n16\
+INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\
+INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)
 
 #define COMPUTE_n8 {\
 b_pref = packed_b_pointer + 8 * K;\
@@ -327,7 +317,7 @@
 "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
 :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
 "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
-::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
+::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
 a_block_pointer -= M * K;\
 }
 #define COMPUTE_n16 {\
@@ -372,7 +362,7 @@
 "leaq (%1,%%r12,4),%1;"\
 :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
 "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
-::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
+::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
 "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
 a_block_pointer -= M * K;\
 }
@@ -417,9 +407,9 @@
 "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
 "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
 :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
-"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
-::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
-"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
+"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\
+"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\
+"zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
 a_block_pointer -= M * K;\
 }
 static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8
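The clobber-list edits in all three COMPUTE_n* blocks mirror the updated header comment: the new kernels use zmm1 and zmm2 (with zmm0 also reserved) as temporaries, and every register an extended-asm statement overwrites must be declared so the compiler keeps no live value there. A minimal standalone sketch of the same pattern (hypothetical function, assumes -mavx512f; not from this file):

/* zmm0/zmm1 are scratch inside the asm, so they are listed as clobbers,
   just as the commit adds "zmm0"-"zmm2" to the COMPUTE_n* statements. */
static void scale8(double *dst, const double *src, const double *alpha)
{
    __asm__ __volatile__(
        "vbroadcastsd (%2),%%zmm0;"   /* zmm0 <- alpha (temporary) */
        "vmulpd (%1),%%zmm0,%%zmm1;"  /* zmm1 <- alpha * src[0..7] */
        "vmovupd %%zmm1,(%0);"
        : /* no outputs */
        : "r"(dst), "r"(src), "r"(alpha)
        : "zmm0", "zmm1", "memory");
}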
