Commit 4c6a457

Merge pull request #2300 from wjc404/develop
Optimize SGEMM on SKYLAKEX CPUs
2 parents: d403eb3 + 836c414

4 files changed: +951 additions, -60 deletions

kernel/x86_64/KERNEL.SKYLAKEX

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 include $(KERNELDIR)/KERNEL.HASWELL
 
-SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c
+SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c
 
 SGEMMINCOPY = ../generic/gemm_ncopy_16.c
 SGEMMITCOPY = sgemm_tcopy_16_skylakex.c

kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c

Lines changed: 69 additions & 57 deletions
@@ -88,20 +88,21 @@
 "decq %2;cmpq $1,%2;jnb "#nn"01b;"\
 #nn"00:\n\t"
 
+/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */
 #define INNER_KERNELm8(nn) \
-"cmpq $8,%2;jb "#nn"001f;"\
+"movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
 #nn"008:\n\t"\
 INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
 INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+"prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
 INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
 INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-"subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\
+"prefetcht1 (%11); addq $16,%11;"\
+"subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
+"movq %3,%10;"\
 #nn"001:\n\t"\
 "cmpq $1,%2;jb "#nn"000f;"\
+"prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
 INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
 "decq %2;jmp "#nn"001b;"\
 ""#nn"000:\n\t"
@@ -158,60 +159,61 @@
 
 #define INNER_STORE_m1n8(c1,disp) \
 "kxnorw %%k1,%%k1,%%k1;"\
-"vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\
+"vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\
 "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
 "kxnorw %%k1,%%k1,%%k1;"\
-"vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};"
+"vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};"
 
 #define INNER_SAVE_m1n8 \
+"movq %3,%10;"\
 INNER_SETINDEX\
 INNER_STORE_m1n8(%%zmm8,0)
 
 #define INNER_SAVE_m1n16 \
 INNER_SAVE_m1n8\
-"leaq (%3,%4,8),%3;"\
+"leaq (%10,%4,8),%10;"\
 INNER_STORE_m1n8(%%zmm9,0)
 
 #define INNER_SAVE_m1n24 \
 INNER_SAVE_m1n16\
-"leaq (%3,%4,8),%3;"\
+"leaq (%10,%4,8),%10;"\
 INNER_STORE_m1n8(%%zmm10,0)
 
 #define INNER_SAVE_m2n8 \
+"movq %3,%10;"\
 INNER_SETINDEX\
 INNER_STORE_m1n8(%%zmm8,0)\
 INNER_STORE_m1n8(%%zmm9,8)
 
 #define INNER_SAVE_m2n16 \
+"movq %3,%10;"\
 INNER_SETINDEX\
 INNER_STORE_m1n8(%%zmm8,0)\
 INNER_STORE_m1n8(%%zmm10,8)\
-"leaq (%3,%4,8),%3;"\
+"leaq (%10,%4,8),%10;"\
 INNER_STORE_m1n8(%%zmm9,0)\
 INNER_STORE_m1n8(%%zmm11,8)
+
 #define INNER_SAVE_m2n24 \
+"movq %3,%10;"\
 INNER_SETINDEX\
 INNER_STORE_m1n8(%%zmm8,0)\
 INNER_STORE_m1n8(%%zmm11,8)\
-"leaq (%3,%4,8),%3;"\
+"leaq (%10,%4,8),%10;"\
 INNER_STORE_m1n8(%%zmm9,0)\
 INNER_STORE_m1n8(%%zmm12,8)\
-"leaq (%3,%4,8),%3;"\
+"leaq (%10,%4,8),%10;"\
 INNER_STORE_m1n8(%%zmm10,0)\
 INNER_STORE_m1n8(%%zmm13,8)
-#define INNER_PREF_8x8 \
-"prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\
-"prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\
-"prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\
-"prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\
-"subq %4,%3; subq %4,%3; subq %4,%3;"
+
 #define INNER_TRANS_4x8(c1,c2,c3,c4) \
 "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
 "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
 "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
 "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
 "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
 "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
+
 #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
 INNER_TRANS_4x8(c1,c2,c3,c4)\
 INNER_TRANS_4x8(c5,c6,c7,c8)\
@@ -223,64 +225,69 @@
 "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
 "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
 "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
+
 //%7 for k01(input) only when m=4
 #define INNER_STORE_4x8(c1,c2,c3,c4) \
-"vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
-"vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-"vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
-"vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-"vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
-"vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-"vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
-"vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-"leaq (%3,%4,4),%3;"
+"vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
+"vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+"vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
+"vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+"vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
+"vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+"vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
+"vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+"leaq (%10,%4,4),%10;"
+
 #define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
-"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-"vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
-"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-"vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
-"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-"vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
-"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-"vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
+"vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\
+"vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\
+"vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\
+"vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;"
+
 #define INNER_SAVE_m4n8 \
+"movq %3,%10;"\
 INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
 INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
+
 #define INNER_SAVE_m4n16 \
+"movq %3,%10;"\
 INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
 INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
 INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
 INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
+
 #define INNER_SAVE_m4n24 \
+"movq %3,%10;"\
 INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
 INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
 INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
 INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
 INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
 INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
+
 #define INNER_SAVE_m8n8 \
-INNER_PREF_8x8\
+"movq %3,%10;"\
 INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
 INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
+
 #define INNER_SAVE_m8n16 \
-INNER_PREF_8x8\
+"movq %3,%10;"\
 INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
 INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
-INNER_PREF_8x8\
 INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
 INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
+
 #define INNER_SAVE_m8n24 \
-INNER_PREF_8x8\
+"movq %3,%10;"\
 INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
 INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
-INNER_PREF_8x8\
 INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
 INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
-INNER_PREF_8x8\
 INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
 INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
 
 #define COMPUTE_n8 {\
+b_pref = packed_b_pointer + 8 * K;\
 __asm__ __volatile__(\
 "vbroadcastsd (%9),%%zmm3;"\
 "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
@@ -290,15 +297,15 @@
 INNER_KERNELm8(8)\
 INNER_SAVE_m8n8\
 "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
-"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\
+"addq $64,%3;"\
 "subq $8,%8; cmpq $8,%8; jnb 42221b;"\
 "42222:\n\t"\
 "cmpq $4,%8; jb 42223f;"\
 INNER_INIT_m4n8\
 INNER_KERNELm4(8)\
 INNER_SAVE_m4n8\
 "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\
+"addq $32,%3;"\
 "subq $4,%8;"\
 "42223:\n\t"\
 "cmpq $2,%8; jb 42224f;"\
@@ -318,11 +325,13 @@
 "42225:\n\t"\
 "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
 "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
-:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
 ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
 a_block_pointer -= M * K;\
 }
 #define COMPUTE_n16 {\
+b_pref = packed_b_pointer + 16 * K;\
 __asm__ __volatile__(\
 "vbroadcastsd (%9),%%zmm3;"\
 "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
@@ -332,41 +341,43 @@
 INNER_KERNELm8(16)\
 INNER_SAVE_m8n16\
 "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
-"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
+"addq $64,%3;"\
 "subq $8,%8; cmpq $8,%8; jnb 32221b;"\
 "32222:\n\t"\
 "cmpq $4,%8; jb 32223f;"\
 INNER_INIT_m4n16\
 INNER_KERNELm4(16)\
 INNER_SAVE_m4n16\
 "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
+"addq $32,%3;"\
 "subq $4,%8;"\
 "32223:\n\t"\
 "cmpq $2,%8; jb 32224f;"\
 INNER_INIT_m2n16\
 INNER_KERNELm2(16)\
 INNER_SAVE_m2n16\
 "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\
+"addq $16,%3;"\
 "subq $2,%8;"\
 "32224:\n\t"\
 "cmpq $1,%8; jb 32225f;"\
 INNER_INIT_m1n16\
 INNER_KERNELm1(16)\
 INNER_SAVE_m1n16\
 "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\
+"addq $8,%3;"\
 "32225:\n\t"\
 "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
 "shlq $4,%4;addq %4,%3;shrq $4,%4;"\
 "leaq (%1,%%r12,4),%1;"\
-:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
 ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
 "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
 a_block_pointer -= M * K;\
 }
 #define COMPUTE_n24 {\
+b_pref = packed_b_pointer + 24 * K;\
 __asm__ __volatile__(\
 "vbroadcastsd (%9),%%zmm3;"\
 "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
@@ -376,36 +387,37 @@
 INNER_KERNELm8(24)\
 INNER_SAVE_m8n24\
 "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
-"shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
+"addq $64,%3;"\
 "subq $8,%8; cmpq $8,%8; jnb 22221b;"\
 "22222:\n\t"\
 "cmpq $4,%8; jb 22223f;"\
 INNER_INIT_m4n24\
 INNER_KERNELm4(24)\
 INNER_SAVE_m4n24\
 "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-"shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
+"addq $32,%3;"\
 "subq $4,%8;"\
 "22223:\n\t"\
 "cmpq $2,%8; jb 22224f;"\
 INNER_INIT_m2n24\
 INNER_KERNELm2(24)\
 INNER_SAVE_m2n24\
 "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\
+"addq $16,%3;"\
 "subq $2,%8;"\
 "22224:\n\t"\
 "cmpq $1,%8; jb 22225f;"\
 INNER_INIT_m1n24\
 INNER_KERNELm1(24)\
 INNER_SAVE_m1n24\
 "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\
+"addq $8,%3;"\
 "22225:\n\t"\
 "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
 "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
 "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
-:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
 ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
 "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
 a_block_pointer -= M * K;\
@@ -415,8 +427,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
 if(k==0 || m==0 || ndiv8==0) return;
 int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
 int64_t K = (int64_t)k; int64_t M = (int64_t)m;
-double *a_block_pointer;
-double *c_pointer = c;
+double *a_block_pointer,*b_pref;
+double *c_pointer = c,*c_store = c;
 __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
 BLASLONG ndiv8_count;
 double *packed_b_pointer = packed_b;
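The declaration change above introduces the two scratch pointers behind the new scheme: b_pref, pointed at the next packed-B panel before each COMPUTE_n* block (packed_b_pointer + 8/16/24 * K), and c_store, a copy of c_pointer that the save and prefetch macros advance in place of %3. Both are handed to the asm as additional "+r" operands (%10 and %11). A minimal sketch of that operand-passing pattern in GCC extended asm follows (hypothetical function and variable names, not the real kernel):

/* Illustrative only: passing extra scratch pointers into extended asm the way
   COMPUTE_n8/n16/n24 pass c_store (%10) and b_pref (%11). */
static void prefetch_operand_sketch(double *c, const double *packed_b,
                                    long ldc_in_bytes, long K)
{
    double *c_store = c;                      /* scratch copy of the C pointer */
    const double *b_pref = packed_b + 8 * K;  /* next packed-B panel (n=8 case)*/
    __asm__ __volatile__(
        "prefetcht1 (%0); prefetcht1 63(%0);" /* first C row                   */
        "addq %2,%0;"                         /* step by ldc (in bytes)        */
        "prefetcht1 (%0);"                    /* second C row                  */
        "prefetcht1 (%1); addq $16,%1;"       /* 16 bytes of the next B panel  */
        : "+r"(c_store), "+r"(b_pref)
        : "r"(ldc_in_bytes)
        : "memory");
}

Because the stores now advance c_store instead of %3, the column loops can move c_pointer with a plain addq, which is why the earlier shlq/subq/shrq rewind sequences around %3 were dropped in the hunks above.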
