"decq %2;cmpq $1,%2;jnb "#nn"01b;"\
#nn"00:\n\t"

+ /* %10 for prefetch of C elements before storage; %4 = ldc (in bytes), %11 for prefetch of next B block */
#define INNER_KERNELm8(nn) \
- "cmpq $8,%2;jb "#nn"001f;"\
+ "movq %3,%10; cmpq $16,%2;jb "#nn"001f;"\
#nn"008:\n\t"\
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+ "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
- INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
- INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
- INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
- INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
- "subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\
+ "prefetcht1 (%11); addq $16,%11;"\
+ "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
+ "movq %3,%10;"\
#nn"001:\n\t"\
"cmpq $1,%2;jb "#nn"000f;"\
+ "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
"decq %2;jmp "#nn"001b;"\
""#nn"000:\n\t"

#define INNER_STORE_m1n8(c1,disp) \
"kxnorw %%k1,%%k1,%%k1;"\
- "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\
+ "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\
"vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
"kxnorw %%k1,%%k1,%%k1;"\
- "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};"
+ "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};"

#define INNER_SAVE_m1n8 \
+ "movq %3,%10;"\
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)

#define INNER_SAVE_m1n16 \
INNER_SAVE_m1n8\
- "leaq (%3,%4,8),%3;"\
+ "leaq (%10,%4,8),%10;"\
INNER_STORE_m1n8(%%zmm9,0)

#define INNER_SAVE_m1n24 \
INNER_SAVE_m1n16\
- "leaq (%3,%4,8),%3;"\
+ "leaq (%10,%4,8),%10;"\
INNER_STORE_m1n8(%%zmm10,0)

#define INNER_SAVE_m2n8 \
+ "movq %3,%10;"\
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)\
INNER_STORE_m1n8(%%zmm9,8)

#define INNER_SAVE_m2n16 \
+ "movq %3,%10;"\
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)\
INNER_STORE_m1n8(%%zmm10,8)\
- "leaq (%3,%4,8),%3;"\
+ "leaq (%10,%4,8),%10;"\
INNER_STORE_m1n8(%%zmm9,0)\
INNER_STORE_m1n8(%%zmm11,8)
+
#define INNER_SAVE_m2n24 \
+ "movq %3,%10;"\
INNER_SETINDEX\
INNER_STORE_m1n8(%%zmm8,0)\
INNER_STORE_m1n8(%%zmm11,8)\
- "leaq (%3,%4,8),%3;"\
+ "leaq (%10,%4,8),%10;"\
INNER_STORE_m1n8(%%zmm9,0)\
INNER_STORE_m1n8(%%zmm12,8)\
- "leaq (%3,%4,8),%3;"\
+ "leaq (%10,%4,8),%10;"\
INNER_STORE_m1n8(%%zmm10,0)\
INNER_STORE_m1n8(%%zmm13,8)
- #define INNER_PREF_8x8 \
- "prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\
- "prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\
- "prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\
- "prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\
- "subq %4,%3; subq %4,%3; subq %4,%3;"
+
#define INNER_TRANS_4x8(c1,c2,c3,c4) \
"vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
"vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
"vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
"vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
"vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
"vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
+
#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
INNER_TRANS_4x8(c1,c2,c3,c4)\
INNER_TRANS_4x8(c5,c6,c7,c8)\
"vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
"vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
"vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
+
// %7 for k01 (input) only when m=4
#define INNER_STORE_4x8(c1,c2,c3,c4) \
- "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
- "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
- "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
- "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
- "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
- "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
- "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
- "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
- "leaq (%3,%4,4),%3;"
+ "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
+ "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+ "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
+ "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+ "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
+ "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+ "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
+ "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+ "leaq (%10,%4,4),%10;"
+
#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
- "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
- "vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
- "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
- "vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
- "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
- "vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
- "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
- "vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
+ "vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\
+ "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\
+ "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\
+ "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;"
+
#define INNER_SAVE_m4n8 \
+ "movq %3,%10;"\
INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
+
#define INNER_SAVE_m4n16 \
+ "movq %3,%10;"\
INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
+
#define INNER_SAVE_m4n24 \
+ "movq %3,%10;"\
INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
+
#define INNER_SAVE_m8n8 \
- INNER_PREF_8x8 \
+ "movq %3,%10;" \
INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
+
#define INNER_SAVE_m8n16 \
- INNER_PREF_8x8 \
+ "movq %3,%10;" \
INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
- INNER_PREF_8x8\
INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
+
#define INNER_SAVE_m8n24 \
- INNER_PREF_8x8 \
+ "movq %3,%10;" \
INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
- INNER_PREF_8x8\
INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
- INNER_PREF_8x8\
INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)

#define COMPUTE_n8 {\
+ b_pref = packed_b_pointer + 8 * K;\
__asm__ __volatile__(\
"vbroadcastsd (%9),%%zmm3;"\
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
INNER_KERNELm8(8)\
INNER_SAVE_m8n8\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
- "shlq $3,%4;subq %4,%3;shrq $3,%4; addq $64,%3;"\
+ "addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 42221b;"\
"42222:\n\t"\
"cmpq $4,%8; jb 42223f;"\
INNER_INIT_m4n8\
INNER_KERNELm4(8)\
INNER_SAVE_m4n8\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
- "shlq $3,%4;subq %4,%3;shrq $3,%4; addq $32,%3;"\
+ "addq $32,%3;"\
"subq $4,%8;"\
"42223:\n\t"\
"cmpq $2,%8; jb 42224f;"\
"42225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $3,%4;addq %4,%3;shrq $3,%4;"\
- :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+ "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
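COMPUTE_n8 (and the n16/n24 variants below) now binds two extra operands: %10 is c_store, a disposable copy of the C pointer that the save and prefetch code may advance freely, which is why the old shlq/subq/shrq rewinds of %3 collapse to plain addq adjustments; %11 is b_pref, seeded just past the current packed B panel so the k-loop can warm up the next one. A small sketch of that setup, using the host variable names from KERNEL_MAIN and a hypothetical nwidth parameter (8, 16 or 24), is:

#include <stdint.h>

/* Sketch of the new operand setup in COMPUTE_n8/n16/n24, using the host
 * variable names from KERNEL_MAIN; nwidth (8, 16 or 24) is hypothetical. */
void setup_pref_pointers(double *packed_b_pointer, double *c_pointer,
                         int64_t K, int nwidth,
                         double **b_pref, double **c_store)
{
    /* The current packed B panel holds nwidth*K doubles, so the next panel
     * starts right after it; that is the address %11 walks through with
     * "prefetcht1 (%11); addq $16,%11;" in the k-loop. */
    *b_pref = packed_b_pointer + (int64_t)nwidth * K;

    /* %10 is a disposable copy of the C pointer: the save/prefetch code
     * advances it freely while %3 (c_pointer) only moves forward by the
     * width of the tile just stored (addq $64/$32/$16/$8,%3). */
    *c_store = c_pointer;
}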
#define COMPUTE_n16 {\
+ b_pref = packed_b_pointer + 16 * K;\
__asm__ __volatile__(\
"vbroadcastsd (%9),%%zmm3;"\
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
INNER_KERNELm8(16)\
INNER_SAVE_m8n16\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
- "shlq $4,%4;subq %4,%3;shrq $4,%4; addq $64,%3;"\
+ "addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 32221b;"\
"32222:\n\t"\
"cmpq $4,%8; jb 32223f;"\
INNER_INIT_m4n16\
INNER_KERNELm4(16)\
INNER_SAVE_m4n16\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
- "shlq $4,%4;subq %4,%3;shrq $4,%4; addq $32,%3;"\
+ "addq $32,%3;"\
"subq $4,%8;"\
"32223:\n\t"\
"cmpq $2,%8; jb 32224f;"\
INNER_INIT_m2n16\
INNER_KERNELm2(16)\
INNER_SAVE_m2n16\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
- "shlq $3,%4;subq %4,%3;shrq $3,%4; addq $16,%3;"\
+ "addq $16,%3;"\
"subq $2,%8;"\
"32224:\n\t"\
"cmpq $1,%8; jb 32225f;"\
INNER_INIT_m1n16\
INNER_KERNELm1(16)\
INNER_SAVE_m1n16\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
- "shlq $3,%4;subq %4,%3;shrq $3,%4; addq $8,%3;"\
+ "addq $8,%3;"\
"32225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $4,%4;addq %4,%3;shrq $4,%4;"\
"leaq (%1,%%r12,4),%1;"\
- :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+ "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
#define COMPUTE_n24 {\
+ b_pref = packed_b_pointer + 24 * K;\
__asm__ __volatile__(\
"vbroadcastsd (%9),%%zmm3;"\
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
INNER_KERNELm8(24)\
INNER_SAVE_m8n24\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
- "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4; addq $64,%3;"\
+ "addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 22221b;"\
"22222:\n\t"\
"cmpq $4,%8; jb 22223f;"\
INNER_INIT_m4n24\
INNER_KERNELm4(24)\
INNER_SAVE_m4n24\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
- "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4; addq $32,%3;"\
+ "addq $32,%3;"\
"subq $4,%8;"\
"22223:\n\t"\
"cmpq $2,%8; jb 22224f;"\
INNER_INIT_m2n24\
INNER_KERNELm2(24)\
INNER_SAVE_m2n24\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
- "shlq $4,%4;subq %4,%3;shrq $4,%4; addq $16,%3;"\
+ "addq $16,%3;"\
"subq $2,%8;"\
"22224:\n\t"\
"cmpq $1,%8; jb 22225f;"\
INNER_INIT_m1n24\
INNER_KERNELm1(24)\
INNER_SAVE_m1n24\
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
- "shlq $4,%4;subq %4,%3;shrq $4,%4; addq $8,%3;"\
+ "addq $8,%3;"\
"22225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
"leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
- :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+ "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
@@ -415,8 +427,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
if (k == 0 || m == 0 || ndiv8 == 0) return;
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
int64_t K = (int64_t)k; int64_t M = (int64_t)m;
- double *a_block_pointer;
- double *c_pointer = c;
+ double *a_block_pointer, *b_pref;
+ double *c_pointer = c, *c_store = c;
__mmask16 k01 = 0x00f0, k02 = 0x000f, k03 = 0x0033;
BLASLONG ndiv8_count;
double *packed_b_pointer = packed_b;