"vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
#define INNER_KERNEL_k1m1n16 \
- "prefetcht0 384(%1); prefetcht0 448(%1);"\
- "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; addq $128,%1;"\
+ "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1);"\
+ "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; addq $64,%1;"\
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;"
#define INNER_KERNEL_k1m2n16 \
"vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
#define INNER_KERNEL_k1m1n24 \
- "prefetcht0 384(%1); prefetcht0 448(%1); prefetcht0 512(%1);"\
- "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; vmovupd 128(%1),%%zmm7; addq $192,%1;"\
+ "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1); prefetcht0 128(%1,%%r12,2);"\
+ "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; vmovupd (%1,%%r12,2),%%zmm7; addq $64,%1;"\
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;"
#define INNER_KERNEL_k1m2n24 \
#define COMPUTE_n8 {\
__asm__ __volatile__(\
- "movq %8,%%r14;movq %2,%%r13;"\
+ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
"cmpq $8,%8; jb 42222f;"\
"42221:\n\t"\
INNER_INIT_m8n8\
INNER_KERNELm8(8)\
INNER_SAVE_m8n8\
- "movq %%r13,%2; shlq $6,%2; subq %2,%1;shrq $6,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 42221b;"\
"42222:\n\t"\
"cmpq $4,%8; jb 42223f;"\
INNER_INIT_m4n8\
INNER_KERNELm4(8)\
INNER_SAVE_m4n8\
- "movq %%r13,%2; shlq $6,%2; subq %2,%1;shrq $6,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\
"subq $4,%8;"\
"42223:\n\t"\
"cmpq $2,%8; jb 42224f;"\
INNER_INIT_m2n8\
INNER_KERNELm2(8)\
INNER_SAVE_m2n8\
- "movq %%r13,%2; shlq $6,%2; subq %2,%1;shrq $6,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"addq $16,%3;"\
"subq $2,%8;"\
"42224:\n\t"\
"cmpq $1,%8; jb 42225f;"\
INNER_INIT_m1n8\
INNER_KERNELm1(8)\
INNER_SAVE_m1n8\
- "movq %%r13,%2; shlq $6,%2; subq %2,%1;shrq $6,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"addq $8,%3;"\
"42225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
}
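The repeated change inside COMPUTE_n8/n16/n24 is the B-pointer rewind after each m-block: instead of rebuilding the panel size from %2 with shlq/shrq around every subtraction, it is computed once into %%r12 and simply subtracted. A minimal C sketch, with assumed names (rewind_b, panel_bytes):

    /* Sketch of the rewind now done by "subq %%r12,%1"; %%r12 is set once per
       COMPUTE_* macro as K << 6 (K rows * 8 doubles * 8 bytes per packed panel). */
    static inline const double *rewind_b(const double *b_panel, long K)
    {
        long panel_bytes = (long)K << 6;     /* movq %2,%%r12; shlq $6,%%r12 */
        return (const double *)((const char *)b_panel - panel_bytes);
    }
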
#define COMPUTE_n16 {\
__asm__ __volatile__(\
- "movq %8,%%r14;movq %2,%%r13;"\
+ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
"cmpq $8,%8; jb 32222f;"\
"32221:\n\t"\
INNER_INIT_m8n16\
INNER_KERNELm8(16)\
INNER_SAVE_m8n16\
- "movq %%r13,%2; shlq $7,%2; subq %2,%1;shrq $7,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 32221b;"\
"32222:\n\t"\
"cmpq $4,%8; jb 32223f;"\
INNER_INIT_m4n16\
INNER_KERNELm4(16)\
INNER_SAVE_m4n16\
- "movq %%r13,%2; shlq $7,%2; subq %2,%1;shrq $7,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
"subq $4,%8;"\
"32223:\n\t"\
"cmpq $2,%8; jb 32224f;"\
INNER_INIT_m2n16\
INNER_KERNELm2(16)\
INNER_SAVE_m2n16\
- "movq %%r13,%2; shlq $7,%2; subq %2,%1;shrq $7,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\
"subq $2,%8;"\
"32224:\n\t"\
"cmpq $1,%8; jb 32225f;"\
INNER_INIT_m1n16\
INNER_KERNELm1(16)\
INNER_SAVE_m1n16\
- "movq %%r13,%2; shlq $7,%2; subq %2,%1;shrq $7,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\
"32225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $4,%4;addq %4,%3;shrq $4,%4;"\
- :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
+ "leaq (%1,%%r12,2),%1;"\
+ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
- "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r13","r14");\
+ "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
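The operand-list change matters as much as the loads: %1 is now bound "+r" to the live packed_b_pointer rather than to the b_scratch copy, so the closing "leaq (%1,%%r12,2),%1" leaves it advanced past the two consumed panels for the caller. A simplified, assumed illustration of that binding (advance_b_n16 is not a name from the source):

    /* Assumed illustration: %0 is bound "+r" to the packed-B pointer, and the
       leaq moves it past two K x 8 panels, mirroring the macro's last statement. */
    static inline double *advance_b_n16(double *b, long K)
    {
        long stride = (long)K << 6;                   /* bytes in one packed panel */
        __asm__ __volatile__("leaq (%0,%1,2),%0" : "+r"(b) : "r"(stride));
        return b;
    }
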
#define COMPUTE_n24 {\
__asm__ __volatile__(\
- "movq %8,%%r14;movq %9,%%r15;movq %2,%%r13;"\
+ "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
"cmpq $8,%8; jb 22222f;"\
"22221:\n\t"\
INNER_INIT_m8n24\
- "prefetcht2 (%%r15); prefetcht2 64(%%r15);"\
INNER_KERNELm8(24)\
- "prefetcht2 128(%%r15); prefetcht2 192(%%r15);"\
INNER_SAVE_m8n24\
- "prefetcht2 256(%%r15); prefetcht2 320(%%r15); addq $384,%%r15;"\
- "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
"subq $8,%8; cmpq $8,%8; jnb 22221b;"\
"22222:\n\t"\
"cmpq $4,%8; jb 22223f;"\
INNER_INIT_m4n24\
INNER_KERNELm4(24)\
INNER_SAVE_m4n24\
- "movq %%r13,%2; shlq $6,%2; subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
"subq $4,%8;"\
"22223:\n\t"\
"cmpq $2,%8; jb 22224f;"\
INNER_INIT_m2n24\
INNER_KERNELm2(24)\
INNER_SAVE_m2n24\
- "movq %%r13,%2; shlq $6,%2; subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\
"subq $2,%8;"\
"22224:\n\t"\
"cmpq $1,%8; jb 22225f;"\
INNER_INIT_m1n24\
INNER_KERNELm1(24)\
INNER_SAVE_m1n24\
- "movq %%r13,%2; shlq $6,%2; subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+ "movq %%r13,%2; subq %%r12,%1;"\
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\
"22225:\n\t"\
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
"shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
- :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),\
- "+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(packed_b_pointer)\
+ "leaq (%1,%%r12,2),%1; addq %%r12,%1;"\
+ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
- "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r13","r14","r15");\
+ "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
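COMPUTE_n24 ends the same way but also steps over the third panel ("leaq (%1,%%r12,2),%1; addq %%r12,%1"), i.e. packed_b_pointer moves forward by 3*K*64 bytes, the advance the deleted copy loop used to perform while filling b_scratch. A small sketch with an assumed helper name:

    /* Sketch: the final pointer bump of COMPUTE_n24, equal to three K x 8 panels.
       Matches "leaq (%1,%%r12,2),%1; addq %%r12,%1" with %%r12 = K*64 bytes. */
    static inline double *advance_b_n24(double *packed_b_pointer, long K)
    {
        return (double *)((char *)packed_b_pointer + 3 * ((long)K << 6));
    }
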
- static void KERNEL_MAIN(double * packed_a, double * packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double * c){//icopy=8,ocopy=8
+ static void __attribute__ ((noinline)) KERNEL_MAIN(double * packed_a, double * packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double * c){//icopy=8,ocopy=8
//perform C += A<pack> B<pack>
if (k == 0 || m == 0 || ndiv8 == 0) return;
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
@@ -429,38 +427,17 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
double *c_pointer = c;
__mmask16 k01 = 0x00f0, k02 = 0x000f, k03 = 0x0033;
BLASLONG ndiv8_count;
- double *b_scratch;
- posix_memalign(&b_scratch,64,192*k);
double *packed_b_pointer = packed_b;
a_block_pointer = packed_a;
for (ndiv8_count = ndiv8; ndiv8_count > 2; ndiv8_count -= 3){
- __asm__ __volatile__ (
- "testq %2,%2; jz 100002f;movq %2,%%r13;shlq $6,%%r13;"
- "100001:\n\t"
- "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; vmovupd (%0,%%r13,2),%%zmm7; addq $64,%0;"
- "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); vmovupd %%zmm7,128(%1); addq $192,%1;"
- "decq %2; testq %2,%2; jnz 100001b;"
- "100002:\n\t"
- "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,2),%0;subq %%r13,%1;subq %%r13,%1;subq %%r13,%1;"
- :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6","zmm7");
COMPUTE_n24
}
for (; ndiv8_count > 1; ndiv8_count -= 2){
- __asm__ __volatile__ (
- "testq %2,%2; jz 1000002f;movq %2,%%r13;shlq $6,%%r13;"
- "1000001:\n\t"
- "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; addq $64,%0;"
- "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); addq $128,%1;"
- "decq %2; testq %2,%2; jnz 1000001b;"
- "1000002:\n\t"
- "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,1),%0;subq %%r13,%1;subq %%r13,%1;"
- :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6");
COMPUTE_n16
}
if (ndiv8_count > 0){
COMPUTE_n8
}
- free(b_scratch);b_scratch = NULL;
}
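For context, a scalar model of the copy step this commit removes: the old path allocated a 192*k-byte b_scratch with posix_memalign and interleaved three K x 8 packed-B panels into it before each COMPUTE_n24 (two panels for n16), then freed it at the end; the new kernels read the panels in place via %%r12. The function and variable names below are illustrative.

    /* Model of the deleted n24 copy-to-scratch loop (old b_scratch path).
       Layout produced: row i of scratch = [panel0 row i | panel1 row i | panel2 row i]. */
    static void pack_b_scratch_n24_model(const double *packed_b, double *b_scratch, long K)
    {
        long panel_doubles = K * 8;             /* %%r13 = K*64 bytes in the old asm */
        for (long i = 0; i < K; ++i)
            for (int p = 0; p < 3; ++p)         /* the zmm5/zmm6/zmm7 loads          */
                for (int j = 0; j < 8; ++j)
                    b_scratch[i * 24 + p * 8 + j] =
                        packed_b[p * panel_doubles + i * 8 + j];
    }
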
/* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */