42
42
/* unit_save_m16n2(c1, c2): asm text that writes one pair of accumulator registers back to C.
 *
 * c1 and c2 are register names; they are stringified into the instruction text, so the macro
 * name must be followed immediately by '(' (with a space there it becomes an object-like macro
 * and '#c1' / '#c2' are no longer stringification operators).
 *
 * Steps:
 *   1. The four unpack instructions regroup c1/c2 into zmm4 and zmm5 (zmm6 and zmm7 are scratch).
 *   2. vfmadd213ps computes zmm4 = zmm0 * zmm4 + mem(%5) and zmm5 = zmm0 * zmm5 + mem(%5 + %3).
 *   3. Both results are stored back to the same two addresses.
 *   4. %5 is advanced by 2 * %3.
 *
 * Per the operand list of the enclosing asm statement, %5 is ctemp and %3 is ldc_in_bytes;
 * zmm0 is loaded there with the broadcast alpha value. */
#define unit_save_m16n2(c1,c2) \
  "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\
  "vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\
  "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;"
47
46
/* Store sequence for the n = 2 case of the m16 tile: copy %2 into %5 (c_pointer into ctemp
 * per the asm operand list), then write back the zmm8/zmm9 pair via unit_save_m16n2. */
#define SAVE_h_m16n2 \
  "movq %2,%5;" \
  unit_save_m16n2(%%zmm8,%%zmm9)
48
47
/* Store sequence for the n = 4 case: everything SAVE_h_m16n2 emits, followed by the write-back
 * of the zmm10/zmm11 pair via unit_save_m16n2. */
#define SAVE_h_m16n4 \
  SAVE_h_m16n2 \
  unit_save_m16n2(%%zmm10,%%zmm11)
54
53
/* SAVE_m16(ndim): select the store sequence SAVE_h_m16n<ndim> by token pasting, then advance %2
 * (c_pointer per the asm operand list) by 64 bytes.
 * The name must be followed immediately by '(' so that this is a function-like macro;
 * otherwise 'ndim' is never substituted and the paste yields the literal SAVE_h_m16nndim. */
#define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;"
55
54
/* COMPUTE_m16(ndim): asm text for one m16 x ndim tile - initialise, run the K loop, store.
 *
 * ndim is pasted into the helper macro names (INIT_m16n<ndim>, KERNEL_k1m16n<ndim>) and
 * stringified into the numeric local labels <ndim>016161/2/3, so the macro name must be
 * followed immediately by '(' to keep it function-like.
 *
 * Operand roles below follow the operand list of the enclosing asm statement:
 * %1 b_pointer, %2 c_pointer, %3 ldc_in_bytes, %4 K counter, %5 ctemp, %8 next_b;
 * r12 = k << 4, r13 = k, r14 = saved b_pointer.
 *
 * Layout:
 *   setup     : reload %4 and %1 from r13/r14, set r15 = %1 + 3 * r12, point %5 at %2.
 *   main loop : entered only while %4 >= 16; four KERNEL_k1 steps per pass. Each pass issues
 *               prefetcht1 on (%5)/63(%5) and advances %5 by %3, then prefetcht1 on (%8) and
 *               advances %8 by ndim bytes.
 *   tail loop : one KERNEL_k1 step per pass until %4 reaches zero. Each pass issues prefetcht0
 *               on (%5) and (%5,%3,1) (offsets 0 and 63) and advances %5 by 2 * %3.
 *   epilogue  : prefetcht0 on (r14) and 64(r14), then SAVE_m16(ndim). */
#define COMPUTE_m16(ndim) \
  INIT_m16n##ndim\
  "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5; "\
  "cmpq $16 ,%4; jb "#ndim"016162f;"\
  /* main loop: 4 k-steps per pass, with prefetcht1 on %5 and %8 */\
  #ndim"016161:\n\t"\
  KERNEL_k1m16n##ndim\
  KERNEL_k1m16n##ndim\
  "prefetcht1 (%5); prefetcht1 63(%5); addq %3,%5;"\
  KERNEL_k1m16n##ndim\
  KERNEL_k1m16n##ndim\
  "prefetcht1 (%8); addq $"#ndim",%8;"\
  "subq $4,%4; cmpq $16,%4; jnb "#ndim"016161b;"\
  /* restart the %5 prefetch cursor from %2 for the tail loop */\
  "movq %2,%5;"\
  /* tail loop: 1 k-step per pass, with prefetcht0 on %5 */\
  #ndim"016162:\n\t"\
  "testq %4,%4; jz "#ndim"016163f;"\
  "prefetcht0 (%5); prefetcht0 63(%5); prefetcht0 (%5,%3,1); prefetcht0 63(%5,%3,1);"\
  KERNEL_k1m16n##ndim\
  "leaq (%5,%3,2),%5;"\
  "decq %4; jmp "#ndim"016162b;"\
  /* epilogue: prefetch at r14, then store the tile */\
  #ndim"016163:\n\t"\
  "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
  SAVE_m16(ndim)
71
76
72
77
/* m = 8 */ /* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
350
355
/* COMPUTE_m1(ndim): dispatch to the per-width macro COMPUTE_m1_n<ndim> by token pasting.
 * The name must be followed immediately by '(' so that this is a function-like macro;
 * otherwise 'ndim' is never substituted and the paste yields the literal COMPUTE_m1_nndim. */
#define COMPUTE_m1(ndim) COMPUTE_m1_n##ndim
351
356
352
357
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */
353
- /* %6 = "+r"(&alpha), %7 = "+r"(M) */
358
+ /* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */
354
359
/* r11 = m (const), r12 = k << 4 (const), r13 = k (const), r14 = b_head_pos (const), r15 = %1 + 3 * r12 */
355
360
356
361
#define COMPUTE (ndim ) {\
362
+ next_b = b_pointer + ndim * K;\
357
363
__asm__ __volatile__(\
358
364
"vbroadcastss (%6),%%zmm0;"\
359
365
"movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\
378
384
COMPUTE_m1(ndim)\
379
385
"33105"#ndim":\n\t"\
380
386
"movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
381
- :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M)\
387
+ :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M),"+r"(next_b) \
382
388
::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
383
389
"zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\
384
390
"cc","memory");\
@@ -391,7 +397,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
391
397
int64_t ldc_in_bytes = (int64_t )LDC * sizeof (float );float ALPHA = alpha ;
392
398
int64_t M = (int64_t )m , K = (int64_t )k ;
393
399
BLASLONG n_count = n ;
394
- float * a_pointer = A ,* b_pointer = B ,* c_pointer = C ,* ctemp = C ,* alp = & ALPHA ;
400
+ float * a_pointer = A ,* b_pointer = B ,* c_pointer = C ,* ctemp = C ,* alp = & ALPHA , * next_b = B ;
395
401
for (;n_count > 23 ;n_count -= 24 ) COMPUTE (24 )
396
402
for (;n_count > 19 ;n_count -= 20 ) COMPUTE (20 )
397
403
for (;n_count > 15 ;n_count -= 16 ) COMPUTE (16 )
0 commit comments