Skip to content

Commit 700fe5b

Browse files
authored
Add files via upload
1 parent bb2729c commit 700fe5b

File tree

1 file changed

+25
-13
lines changed

1 file changed

+25
-13
lines changed

kernel/x86_64/zgemm3m_kernel_4x4_haswell.c

Lines changed: 25 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -18,13 +18,26 @@
1818
#define KERNEL_h_k1m4n4 \
1919
KERNEL_h_k1m4n2 "vbroadcastf128 16(%1),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm6; vfmadd231pd %%ymm2,%%ymm3,%%ymm7;"
2020
#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $32,%1;"
21-
#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \
22-
"vbroadcastf128 ("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\
23-
"vbroadcastf128 16("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";"
24-
#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1)
21+
#define unit_kernel_k1m4n4(c1,c2,c3,c4,off1,off2,...) \
22+
"vbroadcastf128 "#off1"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\
23+
"vbroadcastf128 "#off2"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";"
24+
#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,16,%1,%%r12,1)
2525
#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $32,%1;"
26-
#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2)
26+
#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,16,%1,%%r12,2)
2727
#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $32,%1;"
28+
#define KERNEL_k2m4n1 KERNEL_k1m4n1 KERNEL_k1m4n1
29+
#define KERNEL_k2m4n2 KERNEL_k1m4n2 KERNEL_k1m4n2
30+
#define KERNEL_k2m4n4 KERNEL_k1m4n4 KERNEL_k1m4n4
31+
#define KERNEL_k2m4n8 KERNEL_k1m4n8 KERNEL_k1m4n8
32+
#define KERNEL_k2m4n12 \
33+
"vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;"\
34+
unit_kernel_k1m4n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,16,%1)\
35+
unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,16,%1,%%r12,1)\
36+
unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,16,%1,%%r12,2)\
37+
"vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2; prefetcht0 512(%0); addq $64,%0;"\
38+
unit_kernel_k1m4n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,32,48,%1)\
39+
unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,32,48,%1,%%r12,1)\
40+
unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,32,48,%1,%%r12,2) "addq $64,%1;"
2841
#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
2942
#define INIT_m4n2 INIT_m4n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
3043
#define INIT_m4n4 INIT_m4n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
@@ -53,18 +66,17 @@
5366
"cmpq $24,%4; jb "#ndim"004042f;"\
5467
#ndim"004041:\n\t"\
5568
"cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\
56-
"prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
57-
"prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
58-
"prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\
59-
"prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
60-
"prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
61-
"prefetcht1 (%8); addq $32,%8;"\
62-
"subq $8,%4; cmpq $24,%4; jnb "#ndim"004041b;"\
69+
KERNEL_k2m4n##ndim KERNEL_k2m4n##ndim\
70+
"prefetcht1 (%5); subq $63,%5;"\
71+
KERNEL_k2m4n##ndim KERNEL_k2m4n##ndim\
72+
"addq %%r15,%5; prefetcht1 (%8); addq $32,%8;"\
73+
"subq $8,%4; cmpq $16,%4; jnb "#ndim"004041b;"\
6374
"movq %2,%5;"\
6475
#ndim"004042:\n\t"\
6576
"testq %4,%4; jz "#ndim"004043f;"\
66-
"prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\
77+
"prefetcht0 (%5); prefetcht0 63(%5);"\
6778
KERNEL_k1m4n##ndim\
79+
"prefetcht0 (%5,%3,4); prefetcht0 63(%5,%3,4); addq %3,%5;"\
6880
"decq %4; jmp "#ndim"004042b;"\
6981
#ndim"004043:\n\t"\
7082
"prefetcht0 (%%r14); prefetcht0 64(%%r14);"\

0 commit comments

Comments
 (0)