|
/* KERNEL_h_k1m4n4: the KERNEL_h_k1m4n2 sequence (defined outside this excerpt) followed by one
 * extra step: broadcast the 16 bytes at 16(%1) into both 128-bit halves of ymm3, then
 * ymm6 += ymm1*ymm3 and ymm7 += ymm2*ymm3. The instructions added here do not modify %1.
 * NOTE(review): %1 is presumably the packed-B pointer -- confirm against the asm operand list. */
18 | 18 | #define KERNEL_h_k1m4n4 \
|
19 | 19 | KERNEL_h_k1m4n2 "vbroadcastf128 16(%1),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm6; vfmadd231pd %%ymm2,%%ymm3,%%ymm7;"
|
/* KERNEL_k1m4n4: KERNEL_h_k1m4n4 followed by advancing operand %1 by 32 bytes. */
20 | 20 | #define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $32,%1;"
|
/* unit_kernel_k1m4n4 / KERNEL_h_k1m4n8 -- rows marked '-' are the removed definitions, rows
 * marked '+' are their replacements.
 *
 * Removed form: unit_kernel_k1m4n4(c1,c2,c3,c4,<address operands>) with the two load
 * displacements fixed at 0 and 16.
 * Added form:   unit_kernel_k1m4n4(c1,c2,c3,c4,off1,off2,<address operands>) where off1/off2 are
 * stringified and pasted in front of the parenthesised address as the displacements.
 *
 * In both forms the variadic tail is stringified as the memory operand (e.g. %1,%%r12,1 becomes
 * "(%1,%%r12,1)"), and the emitted text is, twice over: vbroadcastf128 from that address into ymm3,
 * then two vfmadd231pd accumulating ymm1*ymm3 and ymm2*ymm3 into c1/c2 (first load) and c3/c4
 * (second load).
 *
 * KERNEL_h_k1m4n8 = KERNEL_h_k1m4n4 plus one unit kernel accumulating into ymm8..ymm11 from
 * (%1,%%r12,1); the added version passes displacements 0 and 16, matching the values that were
 * hard-coded in the removed macro. */
21 |
| -#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ |
22 |
| - "vbroadcastf128 ("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\ |
23 |
| - "vbroadcastf128 16("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";" |
24 |
| -#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1) |
| 21 | +#define unit_kernel_k1m4n4(c1,c2,c3,c4,off1,off2,...) \ |
| 22 | + "vbroadcastf128 "#off1"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\ |
| 23 | + "vbroadcastf128 "#off2"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";" |
| 24 | +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,16,%1,%%r12,1) |
/* KERNEL_k1m4n8: KERNEL_h_k1m4n8 followed by advancing operand %1 by 32 bytes. */
25 | 25 | #define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $32,%1;"
|
/* KERNEL_h_k1m4n12: KERNEL_h_k1m4n8 plus one unit kernel accumulating into ymm12..ymm15 from
 * (%1,%%r12,2). The '-' row is the removed call (no displacement arguments); the '+' row is its
 * replacement, passing displacements 0 and 16 to the new unit_kernel_k1m4n4 signature.
 * KERNEL_k1m4n12 appends a 32-byte advance of operand %1. */
26 |
| -#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2) |
| 26 | +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,16,%1,%%r12,2) |
27 | 27 | #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $32,%1;"
|
/* KERNEL_k2m4n{1,2,4,8} (added rows): each is simply the matching KERNEL_k1m4n* macro emitted
 * twice in a row. KERNEL_k1m4n1 and KERNEL_k1m4n2 are defined outside this excerpt. */
| 28 | +#define KERNEL_k2m4n1 KERNEL_k1m4n1 KERNEL_k1m4n1 |
| 29 | +#define KERNEL_k2m4n2 KERNEL_k1m4n2 KERNEL_k1m4n2 |
| 30 | +#define KERNEL_k2m4n4 KERNEL_k1m4n4 KERNEL_k1m4n4 |
| 31 | +#define KERNEL_k2m4n8 KERNEL_k1m4n8 KERNEL_k1m4n8 |
/* KERNEL_k2m4n12 (added rows): two steps written out explicitly instead of repeating
 * KERNEL_k1m4n12.
 *   Step 1: vmovddup loads from (%0) and 8(%0) into ymm1/ymm2, then three unit kernels read at
 *           displacements 0/16 from (%1), (%1,%%r12,1) and (%1,%%r12,2), accumulating into
 *           ymm4..ymm7, ymm8..ymm11 and ymm12..ymm15 respectively.
 *   Step 2: vmovddup loads from 32(%0) and 40(%0), a prefetcht0 of 512(%0), then %0 += 64; the same
 *           three unit kernels run again with displacements 32/48.
 * Operand %1 is advanced once, by 64 bytes, at the very end.
 * NOTE(review): %0/%1 look like the packed A/B panel pointers -- confirm against the asm
 * operand list, which is not visible in this excerpt. */
| 32 | +#define KERNEL_k2m4n12 \ |
| 33 | + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;"\ |
| 34 | + unit_kernel_k1m4n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,16,%1)\ |
| 35 | + unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,16,%1,%%r12,1)\ |
| 36 | + unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,16,%1,%%r12,2)\ |
| 37 | + "vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2; prefetcht0 512(%0); addq $64,%0;"\ |
| 38 | + unit_kernel_k1m4n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,32,48,%1)\ |
| 39 | + unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,32,48,%1,%%r12,1)\ |
| 40 | + unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,32,48,%1,%%r12,2) "addq $64,%1;" |
/* INIT_m4n{1,2,4}: zero the accumulator registers by XOR-ing each with itself. Each macro extends
 * the previous one: n1 clears ymm4, n2 additionally clears ymm5, n4 additionally clears ymm6 and
 * ymm7. */
28 | 41 | #define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
|
29 | 42 | #define INIT_m4n2 INIT_m4n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
|
30 | 43 | #define INIT_m4n4 INIT_m4n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
|
|
53 | 66 | "cmpq $24,%4; jb "#ndim"004042f;"\
|
54 | 67 | #ndim"004041:\n\t"\
|
55 | 68 | "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\
|
56 |
| - "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ |
57 |
| - "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ |
58 |
| - "prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\ |
59 |
| - "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ |
60 |
| - "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ |
61 |
| - "prefetcht1 (%8); addq $32,%8;"\ |
62 |
| - "subq $8,%4; cmpq $24,%4; jnb "#ndim"004041b;"\ |
| 69 | + KERNEL_k2m4n##ndim KERNEL_k2m4n##ndim\ |
| 70 | + "prefetcht1 (%5); subq $63,%5;"\ |
| 71 | + KERNEL_k2m4n##ndim KERNEL_k2m4n##ndim\ |
| 72 | + "addq %%r15,%5; prefetcht1 (%8); addq $32,%8;"\ |
| 73 | + "subq $8,%4; cmpq $16,%4; jnb "#ndim"004041b;"\ |
63 | 74 | "movq %2,%5;"\
|
64 | 75 | #ndim"004042:\n\t"\
|
65 | 76 | "testq %4,%4; jz "#ndim"004043f;"\
|
66 |
| - "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\ |
| 77 | + "prefetcht0 (%5); prefetcht0 63(%5);"\ |
67 | 78 | KERNEL_k1m4n##ndim\
|
| 79 | + "prefetcht0 (%5,%3,4); prefetcht0 63(%5,%3,4); addq %3,%5;"\ |
68 | 80 | "decq %4; jmp "#ndim"004042b;"\
|
69 | 81 | #ndim"004043:\n\t"\
|
70 | 82 | "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
|
|
0 commit comments