|
| 1 | +#include "common.h" |
| 2 | +#include <stdint.h> |
| 3 | + |
| 4 | +/* recommended settings: GEMM_P = 256, GEMM_Q = 256 */ |
| 5 | + |
| 6 | +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* both letters are N or T: A_CONJ = 0 and B_CONJ = 0 */ |
| 7 | + #define A_CONJ 0 |
| 8 | + #define B_CONJ 0 |
| 9 | +#endif |
| 10 | +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* only the first letter is R or C: A_CONJ = 1, B_CONJ = 0 */ |
| 11 | + #define A_CONJ 1 |
| 12 | + #define B_CONJ 0 |
| 13 | +#endif |
| 14 | +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* only the second letter is R or C: A_CONJ = 0, B_CONJ = 1 */ |
| 15 | + #define A_CONJ 0 |
| 16 | + #define B_CONJ 1 |
| 17 | +#endif |
| 18 | +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* both letters are R or C: A_CONJ = 1 and B_CONJ = 1 */ |
| 19 | + #define A_CONJ 1 |
| 20 | + #define B_CONJ 1 |
| 21 | +#endif |
| 22 | + |
| 23 | +/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */ |
| 24 | +/* r11 = m, r12 = k << 4, r13 = k, r14 = b_head, r15 = temp */ |
| 25 | + |
| 26 | +/* m=8, ymm 0-3 temp, ymm 4-15 acc. acc_m4n1_exp issues two FMAs that share multiplier b2: ar*b2 is added to cl, and ai*b2 is added to cr (subtracted from cr when A_CONJ != B_CONJ). acc_m8n1_con issues two alternating add/sub FMAs that share multiplier b1: vfmaddsub231ps when A_CONJ == B_CONJ, vfmsubadd231ps otherwise. Parameter names suggest ar/ai hold real/imaginary parts of A - presumably interleaved complex data, confirm against the packing routine. */ |
| 27 | +#if A_CONJ == B_CONJ |
| 28 | + #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" |
| 29 | + #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmaddsub231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" |
| 30 | +#else |
| 31 | + #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" |
| 32 | + #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmsubadd231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" |
| 33 | +#endif |
| 34 | +/* expanded accumulators for m8n1 and m8n2: one k step each. Every 8-byte element of B is broadcast across a ymm register; the even-indexed and odd-indexed floats of each 32-byte half of A are duplicated (vmovsldup / vmovshdup) and accumulated separately via acc_m4n1_exp. n1 accumulates into ymm4-7; n2 additionally accumulates the second B element into ymm8-11. */ |
| 35 | +#define KERNEL_k1m8n1 \ |
| 36 | + "vbroadcastsd (%1),%%ymm0; addq $8,%1;" /* ymm0 = 8 bytes at b_ptr, broadcast; b_ptr += 8 */ \ |
| 37 | + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" acc_m4n1_exp(1,2,0,4,5)\ |
| 38 | + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2;" acc_m4n1_exp(1,2,0,6,7)\ |
| 39 | + "addq $64,%0;" /* a_ptr += 64 bytes */ |
| 40 | +#define KERNEL_k1m8n2 \ |
| 41 | + "vbroadcastsd (%1),%%ymm0; vbroadcastsd 8(%1),%%ymm1; addq $16,%1;" /* two B elements; b_ptr += 16 */ \ |
| 42 | + "vmovsldup (%0),%%ymm2; vmovshdup (%0),%%ymm3;" acc_m4n1_exp(2,3,0,4,5) acc_m4n1_exp(2,3,1,8,9)\ |
| 43 | + "vmovsldup 32(%0),%%ymm2; vmovshdup 32(%0),%%ymm3;" acc_m4n1_exp(2,3,0,6,7) acc_m4n1_exp(2,3,1,10,11)\ |
| 44 | + "addq $64,%0;" /* a_ptr += 64 bytes */ |
| 45 | +/* contracted accumulators for m8n4 and m8n6. acc_m8n2_con broadcasts single floats of B at byte offsets lboff and rboff from the address given by the variadic arguments (base, or base,index,scale) into ymm2/ymm3 and feeds each to acc_m8n1_con. KERNEL_1_* loads 64 bytes of A into ymm0/ymm1 and uses B offsets 0 and 8; KERNEL_2_* swaps adjacent floats of ymm0/ymm1 (vpermilps $177) and uses B offsets 4 and 12. The n4 form reads B at %1 and %1+r12, the n6 form also at %1+2*r12. */ |
| 46 | +#define acc_m8n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \ |
| 47 | + "vbroadcastss "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m8n1_con(ua,la,2,luc,llc)\ |
| 48 | + "vbroadcastss "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m8n1_con(ua,la,3,ruc,rlc) |
| 49 | +#define KERNEL_1_k1m8n4 \ |
| 50 | + "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;" /* load A, prefetch 512 bytes ahead, a_ptr += 64 */ \ |
| 51 | + acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1) |
| 52 | +#define KERNEL_2_k1m8n4 \ |
| 53 | + "vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\ |
| 54 | + acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1) |
| 55 | +#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2) |
| 56 | +#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2) |
| 57 | +#define KERNEL_k1m8n4 KERNEL_1_k1m8n4 KERNEL_2_k1m8n4 "addq $16,%1;" /* b_ptr += 16 after both halves */ |
| 58 | +#define KERNEL_k1m8n6 KERNEL_1_k1m8n6 KERNEL_2_k1m8n6 "addq $16,%1;" /* b_ptr += 16 after both halves */ |
| 59 | +#define zero_4ymm(no1,no2,no3,no4) /* clears four ymm registers by XORing each with itself */ \ |
| 60 | + "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\ |
| 61 | + "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";" |
| 62 | +/* initialization and storage macros: INIT_m8n<N> zeroes exactly the accumulators that KERNEL_k1m8n<N> writes (ymm4-7 for n1, ymm4-11 for n2 and n4, ymm4-15 for n6) */ |
| 63 | +#define INIT_m8n1 zero_4ymm(4,5,6,7) |
| 64 | +#define INIT_m8n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) |
| 65 | +#define INIT_m8n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) |
| 66 | +#define INIT_m8n6 INIT_m8n4 zero_4ymm(12,13,14,15) |
| 67 | +#if A_CONJ == B_CONJ /* cont_expacc: swaps adjacent floats of cr in place, then merges cl and cr into dst with vaddsubps; only the source operand order differs between the two modes */ |
| 68 | + #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";" |
| 69 | +#else |
| 70 | + #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";" |
| 71 | +#endif |
| 72 | +#if A_CONJ == 0 /* save_1ymm: read-modify-write of 32 bytes at off(__VA_ARGS__) using accumulator c and the broadcast registers alpr/alpi; tmp is overwritten. With A_CONJ == 0 the result is built in c (which is clobbered), otherwise it is built in tmp */ |
| 73 | + #define save_1ymm(c,tmp,off,alpr,alpi,...) \ |
| 74 | + "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213ps "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\ |
| 75 | + "vfmsubadd231ps %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovups %%ymm"#c","#off"("#__VA_ARGS__");" |
| 76 | +#else |
| 77 | + #define save_1ymm(c,tmp,off,alpr,alpi,...) \ |
| 78 | + "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213ps "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\ |
| 79 | + "vfmaddsub231ps %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovups %%ymm"#tmp","#off"("#__VA_ARGS__");" |
| 80 | +#endif |
| 81 | +#define save_init_m8 "movq %2,%3; addq $64,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;" /* c_tmp = c_ptr; c_ptr += 64 bytes; ymm0 and ymm1 = the two floats at %6, each broadcast */ |
| 82 | +#define SAVE_m8n1 save_init_m8 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3) /* expanded accumulators are contracted first, then 64 bytes at c_tmp are updated */ |
| 83 | +#define SAVE_m8n2 SAVE_m8n1\ |
| 84 | + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1) |
| 85 | +#define SAVE_m8n4 save_init_m8 /* n4/n6 accumulators need no cont_expacc step; successive n indices are stored %4 bytes apart and c_tmp advances by 2*%4 per pair */ \ |
| 86 | + save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\ |
| 87 | + save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1) |
| 88 | +#define SAVE_m8n6 SAVE_m8n4 "leaq (%3,%4,2),%3;"\ |
| 89 | + save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1) |
| 90 | +#define COMPUTE_m8(ndim) /* one 8-row tile: zero accumulators, run k steps (unrolled by 4 with prefetching, then one at a time), store. Labels are prefixed with ndim */ \ |
| 91 | + "movq %%r14,%1;" INIT_m8n##ndim "movq %2,%3; movq %%r13,%5;" /* b_ptr = r14; c_tmp = c_ptr; k_counter = r13 */ \ |
| 92 | + "testq %5,%5; jz "#ndim"8883f; cmpq $10,%5; jb "#ndim"8882f;" /* k == 0: go straight to the store; k < 10: single-step loop only */ \ |
| 93 | + "movq $10,%5; movq $84,%%r15;" /* the unrolled loop counts k_counter upward from 10 */ \ |
| 94 | + #ndim"8881:\n\t" /* unrolled loop: four k steps per pass */ \ |
| 95 | + "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;" /* prefetch at c_tmp, then c_tmp += r15 - 63 */ \ |
| 96 | + "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ |
| 97 | + KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ |
| 98 | + "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;" /* r15 = %4 when (k_counter & 12) == 0, else 84; prefetch at b_pref, b_pref += 16 */ \ |
| 99 | + KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ |
| 100 | + "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"8881b;" /* repeat while r13 >= k_counter (unsigned compare) */ \ |
| 101 | + "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 7(%6);" /* c_tmp = c_ptr; k_counter = r13 + 10 - k_counter, i.e. the k steps still to run */ \ |
| 102 | + #ndim"8882:\n\t" /* single-step loop */ \ |
| 103 | + "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\ |
| 104 | + KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\ |
| 105 | + #ndim"8883:\n\t" /* store */ \ |
| 106 | + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim |
| 107 | + |
| 108 | +/* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators. Each k step duplicates the even-indexed and odd-indexed floats of 32 bytes of A (a_ptr += 32) and accumulates against 8-byte B elements broadcast with vbroadcastsd. acc_m4n2_exp handles two B elements at offsets 0 and 8 from the address given by its variadic arguments. KERNEL_h_* leaves b_ptr unchanged; KERNEL_k1m4n2/4/6 append the 16-byte b_ptr advance. */ |
| 109 | +#define KERNEL_k1m4n1 \ |
| 110 | + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ |
| 111 | + "vbroadcastsd (%1),%%ymm0;" acc_m4n1_exp(1,2,0,4,5) "addq $8,%1;" |
| 112 | +#define acc_m4n2_exp(c1l,c1r,c2l,c2r,...) \ |
| 113 | + "vbroadcastsd ("#__VA_ARGS__"),%%ymm2;" acc_m4n1_exp(0,1,2,c1l,c1r)\ |
| 114 | + "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3;" acc_m4n1_exp(0,1,3,c2l,c2r) |
| 115 | +#define KERNEL_h_k1m4n2 \ |
| 116 | + "vmovsldup (%0),%%ymm0; vmovshdup (%0),%%ymm1; addq $32,%0;" acc_m4n2_exp(4,5,6,7,%1) |
| 117 | +#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 acc_m4n2_exp(8,9,10,11,%1,%%r12,1) |
| 118 | +#define KERNEL_h_k1m4n6 KERNEL_h_k1m4n4 acc_m4n2_exp(12,13,14,15,%1,%%r12,2) |
| 119 | +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;" |
| 120 | +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" |
| 121 | +#define KERNEL_k1m4n6 KERNEL_h_k1m4n6 "addq $16,%1;" |
| 122 | +#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" /* INIT_m4n<N> zeroes ymm4-5 (n1), ymm4-7 (n2), ymm4-11 (n4), ymm4-15 (n6) */ |
| 123 | +#define INIT_m4n2 zero_4ymm(4,5,6,7) |
| 124 | +#define INIT_m4n4 INIT_m4n2 zero_4ymm(8,9,10,11) |
| 125 | +#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15) |
| 126 | +#define save_init_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;" /* c_tmp = c_ptr; c_ptr += 32 bytes; ymm0 and ymm1 = the two floats at %6, each broadcast */ |
| 127 | +#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3) /* each accumulator pair is contracted with cont_expacc before its 32-byte store */ |
| 128 | +#define SAVE_m4n2 SAVE_m4n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1) |
| 129 | +#define SAVE_m4n4 SAVE_m4n2 "leaq (%3,%4,2),%3;" /* c_tmp += 2 * %4 */ \ |
| 130 | + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1) |
| 131 | +#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;" /* c_tmp += 2 * %4 */ \ |
| 132 | + cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1) |
| 133 | +#define COMPUTE_m4(ndim) /* one 4-row tile: zero accumulators, run r13 single k steps (none when r13 == 0), store. Labels are prefixed with ndim */ \ |
| 134 | + "movq %%r14,%1;" INIT_m4n##ndim "movq %%r13,%5;" /* b_ptr = r14; k_counter = r13 */ \ |
| 135 | + "testq %5,%5; jz "#ndim"4442f;"\ |
| 136 | + #ndim"4441:\n\t"\ |
| 137 | + KERNEL_k1m4n##ndim\ |
| 138 | + "decq %5; jnz "#ndim"4441b;"\ |
| 139 | + #ndim"4442:\n\t"\ |
| 140 | + SAVE_m4n##ndim |
| 141 | + |
| 142 | +/* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators. Same scheme as the m=4 macros but on 128-bit registers: 16 bytes of A per k step (a_ptr += 16), B elements duplicated with vmovddup. KERNEL_h_* leaves b_ptr unchanged; KERNEL_k1m2n<N> append the b_ptr advance (8 bytes for n1, 16 bytes otherwise). INIT_m2n<N> zeroes xmm4-5 (n1), xmm4-7 (n2), xmm4-11 (n4), xmm4-15 (n6). */ |
| 143 | +#if A_CONJ == B_CONJ |
| 144 | + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" |
| 145 | +#else |
| 146 | + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" |
| 147 | +#endif |
| 148 | +#define KERNEL_h_k1m2n1 \ |
| 149 | + "vmovsldup (%0),%%xmm0; vmovshdup (%0),%%xmm1; addq $16,%0;"\ |
| 150 | + "vmovddup (%1),%%xmm2;" acc_m2n1_exp(0,1,2,4,5) |
| 151 | +#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1\ |
| 152 | + "vmovddup 8(%1),%%xmm3;" acc_m2n1_exp(0,1,3,6,7) |
| 153 | +#define acc_m2n2_exp(c1,c2,c3,c4,...)\ |
| 154 | + "vmovddup ("#__VA_ARGS__"),%%xmm2;" acc_m2n1_exp(0,1,2,c1,c2)\ |
| 155 | + "vmovddup 8("#__VA_ARGS__"),%%xmm3;" acc_m2n1_exp(0,1,3,c3,c4) |
| 156 | +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1) |
| 157 | +#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2) |
| 158 | +#define KERNEL_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;" |
| 159 | +#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;" |
| 160 | +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" |
| 161 | +#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $16,%1;" |
| 162 | +#define zero_2xmm(no1,no2) "vpxor %%xmm"#no1",%%xmm"#no1",%%xmm"#no1"; vpxor %%xmm"#no2",%%xmm"#no2",%%xmm"#no2";" |
| 163 | +#define INIT_m2n1 zero_2xmm(4,5) |
| 164 | +#define INIT_m2n2 INIT_m2n1 zero_2xmm(6,7) |
| 165 | +#define INIT_m2n4 INIT_m2n2 zero_2xmm(8,9) zero_2xmm(10,11) |
| 166 | +#define INIT_m2n6 INIT_m2n4 zero_2xmm(12,13) zero_2xmm(14,15) |
| 167 | +#if A_CONJ == B_CONJ /* cont_expxmmacc: 128-bit counterpart of cont_expacc - swaps adjacent floats of cr in place, then merges cl and cr into dst with vaddsubps */ |
| 168 | + #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";" |
| 169 | +#else |
| 170 | + #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";" |
| 171 | +#endif |
| 172 | +#if A_CONJ == 0 /* save_1xmm: read-modify-write of 16 bytes at (%3) using accumulator c and the broadcast registers alpr/alpi, then c_tmp += %4; tmp is overwritten */ |
| 173 | + #define save_1xmm(c,tmp,alpr,alpi) \ |
| 174 | + "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213ps (%3),%%xmm"#alpr",%%xmm"#c";"\ |
| 175 | + "vfmsubadd231ps %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovups %%xmm"#c",(%3); addq %4,%3;" |
| 176 | +#else |
| 177 | + #define save_1xmm(c,tmp,alpr,alpi) \ |
| 178 | + "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213ps (%3),%%xmm"#alpi",%%xmm"#tmp";"\ |
| 179 | + "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovups %%xmm"#tmp",(%3); addq %4,%3;" |
| 180 | +#endif |
| 181 | +#define save_init_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;" /* c_tmp = c_ptr; c_ptr += 16 bytes; xmm0 and xmm1 = the two floats at %6, each broadcast */ |
| 182 | +#define SAVE_m2n1 save_init_m2 cont_expxmmacc(4,5,4) save_1xmm(4,2,0,1) |
| 183 | +#define SAVE_m2n2 SAVE_m2n1 cont_expxmmacc(6,7,6) save_1xmm(6,3,0,1) /* SAVE_m2n<N>: contract each xmm accumulator pair with the 128-bit cont_expxmmacc (as SAVE_m2n1 does), then store it with save_1xmm, which also steps c_tmp by %4. The m=2 accumulators are only ever written by 128-bit VEX instructions, so the 256-bit cont_expacc is not needed here. */ |
| 184 | +#define SAVE_m2n4 SAVE_m2n2 cont_expxmmacc(8,9,8) save_1xmm(8,2,0,1) cont_expxmmacc(10,11,10) save_1xmm(10,3,0,1) |
| 185 | +#define SAVE_m2n6 SAVE_m2n4 cont_expxmmacc(12,13,12) save_1xmm(12,2,0,1) cont_expxmmacc(14,15,14) save_1xmm(14,3,0,1) |
| 186 | +#define COMPUTE_m2(ndim) /* one 2-row tile: zero accumulators, run r13 single k steps (none when r13 == 0), store. Labels are prefixed with ndim */ \ |
| 187 | + "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;" /* b_ptr = r14; k_counter = r13 */ \ |
| 188 | + "testq %5,%5; jz "#ndim"2222f;"\ |
| 189 | + #ndim"2221:\n\t"\ |
| 190 | + KERNEL_k1m2n##ndim\ |
| 191 | + "decq %5; jnz "#ndim"2221b;"\ |
| 192 | + #ndim"2222:\n\t"\ |
| 193 | + SAVE_m2n##ndim |
| 194 | + |
| 195 | +/* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators. Each k step broadcasts the two consecutive floats at a_ptr into xmm0 and xmm1 (a_ptr += 8) and multiplies them by B loaded without duplication: 8 bytes via vmovsd for n1, 16 bytes via vmovups per pair of n otherwise. acc_m1n1_exp and acc_m1n2_exp expand to the same instructions; only their parameter names differ. INIT_m1n<N> zeroes xmm4-5 (n1, n2), xmm4-7 (n4), xmm4-9 (n6). */ |
| 196 | +#if A_CONJ == B_CONJ |
| 197 | + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" |
| 198 | + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";" |
| 199 | +#else |
| 200 | + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" |
| 201 | + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfnmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";" |
| 202 | +#endif |
| 203 | +#define KERNEL_k1m1n1 \ |
| 204 | + "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\ |
| 205 | + "vmovsd (%1),%%xmm2; addq $8,%1;" acc_m1n1_exp(0,1,2,4,5) |
| 206 | +#define KERNEL_h_k1m1n2 \ |
| 207 | + "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\ |
| 208 | + "vmovups (%1),%%xmm2;" acc_m1n2_exp(0,1,2,4,5) |
| 209 | +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovups (%1,%%r12,1),%%xmm2;" acc_m1n2_exp(0,1,2,6,7) |
| 210 | +#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovups (%1,%%r12,2),%%xmm2;" acc_m1n2_exp(0,1,2,8,9) |
| 211 | +#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;" |
| 212 | +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;" |
| 213 | +#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $16,%1;" |
| 214 | +#define INIT_m1n1 zero_2xmm(4,5) |
| 215 | +#define INIT_m1n2 zero_2xmm(4,5) |
| 216 | +#define INIT_m1n4 INIT_m1n2 zero_2xmm(6,7) |
| 217 | +#define INIT_m1n6 INIT_m1n4 zero_2xmm(8,9) |
| 218 | +#if A_CONJ == 0 /* save_m1n1 updates the 8 bytes at (%3). save_m1n2 gathers 8 bytes from (%3) and 8 bytes from (%3,%4,1) into one xmm register, updates both, scatters them back and then advances c_tmp by 2 * %4. tmp1 and tmp2 are overwritten */ |
| 219 | + #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \ |
| 220 | + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c";"\ |
| 221 | + "vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c"; vmovsd %%xmm"#c",(%3);" |
| 222 | + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ |
| 223 | + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\ |
| 224 | + "vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c"; vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c";"\ |
| 225 | + "vmovsd %%xmm"#c",(%3); vmovhpd %%xmm"#c",(%3,%4,1); leaq (%3,%4,2),%3;" |
| 226 | +#else |
| 227 | + #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \ |
| 228 | + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1";"\ |
| 229 | + "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1"; vmovsd %%xmm"#tmp1",(%3);" |
| 230 | + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ |
| 231 | + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\ |
| 232 | + "vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1"; vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1";"\ |
| 233 | + "vmovsd %%xmm"#tmp1",(%3); vmovhpd %%xmm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;" |
| 234 | +#endif |
| 235 | +#define save_init_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;" /* c_tmp = c_ptr; c_ptr += 8 bytes; xmm0 and xmm1 = the two floats at %6, each broadcast */ |
| 236 | +#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,3,0,1) |
| 237 | +#define SAVE_m1n2 save_init_m1 cont_expxmmacc(4,5,4) save_m1n2(4,2,3,0,1) |
| 238 | +#define SAVE_m1n4 SAVE_m1n2 cont_expxmmacc(6,7,6) save_m1n2(6,2,3,0,1) |
| 239 | +#define SAVE_m1n6 SAVE_m1n4 cont_expxmmacc(8,9,8) save_m1n2(8,2,3,0,1) |
| 240 | +#define COMPUTE_m1(ndim) /* one single-row tile: zero accumulators, run r13 single k steps (none when r13 == 0), store. Labels are prefixed with ndim */ \ |
| 241 | + "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;" /* b_ptr = r14; k_counter = r13 */ \ |
| 242 | + "testq %5,%5; jz "#ndim"1112f;"\ |
| 243 | + #ndim"1111:\n\t"\ |
| 244 | + KERNEL_k1m1n##ndim\ |
| 245 | + "decq %5; jnz "#ndim"1111b;"\ |
| 246 | + #ndim"1112:\n\t"\ |
| 247 | + SAVE_m1n##ndim |
| 248 | + |
| 249 | +#define COMPUTE(ndim) { /* processes ndim values of n over all M rows. Expands in a scope that must provide a_ptr, b_ptr, c_ptr, c_tmp, ldc_in_bytes, K, alp, M, b_pref and LDC */ \ |
| 250 | + b_pref = b_ptr + ndim * K *2; /* prefetch pointer starts ndim * K * 2 floats past b_ptr */ \ |
| 251 | + __asm__ __volatile__ (\ |
| 252 | + "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $4,%%r12; movq %7,%%r11;" /* r14 = b_ptr, r13 = K, r12 = K * 16, r11 = M */ \ |
| 253 | + "cmpq $8,%7; jb "#ndim"9992f;"\ |
| 254 | + #ndim"9991:\n\t" /* 8-row tiles while at least 8 rows remain */ \ |
| 255 | + COMPUTE_m8(ndim)\ |
| 256 | + "subq $8,%7; cmpq $8,%7; jnb "#ndim"9991b;"\ |
| 257 | + #ndim"9992:\n\t" /* at most one 4-row tile */ \ |
| 258 | + "cmpq $4,%7; jb "#ndim"9993f;"\ |
| 259 | + COMPUTE_m4(ndim) "subq $4,%7;"\ |
| 260 | + #ndim"9993:\n\t" /* at most one 2-row tile */ \ |
| 261 | + "cmpq $2,%7; jb "#ndim"9994f;"\ |
| 262 | + COMPUTE_m2(ndim) "subq $2,%7;"\ |
| 263 | + #ndim"9994:\n\t" /* at most one single-row tile */ \ |
| 264 | + "testq %7,%7; jz "#ndim"9995f;"\ |
| 265 | + COMPUTE_m1(ndim)\ |
| 266 | + #ndim"9995:\n\t"\ |
| 267 | + "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;" /* restore b_ptr, K and M from the saved copies */ \ |
| 268 | + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\ |
| 269 | + ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\ |
| 270 | + "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ |
| 271 | + a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2; /* rewind a_ptr by M * K * 2 floats, step b_ptr past the ndim * K * 2 floats just used; c_ptr was advanced inside the asm by the tile stores and is adjusted by (ndim * LDC - M) * 2 floats */ \ |
| 272 | +} |
| 273 | + |
| 274 | +/* Kernel entry point: walks n in steps of 6, then 4, then 2, then a final single step, expanding COMPUTE for each; always returns 0. */ |
| 275 | +int __attribute__ ((noinline)) |
| 276 | +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) |
| 277 | +{ |
| 278 | + /* a zero dimension or an all-zero alpha returns before C is touched */ |
| 279 | + if(m==0 || n==0 || k==0) return 0; |
| 280 | + if(alphar==0.0 && alphai==0.0) return 0; |
| 281 | + /* M, K, ldc_in_bytes, alp and the *_ptr / c_tmp / b_pref names are referenced textually by COMPUTE */ |
| 282 | + int64_t M = (int64_t)m; |
| 283 | + int64_t K = (int64_t)k; |
| 284 | + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; |
| 285 | +#if A_CONJ == B_CONJ |
| 286 | + float alpha_pair[2] = {-alphar, -alphai}; /* both parts negated in this mode - presumably to match the sign convention of the save macros, confirm */ |
| 287 | +#else |
| 288 | + float alpha_pair[2] = {alphar, alphai}; |
| 289 | +#endif |
| 290 | + float *alp = alpha_pair; |
| 291 | + float *a_ptr = A; |
| 292 | + float *b_ptr = B; |
| 293 | + float *b_pref = B; |
| 294 | + float *c_ptr = C; |
| 295 | + float *c_tmp = C; |
| 296 | + BLASLONG cols_left = n; |
| 297 | + while(cols_left >= 6) { COMPUTE(6) cols_left -= 6; } |
| 298 | + while(cols_left >= 4) { COMPUTE(4) cols_left -= 4; } |
| 299 | + while(cols_left >= 2) { COMPUTE(2) cols_left -= 2; } |
| 300 | + if(cols_left >= 1) COMPUTE(1) |
| 301 | + return 0; |
| 302 | +} |
0 commit comments