|
104 | 104 | KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\
|
105 | 105 | #ndim"8883:\n\t"\
|
106 | 106 | "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim
|
| 107 | + |
107 | 108 | /* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
|
108 | 109 | #define KERNEL_k1m4n1 \
|
109 | 110 | "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
|
|
137 | 138 | "decq %5; jnz "#ndim"4441b;"\
|
138 | 139 | #ndim"4442:\n\t"\
|
139 | 140 | SAVE_m4n##ndim
|
| 141 | + |
140 | 142 | /* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */
|
141 | 143 | #if A_CONJ == B_CONJ
|
142 | 144 | #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
|
|
189 | 191 | "decq %5; jnz "#ndim"2221b;"\
|
190 | 192 | #ndim"2222:\n\t"\
|
191 | 193 | SAVE_m2n##ndim
|
| 194 | + |
192 | 195 | /* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */
|
193 | 196 | #if A_CONJ == B_CONJ
|
194 | 197 | #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
|
|
242 | 245 | "decq %5; jnz "#ndim"1111b;"\
|
243 | 246 | #ndim"1112:\n\t"\
|
244 | 247 | SAVE_m1n##ndim
|
| 248 | + |
245 | 249 | #define COMPUTE(ndim) {\
|
246 | 250 | b_pref = b_ptr + ndim * K *2;\
|
247 | 251 | __asm__ __volatile__ (\
|
|
266 | 270 | "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
|
267 | 271 | a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
|
268 | 272 | }
|
| 273 | + |
269 | 274 | int __attribute__ ((noinline))
|
270 | 275 | CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
|
271 | 276 | {
|
|
0 commit comments