2 | 2 | #include <stdint.h>
3 | 3 | #include <immintrin.h>
4 | 4 |
5 | | -//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
| 5 | +//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
| 6 | +
6 | 7 | /* row-major c_block */
7 | 8 | #define INNER_KERNEL_k1m1n8 \
8 | 9 | "prefetcht0 384(%1);"\

13 | 14 | INNER_KERNEL_k1m1n8\
14 | 15 | "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
15 | 16 |
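The row-major kernels above share one pattern: each k-step loads a vector of eight B elements, broadcasts successive scalar A elements, and accumulates one rank-1 update per accumulator register. A minimal intrinsics model of one m2n8 k-step (the function name and packed-pointer layout are illustrative assumptions, not part of the patch):

#include <immintrin.h>

/* One k-step of the row-major m2n8 kernel; acc0/acc1 model zmm8/zmm9. */
static inline void k1m2n8_step(const double *a, const double *b,
                               __m512d *acc0, __m512d *acc1)
{
    __m512d bv = _mm512_loadu_pd(b);         /* vmovupd (%1),%%zmm5       */
    __m512d a0 = _mm512_set1_pd(a[0]);       /* vbroadcastsd (%0),%%zmm4  */
    __m512d a1 = _mm512_set1_pd(a[1]);       /* vbroadcastsd 8(%0),%%zmm4 */
    *acc0 = _mm512_fmadd_pd(bv, a0, *acc0);  /* vfmadd231pd into zmm8     */
    *acc1 = _mm512_fmadd_pd(bv, a1, *acc1);  /* vfmadd231pd into zmm9     */
}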
16 | | -#define INNER_KERNEL_k1m4n8 \
17 | | - INNER_KERNEL_k1m2n8\
18 | | - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
19 | | - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
20 | | -
21 | | -#define INNER_KERNEL_k1m8n8 \
22 | | - INNER_KERNEL_k1m4n8\
23 | | - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
24 | | - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
25 | | - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
26 | | - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
27 | | -
28 | 17 | #define INNER_KERNEL_k1m1n16 \
29 | 18 | "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\
30 | 19 | "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\

34 | 23 | INNER_KERNEL_k1m1n16\
35 | 24 | "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
36 | 25 |
37 | | -#define INNER_KERNEL_k1m4n16 \
38 | | - INNER_KERNEL_k1m2n16\
39 | | - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
40 | | - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
41 | | -
42 | | -#define INNER_KERNEL_k1m8n16 \
43 | | - INNER_KERNEL_k1m4n16\
44 | | - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
45 | | - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
46 | | - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
47 | | - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
48 | | -
49 | 26 | #define INNER_KERNEL_k1m1n24 \
50 | 27 | "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\
51 | 28 | "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\

55 | 32 | INNER_KERNEL_k1m1n24\
56 | 33 | "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
57 | 34 |
| 35 | +/* row-major z-partition c_block */
| 36 | +#define INNER_KERNEL_k1m4n8 \
| 37 | + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\
| 38 | + "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\
| 39 | + "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;"
| 40 | +
| 41 | +#define INNER_KERNEL_k1m4n16 \
| 42 | + INNER_KERNEL_k1m4n8\
| 43 | + "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\
| 44 | + "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;"
| 45 | +
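The new z-partition kernels replace the scalar broadcasts with 128-bit pair broadcasts of A (vbroadcastf32x4) and even/odd duplication of B (vmovddup), so each accumulator holds an interleaved 2-row-by-4-column sub-tile of C rather than a piece of a single row. A hypothetical intrinsics sketch of one m4n8 k-step (names and layout assumptions are ours; the asm's vbroadcastf32x4 is modeled with its double-precision equivalent, and vmovddup's 64-byte memory operand with an unaligned load):

#include <immintrin.h>

/* One k-step of the z-partition m4n8 kernel; acc[0..3] model zmm8..zmm11. */
static inline void k1m4n8_zstep(const double *a, const double *b, __m512d acc[4])
{
    __m512d a01 = _mm512_broadcast_f64x2(_mm_loadu_pd(a));     /* {a0,a1} repeated 4x */
    __m512d a23 = _mm512_broadcast_f64x2(_mm_loadu_pd(a + 2)); /* {a2,a3} repeated 4x */
    __m512d be  = _mm512_movedup_pd(_mm512_loadu_pd(b));       /* {b0,b0,b2,b2,b4,b4,b6,b6} */
    __m512d bo  = _mm512_movedup_pd(_mm512_loadu_pd(b + 1));   /* {b1,b1,b3,b3,b5,b5,b7,b7} */
    acc[0] = _mm512_fmadd_pd(a01, be, acc[0]);  /* zmm8  */
    acc[1] = _mm512_fmadd_pd(a01, bo, acc[1]);  /* zmm9  */
    acc[2] = _mm512_fmadd_pd(a23, be, acc[2]);  /* zmm10 */
    acc[3] = _mm512_fmadd_pd(a23, bo, acc[3]);  /* zmm11 */
}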
58 | 46 | #define INNER_KERNEL_k1m4n24 \
59 | | - INNER_KERNEL_k1m2n24\
60 | | - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
61 | | - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
| 47 | + INNER_KERNEL_k1m4n16\
| 48 | + "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\
| 49 | + "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;"
62 | 50 |
63 | | -#define INNER_KERNEL_k1m8n24 \
64 | | - INNER_KERNEL_k1m4n24\
65 | | - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
66 | | - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
67 | | - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
68 | | - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
| 51 | +#define INNER_KERNEL_k1m8n8 \
| 52 | + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\
| 53 | + "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\
| 54 | + "prefetcht0 128(%1);"\
| 55 | + "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\
| 56 | + "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\
| 57 | + "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\
| 58 | + "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;"
| 59 | +
| 60 | +#define INNER_KERNEL_k1m8n16 \
| 61 | + INNER_KERNEL_k1m8n8\
| 62 | + "prefetcht0 128(%1,%%r12,2);"\
| 63 | + "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\
| 64 | + "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\
| 65 | + "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\
| 66 | + "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;"
69 | 67 |
| 68 | +#define INNER_KERNEL_k1m8n24 \
| 69 | + INNER_KERNEL_k1m8n16\
| 70 | + "prefetcht0 128(%1,%%r12,4);"\
| 71 | + "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\
| 72 | + "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\
| 73 | + "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\
| 74 | + "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;"
| 75 | +
| 76 | +/* micro kernels */
70 | 77 | #define INNER_KERNELm1(nn) \
71 | 78 | "cmpq $1,%2;jb "#nn"3f;"\
72 | 79 | #nn"4:\n\t"\

84 | 91 | #define INNER_KERNELm4(nn) \
85 | 92 | "cmpq $1,%2;jb "#nn"00f;"\
86 | 93 | #nn"01:\n\t"\
87 | | - INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
| 94 | + INNER_KERNEL_k1m4n##nn "addq $64,%1;"\
88 | 95 | "decq %2;cmpq $1,%2;jnb "#nn"01b;"\
89 | 96 | #nn"00:\n\t"
90 | 97 |
91 | 98 | /* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */
92 | 99 | #define INNER_KERNELm8(nn) \
93 | | - "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
| 100 | + "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\
94 | 101 | #nn"008:\n\t"\
95 | | - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
96 | | - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
| 102 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
| 103 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
| 104 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
97 | 105 | "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
98 | | - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
99 | | - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
100 | | - "prefetcht1 (%11); addq $16,%11;"\
101 | | - "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
| 106 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
| 107 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
| 108 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
| 109 | + "prefetcht1 (%11); addq $32,%11;"\
| 110 | + "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\
102 | 111 | "movq %3,%10;"\
103 | 112 | #nn"001:\n\t"\
104 | 113 | "cmpq $1,%2;jb "#nn"000f;"\
105 | 114 | "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
106 | | - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
| 115 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
107 | 116 | "decq %2;jmp "#nn"001b;"\
""#nn"000:\n\t" is reached when the k loop is exhausted:
108 | 117 | ""#nn"000:\n\t"
109 | 118 |
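The m8 loop is re-tuned here: the unrolled body now runs while at least 18 k-iterations remain and consumes 6 per pass (previously 16 and 4), with a C-tile prefetch after the third step and a next-B-block prefetch after the sixth; the tail loop then finishes one step at a time. A scalar control-flow sketch (step() is a stand-in for one INNER_KERNEL_k1m8n##nn invocation; the pointer increments are illustrative only):

/* Control-flow model of INNER_KERNELm8 after this change. */
static inline void step(const double **a, const double **b)
{
    /* placeholder for one k-step of the m8 kernel */
    *a += 4; *b += 8;
}

static void m8_loop_model(const double **a, const double **b, long k)
{
    while (k >= 18) {          /* "cmpq $18,%2;jb ...001f"              */
        step(a, b); step(a, b); step(a, b);
        /* prefetcht1 of one C row goes here, then advance by ldc       */
        step(a, b); step(a, b); step(a, b);
        /* prefetcht1 of the next packed B block goes here              */
        k -= 6;                /* "subq $6,%2;cmpq $18,%2;jnb ...008b"  */
    }
    while (k >= 1) {           /* tail loop, one k-step per pass        */
        /* prefetcht0 of two C rows goes here */
        step(a, b);
        k--;
    }
}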
207 | 216 | INNER_STORE_m1n8(%%zmm13,8)
208 | 217 |
209 | 218 | #define INNER_TRANS_4x8(c1,c2,c3,c4) \
210 | | - "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
211 | | - "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
212 | | - "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
213 | | - "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
214 | | - "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
215 | | - "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
| 219 | + "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\
| 220 | + "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\
| 221 | + "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\
| 222 | + "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\
| 223 | +
| 224 | +#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \
| 225 | + "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\
| 226 | + "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\
| 227 | + "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\
| 228 | + "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};"
216 | 229 |
217 | 230 | #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
218 | | - INNER_TRANS_4x8(c1,c2,c3,c4)\
219 | | - INNER_TRANS_4x8(c5,c6,c7,c8)\
220 | | - "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
221 | | - "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\
222 | | - "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
223 | | - "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\
224 | | - "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
225 | | - "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
226 | | - "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
227 | | - "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
| 231 | + INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8)
228 | 232 |
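INNER_TRANS_f128_4x4 transposes a 4x4 grid of 128-bit (two-double) lanes spread across four registers, which is what lets the z-partition accumulators be regrouped into rows of C before storage; INNER_TRANS_8x8 now reduces to two such passes over interleaved register sets. A hedged intrinsics sketch of the underlying lane transpose, built only on vshuff64x2 (the macro itself additionally uses masked vblendmpd with mask %6 and leaves the lanes in a permuted order that its store stage expects):

#include <immintrin.h>

/* Plain transpose of the 4x4 grid of 128-bit lanes held in r[0..3]:
   afterwards, r[j] holds lane j of each original register, in order. */
static inline void trans_f128_4x4(__m512d r[4])
{
    __m512d t0 = _mm512_shuffle_f64x2(r[0], r[1], 0x44); /* r0.l0 r0.l1 r1.l0 r1.l1 */
    __m512d t1 = _mm512_shuffle_f64x2(r[2], r[3], 0x44); /* r2.l0 r2.l1 r3.l0 r3.l1 */
    __m512d t2 = _mm512_shuffle_f64x2(r[0], r[1], 0xEE); /* r0.l2 r0.l3 r1.l2 r1.l3 */
    __m512d t3 = _mm512_shuffle_f64x2(r[2], r[3], 0xEE); /* r2.l2 r2.l3 r3.l2 r3.l3 */
    r[0] = _mm512_shuffle_f64x2(t0, t1, 0x88); /* lanes 0,2 of t0 | lanes 0,2 of t1 */
    r[1] = _mm512_shuffle_f64x2(t0, t1, 0xDD); /* lanes 1,3 of t0 | lanes 1,3 of t1 */
    r[2] = _mm512_shuffle_f64x2(t2, t3, 0x88);
    r[3] = _mm512_shuffle_f64x2(t2, t3, 0xDD);
}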
229 | 233 | //%7 for k01(input) only when m=4
230 | 234 | #define INNER_STORE_4x8(c1,c2,c3,c4) \

250 | 254 | INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
251 | 255 |
252 | 256 | #define INNER_SAVE_m4n16 \
253 | | - "movq %3,%10;"\
254 | | - INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
255 | | - INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
256 | | - INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
257 | | - INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
| 257 | + INNER_SAVE_m4n8\
| 258 | + INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
| 259 | + INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
258 | 260 |
259 | 261 | #define INNER_SAVE_m4n24 \
260 | | - "movq %3,%10;"\
261 | | - INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
262 | | - INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
263 | | - INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
264 | | - INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
265 | | - INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
266 | | - INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
| 262 | + INNER_SAVE_m4n16\
| 263 | + INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\
| 264 | + INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
267 | 265 |
268 | 266 | #define INNER_SAVE_m8n8 \
269 | 267 | "movq %3,%10;"\
270 | 268 | INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
271 | 269 | INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
272 | 270 |
273 | 271 | #define INNER_SAVE_m8n16 \
274 | | - "movq %3,%10;"\
275 | | - INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
276 | | - INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
277 | | - INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
278 | | - INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
| 272 | + INNER_SAVE_m8n8\
| 273 | + INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\
| 274 | + INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)
279 | 275 |
280 | 276 | #define INNER_SAVE_m8n24 \
281 | | - "movq %3,%10;"\
282 | | - INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
283 | | - INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
284 | | - INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
285 | | - INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
286 | | - INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
287 | | - INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
| 277 | + INNER_SAVE_m8n16\
| 278 | + INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\
| 279 | + INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)
288 | 280 |
289 | 281 | #define COMPUTE_n8 {\
290 | 282 | b_pref = packed_b_pointer + 8 * K;\

327 | 319 | "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
328 | 320 | :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
329 | 321 | "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
330 | | - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
| 322 | + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
331 | 323 | a_block_pointer -= M * K;\
332 | 324 | }
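Because the new z-partition kernels use zmm0-zmm2 as extra scratch registers, every clobber list in the file now has to name them as well; otherwise the compiler would be free to keep live values in those registers across the asm statement. A minimal illustration of the rule (not from the patch):

#include <immintrin.h>

static inline void scratch_clobber_demo(void)
{
    /* This asm overwrites zmm2 behind the compiler's back, so "zmm2"
       must appear in the clobber list or surrounding code may break. */
    __asm__ __volatile__("vpxorq %%zmm2,%%zmm2,%%zmm2;" ::: "zmm2");
}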
333 | 325 | #define COMPUTE_n16 {\

372 | 364 | "leaq (%1,%%r12,4),%1;"\
373 | 365 | :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
374 | 366 | "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
375 | | - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
| 367 | + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
376 | 368 | "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
377 | 369 | a_block_pointer -= M * K;\
378 | 370 | }

417 | 409 | "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
418 | 410 | "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
419 | 411 | :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
420 | | - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
421 | | - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
422 | | - "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
| 412 | + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\
| 413 | + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\
| 414 | + "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
423 | 415 | a_block_pointer -= M * K;\
424 | 416 | }
425 | 417 | static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8