Skip to content

Commit 4255a58

Browse files
authored
Rename operands to put lda on the input/output constraint list
1 parent 46e415b commit 4255a58

File tree

1 file changed

+61
-65
lines changed

1 file changed

+61
-65
lines changed

kernel/x86_64/sgemv_n_microk_haswell-4.c

Lines changed: 61 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
3737
__asm__ __volatile__
3838
(
3939
"vzeroupper \n\t"
40-
"vbroadcastss (%2), %%ymm12 \n\t" // x0
41-
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
42-
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
43-
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
44-
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
45-
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
46-
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
47-
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
40+
"vbroadcastss (%3), %%ymm12 \n\t" // x0
41+
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
42+
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
43+
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
44+
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
45+
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
46+
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
47+
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7
4848

4949
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
5050

51-
"movq %8, %%xmm10 \n\t" //save lda
52-
5351
"testq $0x04, %1 \n\t"
5452
"jz 2f \n\t"
5553

56-
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
54+
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
5755
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
5856
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
5957

60-
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
61-
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
62-
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
63-
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
58+
"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
59+
"vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
60+
"vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
61+
"vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
6462

65-
"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
66-
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
67-
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
68-
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
63+
"vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
64+
"vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
65+
"vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
66+
"vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
6967

7068
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
7169
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
7270
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
7371

74-
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
72+
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
7573

76-
"addq $4 , %8 \n\t"
74+
"addq $4 , %2 \n\t"
7775
"addq $4 , %0 \n\t"
7876
"subq $4 , %1 \n\t"
7977

@@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
8280
"testq $0x08, %1 \n\t"
8381
"jz 3f \n\t"
8482

85-
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
83+
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
8684
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
8785
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
8886

89-
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
90-
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
91-
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
92-
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
87+
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
88+
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
89+
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
90+
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
9391

94-
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
95-
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
96-
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
97-
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
92+
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
93+
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
94+
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
95+
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
9896

9997
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
10098
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
10199
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
102100

103101

104-
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
102+
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
105103

106-
"addq $8 , %8 \n\t"
104+
"addq $8 , %2 \n\t"
107105
"addq $8 , %0 \n\t"
108106
"subq $8 , %1 \n\t"
109107

@@ -118,61 +116,59 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
118116

119117
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
120118
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
121-
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
122-
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
123-
124-
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
125-
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
126-
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
127-
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
128-
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
129-
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
130-
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
131-
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
132-
133-
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
119+
"vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
120+
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
121+
122+
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
123+
"vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
124+
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
125+
"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
126+
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
127+
"vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
128+
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
129+
"vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
130+
131+
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
134132
"addq $16, %0 \n\t"
135-
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
136-
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
137-
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
138-
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
139-
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
140-
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
141-
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
133+
"vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
134+
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
135+
"vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
136+
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
137+
"vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
138+
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
139+
"vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
142140

143141
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
144142
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
145143

146-
"addq $16, %8 \n\t"
147-
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
144+
"addq $16, %2 \n\t"
145+
"vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
148146
"subq $16, %1 \n\t"
149-
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
147+
"vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
150148

151149
"jnz 1b \n\t"
152150

153151
"4: \n\t"
154152
"vzeroupper \n\t"
155-
"movq %%xmm10, %8 \n\t" //restore lda
156153

157154
:
158155
"+r" (i), // 0
159-
"+r" (n) // 1
156+
"+r" (n), // 1
157+
"+r" (lda4) // 2
160158
:
161-
"r" (x), // 2
162-
"r" (y), // 3
163-
"r" (ap[0]), // 4
164-
"r" (ap[1]), // 5
165-
"r" (ap[2]), // 6
166-
"r" (ap[3]), // 7
167-
"r" (lda4), // 8
159+
"r" (x), // 3
160+
"r" (y), // 4
161+
"r" (ap[0]), // 5
162+
"r" (ap[1]), // 6
163+
"r" (ap[2]), // 7
164+
"r" (ap[3]), // 8
168165
"r" (alpha) // 9
169166
: "cc",
170167
"%xmm0", "%xmm1",
171168
"%xmm2", "%xmm3",
172169
"%xmm4", "%xmm5",
173170
"%xmm6", "%xmm7",
174171
"%xmm8", "%xmm9",
175-
"%xmm10",
176172
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
177173
"memory"
178174
);

0 commit comments

Comments
 (0)