
Commit efb9038
Fix inline assembly constraints
1 parent e976557

1 file changed: 97 additions, 97 deletions

kernel/x86_64/sgemv_n_microk_bulldozer-4.c
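What the change does: the asm body advances a second offset through the matrix columns using lda4 ("addq $4 , %8", "addq $8 , %8", "addq $16 , %8"), yet lda4 was declared as an input-only operand ("r" (lda4), operand %8). GCC assumes an asm statement leaves its input operands unmodified, so writing to %8 was undefined behavior. The fix declares lda4 as a read-write output operand ("+r" (lda4), now %2) and shifts x, y, and ap[0]..ap[3] from %2..%7 up to %3..%8; every operand reference in the template is renumbered to match, leaving the generated instruction sequence otherwise unchanged.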
@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 
 	__asm__ __volatile__
 	(
-	"vbroadcastss    (%2), %%xmm12 \n\t" // x0
-	"vbroadcastss   4(%2), %%xmm13 \n\t" // x1
-	"vbroadcastss   8(%2), %%xmm14 \n\t" // x2
-	"vbroadcastss  12(%2), %%xmm15 \n\t" // x3
-	"vbroadcastss  16(%2), %%xmm0  \n\t" // x4
-	"vbroadcastss  20(%2), %%xmm1  \n\t" // x5
-	"vbroadcastss  24(%2), %%xmm2  \n\t" // x6
-	"vbroadcastss  28(%2), %%xmm3  \n\t" // x7
+	"vbroadcastss    (%3), %%xmm12 \n\t" // x0
+	"vbroadcastss   4(%3), %%xmm13 \n\t" // x1
+	"vbroadcastss   8(%3), %%xmm14 \n\t" // x2
+	"vbroadcastss  12(%3), %%xmm15 \n\t" // x3
+	"vbroadcastss  16(%3), %%xmm0  \n\t" // x4
+	"vbroadcastss  20(%3), %%xmm1  \n\t" // x5
+	"vbroadcastss  24(%3), %%xmm2  \n\t" // x6
+	"vbroadcastss  28(%3), %%xmm3  \n\t" // x7
 
 	"vbroadcastss    (%9), %%xmm8  \n\t" // alpha
 
@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 	"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 	"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
 
-	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
-	"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
-	"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
-	"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
 	"addq $4 , %0 \n\t"
 
-	"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
-	"vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
-	"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
-	"vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
-	"addq $4 , %8 \n\t"
+	"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+	"vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+	"vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
+	"addq $4 , %2 \n\t"
 
 	"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
-	"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
+	"vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
 	"subq $4 , %1 \n\t"
-	"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
+	"vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y
 
 	"2: \n\t"
 
@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 	"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 	"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
 
-	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
-	"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
-	"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
-	"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
-
-	"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
-	"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
-	"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
-	"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
+
+	"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
 
-	"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
-	"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
-	"vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
-	"vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
+	"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
+	"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
+	"vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
+	"vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y
 
 	"addq $8 , %0 \n\t"
-	"addq $8 , %8 \n\t"
+	"addq $8 , %2 \n\t"
 	"subq $8 , %1 \n\t"
 
 
@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 	"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
 	"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
 
-	"prefetcht0 192(%4,%0,4) \n\t"
-	"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
 	"prefetcht0 192(%5,%0,4) \n\t"
-	"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
 	"prefetcht0 192(%6,%0,4) \n\t"
-	"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
+	"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
 	"prefetcht0 192(%7,%0,4) \n\t"
-	"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
+	"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
+	"prefetcht0 192(%8,%0,4) \n\t"
+	"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
 	".align 2 \n\t"
-	"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
-
-	"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
-	"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
-	"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
-	"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
-	"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
-	"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
-	"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
-	"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
-
-	"prefetcht0 192(%4,%8,4) \n\t"
-	"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
-	"prefetcht0 192(%5,%8,4) \n\t"
-	"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
-	"prefetcht0 192(%6,%8,4) \n\t"
-	"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
-	"prefetcht0 192(%7,%8,4) \n\t"
-	"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
-	"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+	"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
+
+	"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
+	"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
+	"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
+	"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
+	"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
+	"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
+	"vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
+	"vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
+
+	"prefetcht0 192(%5,%2,4) \n\t"
+	"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
+	"prefetcht0 192(%6,%2,4) \n\t"
+	"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+	"prefetcht0 192(%7,%2,4) \n\t"
+	"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
+	"prefetcht0 192(%8,%2,4) \n\t"
+	"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
+	"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
 
-	"vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
-	"vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
-	"vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
-	"vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
-	"vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
-	"vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
-	"vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
-	"vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
+	"vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
+	"vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
+	"vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
+	"vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
+	"vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
+	"vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
+	"vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
+	"vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
 
-	"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
-	"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
-	"vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
-	"vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
+	"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
+	"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
+	"vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
+	"vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
 
 	"addq $16, %0 \n\t"
-	"vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
-	"vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
-	"addq $16, %8 \n\t"
-	"vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
-	"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
+	"vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
+	"vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
+	"addq $16, %2 \n\t"
+	"vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
+	"vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y
 
 	"subq $16, %1 \n\t"
 	"jnz 1b \n\t"
@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 
 	:
 	  "+r" (i),	// 0
-	  "+r" (n)	// 1
+	  "+r" (n),	// 1
+	  "+r" (lda4)	// 2
 	:
-	  "r" (x),	// 2
-	  "r" (y),	// 3
-	  "r" (ap[0]),	// 4
-	  "r" (ap[1]),	// 5
-	  "r" (ap[2]),	// 6
-	  "r" (ap[3]),	// 7
-	  "r" (lda4),	// 8
+	  "r" (x),	// 3
+	  "r" (y),	// 4
+	  "r" (ap[0]),	// 5
+	  "r" (ap[1]),	// 6
+	  "r" (ap[2]),	// 7
+	  "r" (ap[3]),	// 8
 	  "r" (alpha)	// 9
 	: "cc",
 	  "%xmm0", "%xmm1",
