Skip to content

Commit 8242b1f

Browse files
authored
Fix inline assembly constraints
1 parent efb9038 commit 8242b1f

File tree

1 file changed

+247
-0
lines changed

1 file changed

+247
-0
lines changed

dgemv_n_microk_piledriver-4.c

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
/***************************************************************************
2+
Copyright (c) 2014, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
29+
30+
#define HAVE_KERNEL_4x8 1
31+
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
32+
33+
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
34+
{
35+
36+
BLASLONG register i = 0;
37+
38+
__asm__ __volatile__
39+
(
40+
"vzeroupper \n\t"
41+
"vbroadcastsd (%3), %%ymm12 \n\t" // x0
42+
"vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
43+
"vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
44+
"vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
45+
"vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
46+
"vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
47+
"vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
48+
"vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
49+
50+
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
51+
52+
"testq $0x04, %1 \n\t"
53+
"jz 2f \n\t"
54+
55+
"vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
56+
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
57+
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
58+
59+
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
60+
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
61+
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
62+
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
63+
64+
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
65+
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
66+
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
67+
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
68+
69+
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
70+
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
71+
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
72+
73+
74+
"vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
75+
76+
"addq $4 , %2 \n\t"
77+
"addq $4 , %0 \n\t"
78+
"subq $4 , %1 \n\t"
79+
80+
"2: \n\t"
81+
82+
"cmpq $0, %1 \n\t"
83+
"je 3f \n\t"
84+
85+
86+
".align 16 \n\t"
87+
"1: \n\t"
88+
89+
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
90+
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
91+
"vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
92+
"vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
93+
94+
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
95+
"vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
96+
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
97+
"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
98+
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
99+
"vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
100+
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
101+
"vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
102+
103+
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
104+
"addq $8 , %0 \n\t"
105+
"vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
106+
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
107+
"vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
108+
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
109+
"vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
110+
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
111+
"vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
112+
113+
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
114+
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
115+
116+
"addq $8 , %2 \n\t"
117+
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
118+
"subq $8 , %1 \n\t"
119+
"vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
120+
121+
"jnz 1b \n\t"
122+
123+
"3: \n\t"
124+
"vzeroupper \n\t"
125+
126+
:
127+
"+r" (i), // 0
128+
"+r" (n), // 1
129+
"+r" (lda4) // 2
130+
:
131+
"r" (x), // 3
132+
"r" (y), // 4
133+
"r" (ap[0]), // 5
134+
"r" (ap[1]), // 6
135+
"r" (ap[2]), // 7
136+
"r" (ap[3]), // 8
137+
"r" (alpha) // 9
138+
: "cc",
139+
"%xmm0", "%xmm1",
140+
"%xmm2", "%xmm3",
141+
"%xmm4", "%xmm5",
142+
"%xmm6", "%xmm7",
143+
"%xmm8", "%xmm9",
144+
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
145+
"memory"
146+
);
147+
148+
}
149+
150+
151+
152+
#define HAVE_KERNEL_4x4 1
153+
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
154+
155+
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
156+
{
157+
158+
BLASLONG register i = 0;
159+
160+
__asm__ __volatile__
161+
(
162+
"vzeroupper \n\t"
163+
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
164+
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
165+
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
166+
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
167+
168+
"vbroadcastsd (%8), %%ymm6 \n\t" // alpha
169+
170+
"testq $0x04, %1 \n\t"
171+
"jz 2f \n\t"
172+
173+
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
174+
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
175+
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
176+
177+
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
178+
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
179+
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
180+
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
181+
182+
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
183+
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
184+
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
185+
186+
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
187+
188+
"addq $4 , %0 \n\t"
189+
"subq $4 , %1 \n\t"
190+
191+
"2: \n\t"
192+
193+
"cmpq $0, %1 \n\t"
194+
"je 3f \n\t"
195+
196+
197+
".align 16 \n\t"
198+
"1: \n\t"
199+
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
200+
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
201+
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
202+
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
203+
204+
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
205+
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
206+
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
207+
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
208+
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
209+
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
210+
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
211+
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
212+
213+
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
214+
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
215+
216+
"vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
217+
"vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
218+
219+
"addq $8 , %0 \n\t"
220+
"subq $8 , %1 \n\t"
221+
"jnz 1b \n\t"
222+
223+
"3: \n\t"
224+
"vzeroupper \n\t"
225+
226+
:
227+
"+r" (i), // 0
228+
"+r" (n) // 1
229+
:
230+
"r" (x), // 2
231+
"r" (y), // 3
232+
"r" (ap[0]), // 4
233+
"r" (ap[1]), // 5
234+
"r" (ap[2]), // 6
235+
"r" (ap[3]), // 7
236+
"r" (alpha) // 8
237+
: "cc",
238+
"%xmm4", "%xmm5",
239+
"%xmm6", "%xmm7",
240+
"%xmm8", "%xmm9",
241+
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
242+
"memory"
243+
);
244+
245+
}
246+
247+

0 commit comments

Comments
 (0)