
Commit dd38b4e

Merge pull request #5225 from annop-w/gemv_n
Improve performance for SGEMVN on NEOVERSEN1
2 parents 3a088de + d535728

2 files changed: +220 -1 lines changed

kernel/arm64/KERNEL.NEOVERSEN1

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ DSCALKERNEL = scal.S
 CSCALKERNEL = zscal.S
 ZSCALKERNEL = zscal.S
 
-SGEMVNKERNEL = gemv_n.S
+SGEMVNKERNEL = sgemv_n_neon.c
 DGEMVNKERNEL = gemv_n.S
 CGEMVNKERNEL = zgemv_n.S
 ZGEMVNKERNEL = zgemv_n.S
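
Note: the N ("no transpose") variant of GEMV computes y := alpha*A*x + y for a column-major m-by-n matrix A with leading dimension lda. The change above routes only the single-precision kernel (SGEMVNKERNEL) to the new NEON C implementation below; the double-precision and complex variants keep the existing assembly kernels.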

kernel/arm64/sgemv_n_neon.c

Lines changed: 219 additions & 0 deletions
@@ -0,0 +1,219 @@
/***************************************************************************
Copyright (c) 2025, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <arm_neon.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
  BLASLONG i;
  BLASLONG ix, iy;
  BLASLONG j;
  FLOAT *a_ptr;
  FLOAT temp;

  ix = 0;
  a_ptr = a;

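  /*
   * Fast path for unit strides: the columns of A are processed in
   * blocks of four (then two, then one), and within each block y is
   * updated in chunks of eight, four, two and finally single rows
   * using NEON multiply-accumulate-by-lane intrinsics.
   */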
  if (inc_x == 1 && inc_y == 1) {
    FLOAT *a0_ptr = a + lda * 0;
    FLOAT *a1_ptr = a + lda * 1;
    FLOAT *a2_ptr = a + lda * 2;
    FLOAT *a3_ptr = a + lda * 3;
    FLOAT *a4_ptr = a + lda * 4;
    FLOAT *a5_ptr = a + lda * 5;
    FLOAT *a6_ptr = a + lda * 6;
    FLOAT *a7_ptr = a + lda * 7;

    j = 0;
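    /* Four columns at a time: load x[j..j+3] once, pre-scale it by
       alpha, and feed one lane per column into the row updates. */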
    while (j + 3 < n) {
      float32x4_t x0_vec = vld1q_f32(x + j);
      x0_vec = vmulq_n_f32(x0_vec, alpha);
      i = 0;
      while (i + 7 < m) {
        float32x4_t a00_vec = vld1q_f32(a0_ptr + i);
        float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4);
        float32x4_t a10_vec = vld1q_f32(a1_ptr + i);
        float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4);
        float32x4_t a20_vec = vld1q_f32(a2_ptr + i);
        float32x4_t a21_vec = vld1q_f32(a2_ptr + i + 4);
        float32x4_t a30_vec = vld1q_f32(a3_ptr + i);
        float32x4_t a31_vec = vld1q_f32(a3_ptr + i + 4);

        float32x4_t y0_vec = vld1q_f32(y + i);
        float32x4_t y1_vec = vld1q_f32(y + i + 4);
        y0_vec = vmlaq_laneq_f32(y0_vec, a00_vec, x0_vec, 0);
        y0_vec = vmlaq_laneq_f32(y0_vec, a10_vec, x0_vec, 1);
        y0_vec = vmlaq_laneq_f32(y0_vec, a20_vec, x0_vec, 2);
        y0_vec = vmlaq_laneq_f32(y0_vec, a30_vec, x0_vec, 3);
        y1_vec = vmlaq_laneq_f32(y1_vec, a01_vec, x0_vec, 0);
        y1_vec = vmlaq_laneq_f32(y1_vec, a11_vec, x0_vec, 1);
        y1_vec = vmlaq_laneq_f32(y1_vec, a21_vec, x0_vec, 2);
        y1_vec = vmlaq_laneq_f32(y1_vec, a31_vec, x0_vec, 3);

        vst1q_f32(y + i, y0_vec);
        vst1q_f32(y + i + 4, y1_vec);

        i += 8;
      }
      while (i + 3 < m) {
        float32x4_t a0_vec = vld1q_f32(a0_ptr + i);
        float32x4_t a1_vec = vld1q_f32(a1_ptr + i);
        float32x4_t a2_vec = vld1q_f32(a2_ptr + i);
        float32x4_t a3_vec = vld1q_f32(a3_ptr + i);

        float32x4_t y_vec = vld1q_f32(y + i);
        y_vec = vmlaq_laneq_f32(y_vec, a0_vec, x0_vec, 0);
        y_vec = vmlaq_laneq_f32(y_vec, a1_vec, x0_vec, 1);
        y_vec = vmlaq_laneq_f32(y_vec, a2_vec, x0_vec, 2);
        y_vec = vmlaq_laneq_f32(y_vec, a3_vec, x0_vec, 3);

        vst1q_f32(y + i, y_vec);

        i += 4;
      }
      while (i + 1 < m) {
        float32x2_t a0_vec = vld1_f32(a0_ptr + i);
        float32x2_t a1_vec = vld1_f32(a1_ptr + i);
        float32x2_t a2_vec = vld1_f32(a2_ptr + i);
        float32x2_t a3_vec = vld1_f32(a3_ptr + i);

        float32x2_t y_vec = vld1_f32(y + i);
        y_vec = vmla_laneq_f32(y_vec, a0_vec, x0_vec, 0);
        y_vec = vmla_laneq_f32(y_vec, a1_vec, x0_vec, 1);
        y_vec = vmla_laneq_f32(y_vec, a2_vec, x0_vec, 2);
        y_vec = vmla_laneq_f32(y_vec, a3_vec, x0_vec, 3);

        vst1_f32(y + i, y_vec);

        i += 2;
      }
      while (i < m) {
        y[i] += a0_ptr[i] * x0_vec[0];
        y[i] += a1_ptr[i] * x0_vec[1];
        y[i] += a2_ptr[i] * x0_vec[2];
        y[i] += a3_ptr[i] * x0_vec[3];

        i++;
      }

      a0_ptr += lda * 4;
      a1_ptr += lda * 4;
      a2_ptr += lda * 4;
      a3_ptr += lda * 4;

      j += 4;
    }
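    /* Remaining columns in pairs, with the two x values held in a
       64-bit NEON vector. */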
    while (j + 1 < n) {
      float32x2_t x0_vec = vld1_f32(x + j);
      x0_vec = vmul_n_f32(x0_vec, alpha);
      i = 0;
      while (i + 7 < m) {
        float32x4_t a00_vec = vld1q_f32(a0_ptr + i);
        float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4);
        float32x4_t a10_vec = vld1q_f32(a1_ptr + i);
        float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4);

        float32x4_t y0_vec = vld1q_f32(y + i);
        float32x4_t y1_vec = vld1q_f32(y + i + 4);
        y0_vec = vmlaq_lane_f32(y0_vec, a00_vec, x0_vec, 0);
        y0_vec = vmlaq_lane_f32(y0_vec, a10_vec, x0_vec, 1);
        y1_vec = vmlaq_lane_f32(y1_vec, a01_vec, x0_vec, 0);
        y1_vec = vmlaq_lane_f32(y1_vec, a11_vec, x0_vec, 1);

        vst1q_f32(y + i, y0_vec);
        vst1q_f32(y + i + 4, y1_vec);

        i += 8;
      }
      while (i + 3 < m) {
        float32x4_t a0_vec = vld1q_f32(a0_ptr + i);
        float32x4_t a1_vec = vld1q_f32(a1_ptr + i);

        float32x4_t y_vec = vld1q_f32(y + i);
        y_vec = vmlaq_lane_f32(y_vec, a0_vec, x0_vec, 0);
        y_vec = vmlaq_lane_f32(y_vec, a1_vec, x0_vec, 1);

        vst1q_f32(y + i, y_vec);

        i += 4;
      }
      while (i + 1 < m) {
        float32x2_t a0_vec = vld1_f32(a0_ptr + i);
        float32x2_t a1_vec = vld1_f32(a1_ptr + i);

        float32x2_t y_vec = vld1_f32(y + i);
        y_vec = vmla_lane_f32(y_vec, a0_vec, x0_vec, 0);
        y_vec = vmla_lane_f32(y_vec, a1_vec, x0_vec, 1);

        vst1_f32(y + i, y_vec);

        i += 2;
      }
      while (i < m) {
        y[i] += a0_ptr[i] * x0_vec[0];
        y[i] += a1_ptr[i] * x0_vec[1];

        i++;
      }

      a0_ptr += lda * 2;
      a1_ptr += lda * 2;

      j += 2;
    }
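    /* At most one column left: plain scalar update of y. */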
    while (j < n) {
      i = 0;
      temp = alpha * x[j];
      while (i < m) {
        y[i] += a0_ptr[i] * temp;
        i++;
      }

      a0_ptr += lda;
      j++;
    }
    return (0);
  }
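  /* General case (non-unit inc_x or inc_y): simple scalar sweep over
     the columns. */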
  for (j = 0; j < n; j++) {
    temp = alpha * x[ix];
    iy = 0;
    for (i = 0; i < m; i++) {
      y[iy] += temp * a_ptr[i];
      iy += inc_y;
    }
    a_ptr += lda;
    ix += inc_x;
  }
  return (0);
}
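
For context, this kernel sits behind the NoTrans case of SGEMV. Below is a minimal sketch that exercises the same operation through the CBLAS interface and checks it against a scalar reference; it assumes OpenBLAS is installed and the program is linked with -lopenblas, and the sizes m = 37, n = 11 are arbitrary choices meant to hit the 8/4/2/1-row and 4/2/1-column tail loops above.

#include <cblas.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int m = 37, n = 11;   /* odd sizes to exercise the tail loops */
  const int lda = m;          /* column-major, no padding */
  const float alpha = 1.5f;
  float *a = malloc(sizeof(float) * lda * n);
  float *x = malloc(sizeof(float) * n);
  float *y = malloc(sizeof(float) * m);
  float *ref = malloc(sizeof(float) * m);

  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++)
      a[j * lda + i] = (float)(i - j) / 8.0f;
  for (int j = 0; j < n; j++) x[j] = (float)j / 4.0f;
  for (int i = 0; i < m; i++) y[i] = ref[i] = 1.0f;

  /* Scalar reference: ref += alpha * A * x, column by column. */
  for (int j = 0; j < n; j++) {
    float t = alpha * x[j];
    for (int i = 0; i < m; i++) ref[i] += a[j * lda + i] * t;
  }

  /* beta = 1 so the library accumulates into y, as the kernel does. */
  cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, alpha,
              a, lda, x, 1, 1.0f, y, 1);

  float max_err = 0.0f;
  for (int i = 0; i < m; i++) max_err = fmaxf(max_err, fabsf(y[i] - ref[i]));
  printf("max |y - ref| = %g\n", max_err);

  free(a); free(x); free(y); free(ref);
  return 0;
}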
