Skip to content

Commit d711906

Browse files
committed
Add symv kernels for arm64
1 parent 39718cd commit d711906

File tree

8 files changed

+644
-0
lines changed

8 files changed

+644
-0
lines changed

kernel/arm64/KERNEL.ARMV8SVE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,11 @@ DGEMVTKERNEL = gemv_t_sve_v1x3.c
8484
CGEMVTKERNEL = zgemv_t.S
8585
ZGEMVTKERNEL = zgemv_t.S
8686

87+
SSYMV_L_KERNEL = symv_L_sve_v1x4.c
88+
SSYMV_U_KERNEL = symv_U_sve_v1x4.c
89+
DSYMV_L_KERNEL = symv_L_sve_v1x4.c
90+
DSYMV_U_KERNEL = symv_U_sve_v1x4.c
91+
8792
SASUMKERNEL = sasum_thunderx2t99.c
8893
DASUMKERNEL = dasum_thunderx2t99.c
8994
CASUMKERNEL = casum_thunderx2t99.c

kernel/arm64/KERNEL.NEOVERSEN1

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S
7070
CGEMVTKERNEL = zgemv_t.S
7171
ZGEMVTKERNEL = zgemv_t.S
7272

73+
SSYMV_L_KERNEL = symv_L_asimd_4x4.c
74+
SSYMV_U_KERNEL = symv_U_asimd_4x4.c
75+
DSYMV_L_KERNEL = symv_L_asimd_4x4.c
76+
DSYMV_U_KERNEL = symv_U_asimd_4x4.c
7377

7478
SASUMKERNEL = sasum_thunderx2t99.c
7579
DASUMKERNEL = dasum_thunderx2t99.c

kernel/arm64/symv_L_asimd_4x4.c

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
/***************************************************************************
2+
Copyright (c) 2025, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
8+
1. Redistributions of source code must retain the above copyright
9+
notice, this list of conditions and the following disclaimer.
10+
11+
2. Redistributions in binary form must reproduce the above copyright
12+
notice, this list of conditions and the following disclaimer in
13+
the documentation and/or other materials provided with the
14+
distribution.
15+
3. Neither the name of the OpenBLAS project nor the names of
16+
its contributors may be used to endorse or promote products
17+
derived from this software without specific prior written
18+
permission.
19+
20+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
29+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30+
*****************************************************************************/
31+
32+
#include "symv_microk_asimd_4x4.c"
33+
34+
/*
 * Symmetric matrix-vector update, lower-triangular storage, processed four
 * columns at a time with the ASIMD 4x4 micro-kernel.
 *
 * For every column j in [0, offset) the stored entries a[j*lda + i], i >= j,
 * are applied twice: once as a column (y[i] += alpha*x[j]*a[j*lda+i]) and once
 * mirrored as a row (y[j] += alpha * sum_i a[j*lda+i]*x[i], i > j).
 *
 *   m        number of elements of x/y that are read, and the row bound
 *   offset   number of leading columns handled by this call
 *   alpha    scalar applied to every contribution
 *   a, lda   matrix data and the distance between consecutive columns
 *   x, inc_x input vector and its stride
 *   y, inc_y vector updated in place and its stride
 *   buffer   scratch space; receives a unit-stride copy of y (first m
 *            elements) and/or of x when the respective stride is not 1
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG row, col, k;
    FLOAT *xv = x;
    FLOAT *yv = y;

    /* Strided vectors are packed into the scratch buffer so that the loops
     * below can index them with unit stride. */
    if (inc_y != 1) {
        yv = buffer;
        COPY_K(m, y, inc_y, yv, 1);
    }
    if (inc_x != 1) {
        /* When y already occupies the start of the buffer, x goes after it. */
        xv = (inc_y != 1) ? yv + m : buffer;
        COPY_K(m, x, inc_x, xv, 1);
    }

    const BLASLONG blocked_cols = (offset / 4) * 4;
    /* Rows at or beyond this index are handled by the scalar remainder loop. */
    const BLASLONG blocked_rows = (m / 4) * 4;

    for (col = 0; col < blocked_cols; col += 4) {
        FLOAT *c0 = &a[col * lda];
        FLOAT *c1 = c0 + lda;
        FLOAT *c2 = c1 + lda;
        FLOAT *c3 = c2 + lda;
        FLOAT *column[4] = { c0, c1, c2, c3 };
        FLOAT scaled_x[4];
        FLOAT dot[4];

        const FLOAT x0 = xv[col];
        const FLOAT x1 = xv[col + 1];
        const FLOAT x2 = xv[col + 2];
        const FLOAT x3 = xv[col + 3];

        /* 4x4 diagonal block: entries above the diagonal are taken from
         * their mirrored positions in the stored lower triangle. */
        dot[0] = c0[col    ] * x0 + c0[col + 1] * x1 + c0[col + 2] * x2 + c0[col + 3] * x3;
        dot[1] = c0[col + 1] * x0 + c1[col + 1] * x1 + c1[col + 2] * x2 + c1[col + 3] * x3;
        dot[2] = c0[col + 2] * x0 + c1[col + 2] * x1 + c2[col + 2] * x2 + c2[col + 3] * x3;
        dot[3] = c0[col + 3] * x0 + c1[col + 3] * x1 + c2[col + 3] * x2 + c3[col + 3] * x3;

        scaled_x[0] = alpha * x0;
        scaled_x[1] = alpha * x1;
        scaled_x[2] = alpha * x2;
        scaled_x[3] = alpha * x3;

        /* Rows below the diagonal block, up to the last multiple of four. */
        if (blocked_rows > col + 4) {
            symv_kernel_4x4(col + 4, blocked_rows, c0, c1, c2, c3,
                            xv, yv, scaled_x, dot);
        }

        /* Remaining rows (fewer than four), one at a time. */
        for (row = blocked_rows; row < m; row++) {
            for (k = 0; k < 4; k++) {
                yv[row] += scaled_x[k] * column[k][row];
                dot[k]  += column[k][row] * xv[row];
            }
        }

        for (k = 0; k < 4; k++) {
            yv[col + k] += alpha * dot[k];
        }
    }

    /* Columns left over after the blocks of four. */
    for (col = blocked_cols; col < offset; col++) {
        FLOAT *cj = &a[col * lda];
        const FLOAT ax = alpha * xv[col];
        FLOAT acc = 0.0;

        yv[col] += ax * cj[col];
        for (row = col + 1; row < m; row++) {
            yv[row] += ax * cj[row];
            acc     += cj[row] * xv[row];
        }
        yv[col] += alpha * acc;
    }

    /* Scatter the packed result back into the caller's strided y. */
    if (inc_y != 1) {
        COPY_K(m, yv, 1, y, inc_y);
    }
    return 0;
}

kernel/arm64/symv_L_sve_v1x4.c

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/***************************************************************************
2+
Copyright (c) 2025, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
8+
1. Redistributions of source code must retain the above copyright
9+
notice, this list of conditions and the following disclaimer.
10+
11+
2. Redistributions in binary form must reproduce the above copyright
12+
notice, this list of conditions and the following disclaimer in
13+
the documentation and/or other materials provided with the
14+
distribution.
15+
3. Neither the name of the OpenBLAS project nor the names of
16+
its contributors may be used to endorse or promote products
17+
derived from this software without specific prior written
18+
permission.
19+
20+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
29+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30+
*****************************************************************************/
31+
32+
#include "symv_microk_sve_v1x4.c"
33+
34+
/*
 * Symmetric matrix-vector update, lower-triangular storage, processed four
 * columns at a time with the SVE v1x4 micro-kernel.
 *
 * For every column j in [0, offset) the stored entries a[j*lda + i], i >= j,
 * contribute as a column (y[i] += alpha*x[j]*a[j*lda+i]) and, mirrored, as a
 * row (y[j] += alpha * sum_i a[j*lda+i]*x[i], i > j).
 *
 *   m        number of elements of x/y that are read, and the row bound
 *   offset   number of leading columns handled by this call
 *   alpha    scalar applied to every contribution
 *   a, lda   matrix data and the distance between consecutive columns
 *   x, inc_x input vector and its stride
 *   y, inc_y vector updated in place and its stride
 *   buffer   scratch space; receives a unit-stride copy of y (first m
 *            elements) and/or of x when the respective stride is not 1
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG row, col, k;
    FLOAT *xv = x;
    FLOAT *yv = y;

    /* Pack strided vectors into the scratch buffer for unit-stride access. */
    if (inc_y != 1) {
        yv = buffer;
        COPY_K(m, y, inc_y, yv, 1);
    }
    if (inc_x != 1) {
        /* x is placed behind the packed y when both had to be copied. */
        xv = (inc_y != 1) ? yv + m : buffer;
        COPY_K(m, x, inc_x, xv, 1);
    }

    const BLASLONG blocked_cols = (offset / 4) * 4;

    for (col = 0; col < blocked_cols; col += 4) {
        FLOAT *c0 = &a[col * lda];
        FLOAT *c1 = c0 + lda;
        FLOAT *c2 = c1 + lda;
        FLOAT *c3 = c2 + lda;
        FLOAT scaled_x[4];
        FLOAT dot[4];

        const FLOAT x0 = xv[col];
        const FLOAT x1 = xv[col + 1];
        const FLOAT x2 = xv[col + 2];
        const FLOAT x3 = xv[col + 3];

        /* 4x4 diagonal block: entries above the diagonal are taken from
         * their mirrored positions in the stored lower triangle. */
        dot[0] = c0[col    ] * x0 + c0[col + 1] * x1 + c0[col + 2] * x2 + c0[col + 3] * x3;
        dot[1] = c0[col + 1] * x0 + c1[col + 1] * x1 + c1[col + 2] * x2 + c1[col + 3] * x3;
        dot[2] = c0[col + 2] * x0 + c1[col + 2] * x1 + c2[col + 2] * x2 + c2[col + 3] * x3;
        dot[3] = c0[col + 3] * x0 + c1[col + 3] * x1 + c2[col + 3] * x2 + c3[col + 3] * x3;

        scaled_x[0] = alpha * x0;
        scaled_x[1] = alpha * x1;
        scaled_x[2] = alpha * x2;
        scaled_x[3] = alpha * x3;

        /* The micro-kernel is handed the full remaining row range up to m;
         * no scalar row remainder follows it in this variant. */
        symv_kernel_v1x4(col + 4, m, c0, c1, c2, c3, xv, yv, scaled_x, dot);

        for (k = 0; k < 4; k++) {
            yv[col + k] += alpha * dot[k];
        }
    }

    /* Columns left over after the blocks of four. */
    for (col = blocked_cols; col < offset; col++) {
        FLOAT *cj = &a[col * lda];
        const FLOAT ax = alpha * xv[col];
        FLOAT acc = 0.0;

        yv[col] += ax * cj[col];
        for (row = col + 1; row < m; row++) {
            yv[row] += ax * cj[row];
            acc     += cj[row] * xv[row];
        }
        yv[col] += alpha * acc;
    }

    /* Scatter the packed result back into the caller's strided y. */
    if (inc_y != 1) {
        COPY_K(m, yv, 1, y, inc_y);
    }
    return 0;
}

kernel/arm64/symv_U_asimd_4x4.c

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
/***************************************************************************
2+
Copyright (c) 2025, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
8+
1. Redistributions of source code must retain the above copyright
9+
notice, this list of conditions and the following disclaimer.
10+
11+
2. Redistributions in binary form must reproduce the above copyright
12+
notice, this list of conditions and the following disclaimer in
13+
the documentation and/or other materials provided with the
14+
distribution.
15+
3. Neither the name of the OpenBLAS project nor the names of
16+
its contributors may be used to endorse or promote products
17+
derived from this software without specific prior written
18+
permission.
19+
20+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
29+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30+
*****************************************************************************/
31+
32+
#include "symv_microk_asimd_4x4.c"
33+
34+
/*
 * Symmetric matrix-vector update, upper-triangular storage, processed four
 * columns at a time with the ASIMD 4x4 micro-kernel.
 *
 * For every column j in [m - offset, m) the stored entries a[j*lda + i],
 * i <= j, contribute as a column (y[i] += alpha*x[j]*a[j*lda+i]) and,
 * mirrored, as a row (y[j] += alpha * sum_i a[j*lda+i]*x[i], i < j).
 *
 *   m        number of elements of x/y that are read; one past the last column
 *   offset   number of trailing columns handled by this call
 *   alpha    scalar applied to every contribution
 *   a, lda   matrix data and the distance between consecutive columns
 *   x, inc_x input vector and its stride
 *   y, inc_y vector updated in place and its stride
 *   buffer   scratch space; receives a unit-stride copy of y (first m
 *            elements) and/or of x when the respective stride is not 1
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i, j, k;
    FLOAT tmp1[4];
    FLOAT tmp2[4];
    FLOAT *X = x;
    FLOAT *Y = y;

    const BLASLONG first_col = m - offset;

    /* Pack strided vectors into the scratch buffer for unit-stride access. */
    if (inc_y != 1) {
        Y = buffer;
        COPY_K(m, y, inc_y, Y, 1);
    }
    if (inc_x != 1) {
        /* x is placed behind the packed y when both had to be copied. */
        X = (inc_y != 1) ? Y + m : buffer;
        COPY_K(m, x, inc_x, X, 1);
    }

    /* Columns [first_col, blocked_end) form whole groups of four; the
     * remaining offset % 4 columns go through the scalar loop below. */
    const BLASLONG blocked_end = m - (offset % 4);

    for (j = first_col; j < blocked_end; j += 4) {
        FLOAT *a0 = &a[j * lda];
        FLOAT *a1 = a0 + lda;
        FLOAT *a2 = a1 + lda;
        FLOAT *a3 = a2 + lda;

        for (k = 0; k < 4; k++) {
            tmp1[k] = alpha * X[j + k];
            tmp2[k] = 0.0;
        }

        /* Rows above the block, in multiples of four, go to the micro-kernel.
         * NOTE(review): assumes symv_kernel_4x4(from, to, ...) covers the
         * half-open row range [from, to); the kernel lives in
         * symv_microk_asimd_4x4.c and should be checked against this. */
        const BLASLONG aligned_rows = (j / 4) * 4;
        if (aligned_rows) {
            symv_kernel_4x4(0, aligned_rows, a0, a1, a2, a3, X, Y, tmp1, tmp2);
        }

        /* Scalar clean-up for each of the four columns. It starts at
         * aligned_rows rather than at j, so that when first_col is not a
         * multiple of four the rows in [aligned_rows, j) -- which the
         * micro-kernel call above did not receive -- are still accumulated.
         * For an aligned j the two bounds coincide. */
        for (k = 0; k < 4; k++) {
            const BLASLONG col = j + k;
            FLOAT *ac = &a[col * lda];
            FLOAT temp1 = tmp1[k];
            FLOAT temp2 = tmp2[k];

            for (i = aligned_rows; i < col; i++) {
                Y[i]  += temp1 * ac[i];
                temp2 += ac[i] * X[i];
            }
            Y[col] += temp1 * ac[col] + alpha * temp2;
        }
    }

    /* Columns left over after the blocks of four, handled entirely in
     * scalar code over all rows above the diagonal. */
    for ( ; j < m; j++) {
        FLOAT *ac = &a[j * lda];
        FLOAT temp1 = alpha * X[j];
        FLOAT temp2 = 0.0;

        for (i = 0; i < j; i++) {
            Y[i]  += temp1 * ac[i];
            temp2 += ac[i] * X[i];
        }
        Y[j] += temp1 * ac[j] + alpha * temp2;
    }

    /* Scatter the packed result back into the caller's strided y. */
    if (inc_y != 1) {
        COPY_K(m, Y, 1, y, inc_y);
    }
    return 0;
}

0 commit comments

Comments
 (0)