Skip to content

Commit 246ca29

Browse files
authored
Add ZARCH implementation of ?sum
as trivial copies of the respective ?asum kernels with the ABS and vflpsb calls removed
1 parent 9d717cb commit 246ca29

File tree

7 files changed

+587
-0
lines changed

7 files changed

+587
-0
lines changed

kernel/zarch/KERNEL.Z13

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c
3535
CASUMKERNEL = ../arm/zasum.c
3636
ZASUMKERNEL = zasum.c
3737

38+
# Plain-sum (?sum) kernels must not reuse the absolute-sum (?asum) sources:
# doing so would make ?sum return the sum of absolute values.
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = dsum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = zsum.c
42+
3843
SAXPYKERNEL = ../arm/axpy.c
3944
DAXPYKERNEL = daxpy.c
4045
CAXPYKERNEL = ../arm/zaxpy.c

kernel/zarch/KERNEL.Z14

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c
3535
CASUMKERNEL = casum.c
3636
ZASUMKERNEL = zasum.c
3737

38+
SSUMKERNEL = ssum.c
39+
DSUMKERNEL = dsum.c
40+
CSUMKERNEL = csum.c
41+
ZSUMKERNEL = zsum.c
42+
3843
SAXPYKERNEL = saxpy.c
3944
DAXPYKERNEL = daxpy.c
4045
CAXPYKERNEL = caxpy.c

kernel/zarch/KERNEL.ZARCH_GENERIC

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c
3535
CASUMKERNEL = ../arm/zasum.c
3636
ZASUMKERNEL = ../arm/zasum.c
3737

38+
SSUMKERNEL = ../arm/sum.c
39+
DSUMKERNEL = ../arm/sum.c
40+
CSUMKERNEL = ../arm/zsum.c
41+
ZSUMKERNEL = ../arm/zsum.c
42+
3843
SAXPYKERNEL = ../arm/axpy.c
3944
DAXPYKERNEL = ../arm/axpy.c
4045
CAXPYKERNEL = ../arm/zaxpy.c

kernel/zarch/csum.c

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
/***************************************************************************
2+
Copyright (c) 2013-2019, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#include "common.h"
29+
#include <math.h>
30+
31+
/*
 * Vectorized plain sum over n complex elements stored contiguously at x,
 * i.e. over 2 * n consecutive FLOAT values; real and imaginary parts are
 * simply added together.
 *
 * n must be a positive multiple of 32: the loop count is n >> 5, every
 * iteration consumes 256 bytes, and the loop body runs once before the
 * count is tested.  The single-precision vector adds and the 4-byte store
 * assume FLOAT is a 4-byte float.
 *
 * Returns the accumulated sum.
 */
static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__(/* v24..v31 are eight independent partial-sum accumulators */
          "vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* loop count = n / 32; r1 is the running byte offset into x */
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first 128 bytes of the 256-byte chunk */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          /* second 128 bytes of the chunk */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* fold the eight accumulators into v24 */
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v24,%%v24,%%v27\n\t"
          "vfasb %%v24,%%v24,%%v28\n\t"
          "vfasb %%v24,%%v24,%%v29\n\t"
          "vfasb %%v24,%%v24,%%v30\n\t"
          "vfasb %%v24,%%v24,%%v31\n\t"
          /* horizontal reduction of the four words of v24 into word 0 */
          "veslg %%v25,%%v24,32\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vrepf %%v25,%%v24,2\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          /* the operand name must match the [sum] output declared below */
          "vstef %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
99+
100+
/*
 * Plain sum of the real and imaginary parts of n complex elements of x,
 * taking one element every inc_x complex positions.
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the largest
 * multiple-of-32 prefix is handed to csum_kernel_32 and the remainder is
 * summed in scalar code; any other stride is summed entirely in scalar code.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG done = 0; /* complex elements consumed so far */
  BLASLONG pos = 0;  /* index of the current element's real part */
  BLASLONG step;     /* FLOAT distance between consecutive elements */

  if (n <= 0 || inc_x <= 0)
    return total;

  if (inc_x == 1) {
    BLASLONG blocked = n & -32;

    if (blocked > 0) {
      total = csum_kernel_32(blocked, x);
      done = blocked;
      pos = 2 * blocked;
    }
    step = 2;
  } else {
    step = 2 * inc_x;
  }

  /* scalar path: unit-stride remainder, or every element when strided */
  for (; done < n; done++, pos += step)
    total += x[pos] + x[pos + 1];

  return total;
}

kernel/zarch/dsum.c

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
/***************************************************************************
2+
Copyright (c) 2013-2019, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#include "common.h"
29+
#include <math.h>
30+
31+
/*
 * Vectorized plain sum over n FLOAT values stored contiguously at x.
 *
 * n must be a positive multiple of 32: the loop count is n >> 5, every
 * iteration consumes 256 bytes, and the loop body runs once before the
 * count is tested.  The double-precision vector adds and the 8-byte store
 * assume FLOAT is an 8-byte double.
 *
 * Returns the accumulated sum.
 */
static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__(/* v24..v31 are eight independent partial-sum accumulators */
          "vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* loop count = n / 32; r1 is the running byte offset into x */
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first 128 bytes of the 256-byte chunk */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* second 128 bytes of the chunk */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* fold the eight accumulators into v24 */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          /* add doubleword 1 onto doubleword 0 */
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          /* the operand name must match the [sum] output declared below */
          "vsteg %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
97+
98+
/*
 * Plain sum of n elements of x, taking one element every inc_x positions.
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the largest
 * multiple-of-32 prefix is handed to dsum_kernel_32 and the remainder is
 * summed in scalar code.  For any other stride the elements are summed four
 * at a time into two alternating accumulators, followed by a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG idx = 0; /* index into x */

  if (n <= 0 || inc_x <= 0)
    return total;

  if (inc_x == 1) {
    BLASLONG blocked = n & -32;

    if (blocked > 0) {
      total = dsum_kernel_32(blocked, x);
      idx = blocked;
    }

    /* scalar remainder after the vectorized prefix */
    for (; idx < n; idx++)
      total += x[idx];

    return total;
  }

  {
    BLASLONG quads = n & -4; /* element count covered by the 4-way loop */
    BLASLONG count = 0;      /* elements consumed so far */
    FLOAT even = 0.0;
    FLOAT odd = 0.0;

    for (; count < quads; count += 4, idx += inc_x * 4) {
      even += x[idx];
      odd += x[idx + inc_x];
      even += x[idx + 2 * inc_x];
      odd += x[idx + 3 * inc_x];
    }

    total = even + odd;

    /* up to three leftover elements */
    for (; count < n; count++, idx += inc_x)
      total += x[idx];
  }

  return total;
}

0 commit comments

Comments
 (0)