Skip to content

Commit b1e0bcc

Browse files
authored
Merge pull request #2844 from RajalakshmiSR/daxpy_p10
Optimize daxpy/zaxpy for POWER10
2 parents 2855e60 + be43d2c commit b1e0bcc

File tree

5 files changed

+580
-2
lines changed

5 files changed

+580
-2
lines changed

kernel/power/KERNEL.POWER10

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,13 @@ CASUMKERNEL = casum.c
142142
ZASUMKERNEL = zasum.c
143143
#
144144
SAXPYKERNEL = saxpy.c
145-
DAXPYKERNEL = daxpy.c
145+
DAXPYKERNEL = daxpy_power10.c
146146
ifneq ($(GCCVERSIONGTEQ9),1)
147147
CAXPYKERNEL = caxpy_power9.S
148148
else
149149
CAXPYKERNEL = caxpy.c
150150
endif
151-
ZAXPYKERNEL = zaxpy.c
151+
ZAXPYKERNEL = zaxpy_power10.c
152152
#
153153
SCOPYKERNEL = scopy.c
154154
DCOPYKERNEL = dcopy.c

kernel/power/daxpy_microk_power10.c

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
/***************************************************************************
2+
Copyright (c) 2020, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL_8 1
29+
30+
static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
31+
{
32+
__vector double t0;
33+
34+
__asm__
35+
(
36+
XXSPLTD_S(%x4,%x6,0)
37+
38+
"dcbt 0, %2 \n\t"
39+
"dcbt 0, %3 \n\t"
40+
41+
"lxvp 32, 0(%2) \n\t"
42+
"lxvp 34, 32(%2) \n\t"
43+
"lxvp 40, 64(%2) \n\t"
44+
"lxvp 42, 96(%2) \n\t"
45+
46+
"lxvp 36, 0(%3) \n\t"
47+
"lxvp 38, 32(%3) \n\t"
48+
"lxvp 44, 64(%3) \n\t"
49+
"lxvp 46, 96(%3) \n\t"
50+
51+
"addi %2, %2, 128 \n\t"
52+
53+
"addic. %1, %1, -16 \n\t"
54+
"ble two%= \n\t"
55+
56+
".align 5 \n"
57+
"one%=: \n\t"
58+
59+
"xvmaddadp 36, 32, %x4 \n\t"
60+
"xvmaddadp 37, 33, %x4 \n\t"
61+
62+
"lxvp 32, 0(%2) \n\t"
63+
"stxvp 36, 0(%3) \n\t"
64+
65+
"xvmaddadp 38, 34, %x4 \n\t"
66+
"xvmaddadp 39, 35, %x4 \n\t"
67+
68+
"lxvp 34, 32(%2) \n\t"
69+
"stxvp 38, 32(%3) \n\t"
70+
71+
72+
"lxvp 36, 128(%3) \n\t"
73+
"lxvp 38, 160(%3) \n\t"
74+
75+
"xvmaddadp 44, 40, %x4 \n\t"
76+
"xvmaddadp 45, 41, %x4 \n\t"
77+
78+
"lxvp 40, 64(%2) \n\t"
79+
"stxvp 44, 64(%3) \n\t"
80+
81+
"xvmaddadp 46, 42, %x4 \n\t"
82+
"xvmaddadp 47, 43, %x4 \n\t"
83+
84+
"lxvp 42, 96(%2) \n\t"
85+
"stxvp 46, 96(%3) \n\t"
86+
87+
"addi %2, %2, 128 \n\t"
88+
"addi %3, %3, 128 \n\t"
89+
90+
"lxvp 44, 64(%3) \n\t"
91+
"lxvp 46, 96(%3) \n\t"
92+
93+
"addic. %1, %1, -16 \n\t"
94+
"bgt one%= \n"
95+
96+
"two%=: \n\t"
97+
98+
"xvmaddadp 36, 32, %x4 \n\t"
99+
"xvmaddadp 37, 33, %x4 \n\t"
100+
"xvmaddadp 38, 34, %x4 \n\t"
101+
"xvmaddadp 39, 35, %x4 \n\t"
102+
103+
"xvmaddadp 44, 40, %x4 \n\t"
104+
"xvmaddadp 45, 41, %x4 \n\t"
105+
"xvmaddadp 46, 42, %x4 \n\t"
106+
"xvmaddadp 47, 43, %x4 \n\t"
107+
108+
"stxvp 36, 0(%3) \n\t"
109+
"stxvp 38, 32(%3) \n\t"
110+
"stxvp 44, 64(%3) \n\t"
111+
"stxvp 46, 96(%3) \n\t"
112+
113+
"#n=%1 x=%5=%2 y=%0=%3 alpha=%6 t0=%x4\n"
114+
:
115+
"+m" (*y),
116+
"+r" (n), // 1
117+
"+b" (x), // 2
118+
"+b" (y), // 3
119+
"=wa" (t0) // 4
120+
:
121+
"m" (*x),
122+
"d" (alpha) // 6
123+
:
124+
"cr0",
125+
"vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39",
126+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
127+
);
128+
129+
}
130+
131+

kernel/power/daxpy_power10.c

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
/***************************************************************************
2+
Copyright (c) 2020, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#include "common.h"
29+
30+
#if defined(__VEC__) || defined(__ALTIVEC__)
31+
#include "daxpy_microk_power10.c"
32+
#endif
33+
34+
35+
#ifndef HAVE_KERNEL_8
36+
37+
/*
 * Portable fallback for daxpy_kernel_8, compiled only when no optimized
 * kernel has defined HAVE_KERNEL_8.
 *
 * Performs y[i] += alpha * x[i] in groups of eight consecutive elements,
 * starting at index 0 and continuing while the group's first index is below
 * n. There is no tail handling: if n is not a multiple of 8 the last group
 * still touches all eight of its elements.
 */
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
	BLASLONG base;

	for (base = 0; base < n; base += 8) {
		FLOAT *src = x + base;
		FLOAT *dst = y + base;

		dst[0] += alpha * src[0];
		dst[1] += alpha * src[1];
		dst[2] += alpha * src[2];
		dst[3] += alpha * src[3];
		dst[4] += alpha * src[4];
		dst[5] += alpha * src[5];
		dst[6] += alpha * src[6];
		dst[7] += alpha * src[7];
	}
}
56+
57+
#endif
58+
59+
/*
 * daxpy driver: y := da * x + y over n elements.
 *
 *   n            - element count; nothing is done when n <= 0.
 *   da           - scalar multiplier.
 *   x, inc_x     - source vector and its element stride.
 *   y, inc_y     - destination vector (updated in place) and its stride.
 *   dummy0, dummy1, dummy, dummy2 - not used by this function.
 *
 * Always returns 0.
 *
 * When both strides are 1, the largest multiple-of-16 prefix is handed to
 * daxpy_kernel_8 and the remaining (at most 15) elements are processed by a
 * scalar loop. Any other stride combination uses a 4-way unrolled scalar
 * loop followed by a scalar remainder loop.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	if (n <= 0) {
		return 0;
	}

	if (inc_x == 1 && inc_y == 1) {
		/* Contiguous vectors: bulk in the kernel, tail in plain C. */
		const BLASLONG bulk = n & -16;
		BLASLONG k;

		if (bulk != 0) {
			daxpy_kernel_8(bulk, x, y, da);
		}

		for (k = bulk; k < n; k++) {
			y[k] += da * x[k];
		}
		return 0;
	}

	/* Strided vectors. */
	const BLASLONG unrolled = n & -4;
	BLASLONG done = 0;
	BLASLONG xpos = 0;
	BLASLONG ypos = 0;

	for (; done < unrolled; done += 4) {
		/* All four products are formed before any element of y is
		   written, exactly as in a straight four-element unroll. */
		const FLOAT p0 = da * x[xpos];
		const FLOAT p1 = da * x[xpos + inc_x];
		const FLOAT p2 = da * x[xpos + 2 * inc_x];
		const FLOAT p3 = da * x[xpos + 3 * inc_x];

		y[ypos] += p0;
		y[ypos + inc_y] += p1;
		y[ypos + 2 * inc_y] += p2;
		y[ypos + 3 * inc_y] += p3;

		xpos += inc_x * 4;
		ypos += inc_y * 4;
	}

	for (; done < n; done++) {
		y[ypos] += da * x[xpos];
		xpos += inc_x;
		ypos += inc_y;
	}

	return 0;
}
120+
121+

0 commit comments

Comments
 (0)