Skip to content

Commit 1d254d3

Browse files
authored
Merge pull request #3129 from RajalakshmiSR/asum_p10
Optimize s/dasum function for POWER10
2 parents 3679781 + 41646ed commit 1d254d3

File tree

4 files changed

+343
-2
lines changed

4 files changed

+343
-2
lines changed

kernel/power/dasum.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4646

4747
#endif
4848

49-
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
5049
#if defined(__VEC__) || defined(__ALTIVEC__)
50+
#if defined(POWER8) || defined(POWER9)
5151
#include "dasum_microk_power8.c"
52+
#elif defined(POWER10)
53+
#include "dasum_microk_power10.c"
5254
#endif
5355
#endif
5456

@@ -110,13 +112,29 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
110112
if ( inc_x == 1 )
111113
{
112114

115+
#if defined(POWER10)
116+
if ( n >= 16 )
117+
{
118+
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
119+
for (i = 0; i < align; i++) {
120+
sumf += ABS(x[i]);
121+
}
122+
}
123+
n1 = (n-i) & -16;
124+
if ( n1 > 0 )
125+
{
126+
sumf += dasum_kernel_16(n1, &x[i]);
127+
i+=n1;
128+
}
129+
#else
113130
n1 = n & -16;
114131
if ( n1 > 0 )
115132
{
116133

117134
sumf = dasum_kernel_16(n1, x);
118135
i=n1;
119136
}
137+
#endif
120138

121139
while(i < n)
122140
{

kernel/power/dasum_microk_power10.c

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/***************************************************************************
2+
Copyright (c) 2021, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL_16 1
29+
30+
static double dasum_kernel_16 (long n, double *x)
31+
{
32+
double sum;
33+
__vector double t0;
34+
__vector double t1;
35+
__vector double t2;
36+
__vector double t3;
37+
38+
__asm__
39+
(
40+
"dcbt 0, %2 \n\t"
41+
42+
"xxlxor 32, 32, 32 \n\t"
43+
"xxlxor 33, 33, 33 \n\t"
44+
"xxlxor 34, 34, 34 \n\t"
45+
"xxlxor 35, 35, 35 \n\t"
46+
"xxlxor 36, 36, 36 \n\t"
47+
"xxlxor 37, 37, 37 \n\t"
48+
"xxlxor 38, 38, 38 \n\t"
49+
"xxlxor 39, 39, 39 \n\t"
50+
51+
"lxvp 40, 0(%2) \n\t"
52+
"lxvp 42, 32(%2) \n\t"
53+
"lxvp 44, 64(%2) \n\t"
54+
"lxvp 46, 96(%2) \n\t"
55+
56+
"addi %2, %2, 128 \n\t"
57+
58+
"addic. %1, %1, -16 \n\t"
59+
"ble two%= \n\t"
60+
61+
".align 5 \n"
62+
"one%=: \n\t"
63+
64+
"xvabsdp 48, 40 \n\t"
65+
"xvabsdp 49, 41 \n\t"
66+
"xvabsdp 50, 42 \n\t"
67+
"xvabsdp 51, 43 \n\t"
68+
"lxvp 40, 0(%2) \n\t"
69+
70+
71+
"xvabsdp %x3, 44 \n\t"
72+
"xvabsdp %x4, 45 \n\t"
73+
"lxvp 42, 32(%2) \n\t"
74+
75+
76+
"xvabsdp %x5, 46 \n\t"
77+
"xvabsdp %x6, 47 \n\t"
78+
"lxvp 44, 64(%2) \n\t"
79+
80+
81+
"xvadddp 32, 32, 48 \n\t"
82+
"xvadddp 33, 33, 49 \n\t"
83+
84+
"lxvp 46, 96(%2) \n\t"
85+
86+
"xvadddp 34, 34, 50 \n\t"
87+
"xvadddp 35, 35, 51 \n\t"
88+
"addi %2, %2, 128 \n\t"
89+
"xvadddp 36, 36, %x3 \n\t"
90+
"xvadddp 37, 37, %x4 \n\t"
91+
"addic. %1, %1, -16 \n\t"
92+
"xvadddp 38, 38, %x5 \n\t"
93+
"xvadddp 39, 39, %x6 \n\t"
94+
95+
"bgt one%= \n"
96+
97+
"two%=: \n\t"
98+
99+
"xvabsdp 48, 40 \n\t"
100+
"xvabsdp 49, 41 \n\t"
101+
"xvabsdp 50, 42 \n\t"
102+
"xvabsdp 51, 43 \n\t"
103+
"xvabsdp %x3, 44 \n\t"
104+
"xvabsdp %x4, 45 \n\t"
105+
"xvabsdp %x5, 46 \n\t"
106+
"xvabsdp %x6, 47 \n\t"
107+
108+
"xvadddp 32, 32, 48 \n\t"
109+
"xvadddp 33, 33, 49 \n\t"
110+
"xvadddp 34, 34, 50 \n\t"
111+
"xvadddp 35, 35, 51 \n\t"
112+
"xvadddp 36, 36, %x3 \n\t"
113+
"xvadddp 37, 37, %x4 \n\t"
114+
"xvadddp 38, 38, %x5 \n\t"
115+
"xvadddp 39, 39, %x6 \n\t"
116+
117+
"xvadddp 32, 32, 33 \n\t"
118+
"xvadddp 34, 34, 35 \n\t"
119+
"xvadddp 36, 36, 37 \n\t"
120+
"xvadddp 38, 38, 39 \n\t"
121+
122+
"xvadddp 32, 32, 34 \n\t"
123+
"xvadddp 36, 36, 38 \n\t"
124+
125+
"xvadddp 32, 32, 36 \n\t"
126+
127+
XXSWAPD_S(33,32)
128+
"xsadddp %x0, 32, 33 \n"
129+
130+
"#n=%1 x=%3=%2 sum=%0\n"
131+
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
132+
:
133+
"=d" (sum), // 0
134+
"+r" (n), // 1
135+
"+b" (x), // 2
136+
"=wa" (t0), // 3
137+
"=wa" (t1), // 4
138+
"=wa" (t2), // 5
139+
"=wa" (t3) // 6
140+
:
141+
"m" (*x)
142+
:
143+
"cr0",
144+
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
145+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
146+
"vs48","vs49","vs50","vs51"
147+
);
148+
149+
return sum;
150+
}
151+
152+

kernel/power/sasum.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4646

4747
#endif
4848

49-
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
5049
#if defined(__VEC__) || defined(__ALTIVEC__)
50+
#if defined(POWER8) || defined(POWER9)
5151
#include "sasum_microk_power8.c"
52+
#elif defined(POWER10)
53+
#include "sasum_microk_power10.c"
5254
#endif
5355
#endif
5456

@@ -110,13 +112,29 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
110112
if ( inc_x == 1 )
111113
{
112114

115+
#if defined(POWER10)
116+
if ( n >= 32 )
117+
{
118+
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
119+
for (i = 0; i < align; i++) {
120+
sumf += ABS(x[i]);
121+
}
122+
}
123+
n1 = (n-i) & -32;
124+
if ( n1 > 0 )
125+
{
126+
sumf += sasum_kernel_32(n1, &x[i]);
127+
i+=n1;
128+
}
129+
#else
113130
n1 = n & -32;
114131
if ( n1 > 0 )
115132
{
116133

117134
sumf = sasum_kernel_32(n1, x);
118135
i=n1;
119136
}
137+
#endif
120138

121139
while(i < n)
122140
{

kernel/power/sasum_microk_power10.c

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
/***************************************************************************
2+
Copyright (c) 2021, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
29+
#define HAVE_KERNEL_32 1
30+
31+
static float sasum_kernel_32 (long n, float *x)
32+
{
33+
float sum;
34+
__vector float t0;
35+
__vector float t1;
36+
__vector float t2;
37+
__vector float t3;
38+
39+
__asm__
40+
(
41+
"dcbt 0, %2 \n\t"
42+
43+
"xxlxor 32, 32, 32 \n\t"
44+
"xxlxor 33, 33, 33 \n\t"
45+
"xxlxor 34, 34, 34 \n\t"
46+
"xxlxor 35, 35, 35 \n\t"
47+
"xxlxor 36, 36, 36 \n\t"
48+
"xxlxor 37, 37, 37 \n\t"
49+
"xxlxor 38, 38, 38 \n\t"
50+
"xxlxor 39, 39, 39 \n\t"
51+
52+
"lxvp 40, 0(%2) \n\t"
53+
"lxvp 42, 32(%2) \n\t"
54+
"lxvp 44, 64(%2) \n\t"
55+
"lxvp 46, 96(%2) \n\t"
56+
57+
"addi %2, %2, 128 \n\t"
58+
59+
"addic. %1, %1, -32 \n\t"
60+
"ble two%= \n\t"
61+
62+
".align 5 \n"
63+
"one%=: \n\t"
64+
65+
"xvabssp 48, 40 \n\t"
66+
"xvabssp 49, 41 \n\t"
67+
"xvabssp 50, 42 \n\t"
68+
"xvabssp 51, 43 \n\t"
69+
"lxvp 40, 0(%2) \n\t"
70+
71+
"xvabssp %x3, 44 \n\t"
72+
"xvabssp %x4, 45 \n\t"
73+
"lxvp 42, 32(%2) \n\t"
74+
75+
"xvabssp %x5, 46 \n\t"
76+
"xvabssp %x6, 47 \n\t"
77+
"lxvp 44, 64(%2) \n\t"
78+
79+
"xvaddsp 32, 32, 48 \n\t"
80+
"xvaddsp 33, 33, 49 \n\t"
81+
82+
"lxvp 46, 96(%2) \n\t"
83+
84+
"xvaddsp 34, 34, 50 \n\t"
85+
"xvaddsp 35, 35, 51 \n\t"
86+
"addi %2, %2, 128 \n\t"
87+
"xvaddsp 36, 36, %x3 \n\t"
88+
"xvaddsp 37, 37, %x4 \n\t"
89+
"addic. %1, %1, -32 \n\t"
90+
"xvaddsp 38, 38, %x5 \n\t"
91+
"xvaddsp 39, 39, %x6 \n\t"
92+
93+
"bgt one%= \n"
94+
95+
"two%=: \n\t"
96+
97+
"xvabssp 48, 40 \n\t"
98+
"xvabssp 49, 41 \n\t"
99+
"xvabssp 50, 42 \n\t"
100+
"xvabssp 51, 43 \n\t"
101+
"xvabssp %x3, 44 \n\t"
102+
"xvabssp %x4, 45 \n\t"
103+
"xvabssp %x5, 46 \n\t"
104+
"xvabssp %x6, 47 \n\t"
105+
106+
"xvaddsp 32, 32, 48 \n\t"
107+
"xvaddsp 33, 33, 49 \n\t"
108+
"xvaddsp 34, 34, 50 \n\t"
109+
"xvaddsp 35, 35, 51 \n\t"
110+
"xvaddsp 36, 36, %x3 \n\t"
111+
"xvaddsp 37, 37, %x4 \n\t"
112+
"xvaddsp 38, 38, %x5 \n\t"
113+
"xvaddsp 39, 39, %x6 \n\t"
114+
115+
"xvaddsp 32, 32, 33 \n\t"
116+
"xvaddsp 34, 34, 35 \n\t"
117+
"xvaddsp 36, 36, 37 \n\t"
118+
"xvaddsp 38, 38, 39 \n\t"
119+
120+
"xvaddsp 32, 32, 34 \n\t"
121+
"xvaddsp 36, 36, 38 \n\t"
122+
123+
"xvaddsp 32, 32, 36 \n\t"
124+
125+
"xxsldwi 33, 32, 32, 2 \n\t"
126+
"xvaddsp 32, 32, 33 \n\t"
127+
128+
"xxsldwi 33, 32, 32, 1 \n\t"
129+
"xvaddsp 32, 32, 33 \n\t"
130+
131+
"xscvspdp %x0, 32 \n"
132+
133+
"#n=%1 x=%3=%2 sum=%0\n"
134+
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
135+
:
136+
"=f" (sum), // 0
137+
"+r" (n), // 1
138+
"+b" (x), // 2
139+
"=wa" (t0), // 3
140+
"=wa" (t1), // 4
141+
"=wa" (t2), // 5
142+
"=wa" (t3) // 6
143+
:
144+
"m" (*x)
145+
:
146+
"cr0",
147+
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
148+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
149+
"vs48","vs49","vs50","vs51"
150+
);
151+
152+
return sum;
153+
}

0 commit comments

Comments
 (0)