Skip to content

Commit 09d47af

Browse files
author
Rajalakshmi Srinivasaraghavan
committed
Optimize zscal function for POWER10
This patch makes use of new POWER10 vector pair instructions for loads and stores.
1 parent ef0238b commit 09d47af

File tree

2 files changed

+196
-1
lines changed

2 files changed

+196
-1
lines changed

kernel/power/zscal.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4545
#endif
4646
#elif defined(POWER10)
4747
#if defined(DOUBLE)
48-
#include "zscal_microk_power8.c"
48+
#include "zscal_microk_power10.c"
4949
#else
5050
#include "cscal_microk_power10.c"
5151
#endif

kernel/power/zscal_microk_power10.c

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
/***************************************************************************
2+
Copyright (c) 2021, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL_8 1
29+
30+
/*
 * zscal_kernel_8 - scale a vector of double-precision complex numbers in
 * place by the complex scalar alpha = (alpha_r, alpha_i), using POWER10
 * vector-pair loads (lxvp) in a single inline-assembly statement.
 *
 * n        element counter; it is decremented by 8 per pass.
 * x        data to scale; read and overwritten in 128-byte chunks
 *          (eight 16-byte vectors, each holding a real/imaginary pair
 *          according to the inline comments below).
 * alpha_r  real part of the scale factor.
 * alpha_i  imaginary part of the scale factor.
 *
 * For every 16-byte vector v the kernel stores
 *     v * {alpha_r, alpha_r} + swap(v) * {-alpha_i, alpha_i}
 * where swap() exchanges the two doublewords (XXSWAPD_S).
 *
 * The first 128 bytes are loaded unconditionally and the tail block at
 * label "two" always executes, and there is no remainder handling here,
 * so at least 8 elements are always read and written.
 * NOTE(review): presumably the caller guarantees n is a positive multiple
 * of 8 - confirm against the call site.
 */
static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
31+
{
32+
// Scratch vectors; the compiler picks the VSX registers for them ("=wa"
// outputs below) and the asm uses them as %x3..%x8 for swapped inputs.
__vector double t0;
33+
__vector double t1;
34+
__vector double t2;
35+
__vector double t3;
36+
__vector double t4;
37+
__vector double t5;
38+
39+
__asm__
40+
(
41+
// Cache-touch hint for the start of x.
"dcbt 0, %2 \n\t"
42+
43+
// Build the two constant multiplier vectors: vs32 (alpha_r in both
// lanes) and vs33 (alpha_i paired with its negation).
"xsnegdp 33, %x10 \n\t" // -alpha_i
44+
XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r
45+
XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i
46+
47+
// Preload the first 128 bytes of x into vs40..vs47 (four register pairs).
"lxvp 40, 0(%2) \n\t"
48+
"lxvp 42, 32(%2) \n\t"
49+
"lxvp 44, 64(%2) \n\t"
50+
"lxvp 46, 96(%2) \n\t"
51+
52+
// n -= 8; if nothing remains beyond the preloaded chunk, skip the loop
// and go straight to the tail block.
"addic. %1, %1, -8 \n\t"
53+
"ble two%= \n\t"
54+
55+
".align 5 \n"
56+
"one%=: \n\t"
57+
58+
// Main loop body: multiply the current chunk by alpha_r.
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
59+
"xvmuldp 49, 41, 32 \n\t"
60+
"xvmuldp 50, 42, 32 \n\t"
61+
"xvmuldp 51, 43, 32 \n\t"
62+
"xvmuldp 34, 44, 32 \n\t"
63+
"xvmuldp 35, 45, 32 \n\t"
64+
"xvmuldp 36, 46, 32 \n\t"
65+
"xvmuldp 37, 47, 32 \n\t"
66+
67+
// Doubleword-swapped copies of the inputs (into vs38, vs39 and t0..t5).
XXSWAPD_S(38,40)
68+
XXSWAPD_S(39,41)
69+
XXSWAPD_S(%x3,42)
70+
XXSWAPD_S(%x4,43)
71+
XXSWAPD_S(%x5,44)
72+
XXSWAPD_S(%x6,45)
73+
XXSWAPD_S(%x7,46)
74+
XXSWAPD_S(%x8,47)
75+
76+
"xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
77+
"xvmuldp 39, 39, 33 \n\t"
78+
79+
80+
"xvmuldp %x3, %x3, 33 \n\t"
81+
"xvmuldp %x4, %x4, 33 \n\t"
82+
83+
84+
// The inputs in vs40..vs47 have been consumed (multiplied and swapped),
// so the next 128-byte chunk is loaded here, interleaved with the
// remaining arithmetic and ahead of this chunk's stores.
"lxvp 40, 128(%2) \n\t"
85+
"lxvp 42, 160(%2) \n\t"
86+
"xvmuldp %x5, %x5, 33 \n\t"
87+
"xvmuldp %x6, %x6, 33 \n\t"
88+
89+
90+
"xvmuldp %x7, %x7, 33 \n\t"
91+
"xvmuldp %x8, %x8, 33 \n\t"
92+
"lxvp 44, 192(%2) \n\t"
93+
"lxvp 46, 224(%2) \n\t"
94+
95+
96+
// Sum the two partial products and store the results back over x.
"xvadddp 48, 48, 38 \n\t"
97+
"xvadddp 49, 49, 39 \n\t"
98+
"xvadddp 50, 50, %x3 \n\t"
99+
"xvadddp 51, 51, %x4 \n\t"
100+
// Within each pair the odd-numbered register is stored at the lower
// address.
// NOTE(review): this matches lxvp register-pair ordering on little-endian
// targets; verify the store order for big-endian builds.
"stxv 49, 0(%2) \n\t"
101+
"stxv 48, 16(%2) \n\t"
102+
"stxv 51, 32(%2) \n\t"
103+
"stxv 50, 48(%2) \n\t"
104+
105+
106+
"xvadddp 34, 34, %x5 \n\t"
107+
"xvadddp 35, 35, %x6 \n\t"
108+
109+
110+
"xvadddp 36, 36, %x7 \n\t"
111+
"xvadddp 37, 37, %x8 \n\t"
112+
113+
"stxv 35, 64(%2) \n\t"
114+
"stxv 34, 80(%2) \n\t"
115+
"stxv 37, 96(%2) \n\t"
116+
"stxv 36, 112(%2) \n\t"
117+
118+
// Advance x by one 128-byte chunk.
"addi %2, %2, 128 \n\t"
119+
120+
// n -= 8; keep looping while more than one chunk remains.
"addic. %1, %1, -8 \n\t"
121+
"bgt one%= \n"
122+
123+
// Tail block: same arithmetic as the loop body for the chunk already
// held in vs40..vs47, but without loading any further data.
"two%=: \n\t"
124+
125+
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
126+
"xvmuldp 49, 41, 32 \n\t"
127+
"xvmuldp 50, 42, 32 \n\t"
128+
"xvmuldp 51, 43, 32 \n\t"
129+
"xvmuldp 34, 44, 32 \n\t"
130+
"xvmuldp 35, 45, 32 \n\t"
131+
"xvmuldp 36, 46, 32 \n\t"
132+
"xvmuldp 37, 47, 32 \n\t"
133+
134+
XXSWAPD_S(38,40)
135+
XXSWAPD_S(39,41)
136+
XXSWAPD_S(%x3,42)
137+
XXSWAPD_S(%x4,43)
138+
XXSWAPD_S(%x5,44)
139+
XXSWAPD_S(%x6,45)
140+
XXSWAPD_S(%x7,46)
141+
XXSWAPD_S(%x8,47)
142+
143+
144+
"xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
145+
"xvmuldp 39, 39, 33 \n\t"
146+
"xvmuldp %x3, %x3, 33 \n\t"
147+
"xvmuldp %x4, %x4, 33 \n\t"
148+
"xvmuldp %x5, %x5, 33 \n\t"
149+
"xvmuldp %x6, %x6, 33 \n\t"
150+
"xvmuldp %x7, %x7, 33 \n\t"
151+
"xvmuldp %x8, %x8, 33 \n\t"
152+
153+
"xvadddp 48, 48, 38 \n\t"
154+
"xvadddp 49, 49, 39 \n\t"
155+
156+
"xvadddp 50, 50, %x3 \n\t"
157+
"xvadddp 51, 51, %x4 \n\t"
158+
"stxv 49, 0(%2) \n\t"
159+
"stxv 48, 16(%2) \n\t"
160+
"stxv 51, 32(%2) \n\t"
161+
"stxv 50, 48(%2) \n\t"
162+
163+
"xvadddp 34, 34, %x5 \n\t"
164+
"xvadddp 35, 35, %x6 \n\t"
165+
166+
167+
"xvadddp 36, 36, %x7 \n\t"
168+
"xvadddp 37, 37, %x8 \n\t"
169+
170+
"stxv 35, 64(%2) \n\t"
171+
"stxv 34, 80(%2) \n\t"
172+
"stxv 37, 96(%2) \n\t"
173+
"stxv 36, 112(%2) \n\t"
174+
175+
// Annotation line written into the generated assembly showing how the
// operands were assigned (presumably treated as a comment by the
// assembler because of the leading '#').
"#n=%1 x=%0=%2 alpha=(%9,%10) \n"
176+
:
177+
// Outputs. Operand 0 tells the compiler that memory at *x is read and
// written.
// NOTE(review): *x names a single double while the asm touches 128 bytes
// per pass; confirm the compiler cannot cache other elements of x across
// this statement (there is no "memory" clobber).
"+m" (*x),
178+
"+r" (n), // 1
179+
"+b" (x), // 2
180+
"=wa" (t0), // 3
181+
"=wa" (t1), // 4
182+
"=wa" (t2), // 5
183+
"=wa" (t3), // 6
184+
"=wa" (t4), // 7
185+
"=wa" (t5) // 8
186+
:
187+
// Inputs: the two halves of the complex scale factor.
"d" (alpha_r), // 9
188+
"d" (alpha_i) // 10
189+
:
190+
// Clobbers: cr0 is set by the record-form addic.; vs32..vs51 are the
// hard-coded VSX registers used as constants, inputs and results above.
"cr0",
191+
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
192+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
193+
"vs48","vs49","vs50","vs51"
194+
);
195+
}

0 commit comments

Comments
 (0)