Skip to content

Commit 55a0718

Browse files
authored
Merge pull request #4369 from ChipKerchner/power10Copies
Replace two vector loads with one vector pair load.
2 parents d9f1478 + 93747fb commit 55a0718

File tree

3 files changed

+332
-0
lines changed

3 files changed

+332
-0
lines changed

kernel/power/sgemm_tcopy_16_power8.S

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
108108

109109
#define o0 0
110110

111+
#ifdef POWER10
112+
#include "sgemm_tcopy_macros_16_power10.S"
113+
#endif
111114
#include "sgemm_tcopy_macros_16_power8.S"
112115

113116
#define STACKSIZE 144
kernel/power/sgemm_tcopy_macros_16_power10.S

Lines changed: 323 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
/***************************************************************************
2+
Copyright (c) 2013-2016, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
/**************************************************************************************
29+
* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
30+
* BLASTEST : OK
31+
* CTEST : OK
32+
* TEST : OK
33+
* LAPACK-TEST : OK
34+
**************************************************************************************/
35+
36+
37+
/**********************************************************************************************
38+
* Macros for N=4 and M=16
39+
**********************************************************************************************/
40+
41+
/*
 * COPY_4x16
 *
 * Copies 64 bytes (16 four-byte elements) from each of the four source
 * pointers A0, A1, A2, A3 into one contiguous 256-byte area that starts
 * at BO. The rows land back to back: A0 at BO+0, A1 at BO+64, A2 at
 * BO+128 and A3 at BO+192.
 *
 * Every lxvpx fills an even/odd VSX register pair (32 bytes), so each row
 * needs two pair loads. The stores are still single 16-byte stxvx stores.
 * On big-endian the even register of a pair is stored first; on
 * little-endian the odd register is stored first, because lxvpx places the
 * lower-addressed 16 bytes in the odd register in little-endian mode.
 *
 * Inputs   : A0..A3 (source pointers), BO (destination pointer).
 * Clobbers : vs32-vs47, T1. BO and A0..A3 are not modified here.
 * NOTE(review): o16/o32/o48 are assumed to be registers preloaded with the
 * byte offsets 16/32/48 by the including kernel; only o0 (defined as 0) is
 * visible in this change - TODO confirm.
 */
#if defined(_AIX)
42+
define(`COPY_4x16', `
43+
#else
44+
.macro COPY_4x16
45+
#endif
46+
/* Load all four rows first: two 32-byte pair loads per row. */
47+
lxvpx vs32, o0, A0
48+
lxvpx vs34, o32, A0
49+
50+
lxvpx vs36, o0, A1
51+
lxvpx vs38, o32, A1
52+
53+
lxvpx vs40, o0, A2
54+
lxvpx vs42, o32, A2
55+
56+
lxvpx vs44, o0, A3
57+
lxvpx vs46, o32, A3
58+
/* Work on a copy of the destination pointer so BO stays untouched. */
59+
mr T1, BO
60+
/* Row A0 (vs32-vs35) goes to T1+0..63; pair halves swap on little-endian. */
61+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
62+
stxvx vs32, o0, T1
63+
stxvx vs33, o16, T1
64+
stxvx vs34, o32, T1
65+
stxvx vs35, o48, T1
66+
#else
67+
stxvx vs33, o0, T1
68+
stxvx vs32, o16, T1
69+
stxvx vs35, o32, T1
70+
stxvx vs34, o48, T1
71+
#endif
72+
/* Advance to the next 64-byte destination slot: row A1 (vs36-vs39). */
73+
addi T1, T1, 64
74+
75+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
76+
stxvx vs36, o0, T1
77+
stxvx vs37, o16, T1
78+
stxvx vs38, o32, T1
79+
stxvx vs39, o48, T1
80+
#else
81+
stxvx vs37, o0, T1
82+
stxvx vs36, o16, T1
83+
stxvx vs39, o32, T1
84+
stxvx vs38, o48, T1
85+
#endif
86+
/* Next slot: row A2 (vs40-vs43). */
87+
addi T1, T1, 64
88+
89+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
90+
stxvx vs40, o0, T1
91+
stxvx vs41, o16, T1
92+
stxvx vs42, o32, T1
93+
stxvx vs43, o48, T1
94+
#else
95+
stxvx vs41, o0, T1
96+
stxvx vs40, o16, T1
97+
stxvx vs43, o32, T1
98+
stxvx vs42, o48, T1
99+
#endif
100+
/* Last slot: row A3 (vs44-vs47). */
101+
addi T1, T1, 64
102+
103+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
104+
stxvx vs44, o0, T1
105+
stxvx vs45, o16, T1
106+
stxvx vs46, o32, T1
107+
stxvx vs47, o48, T1
108+
#else
109+
stxvx vs45, o0, T1
110+
stxvx vs44, o16, T1
111+
stxvx vs47, o32, T1
112+
stxvx vs46, o48, T1
113+
#endif
114+
115+
#if defined(_AIX)
116+
')
117+
#else
118+
.endm
119+
#endif
120+
121+
/**********************************************************************************************
122+
* Macros for N=4 and M=8
123+
**********************************************************************************************/
124+
125+
/*
 * COPY_4x8
 *
 * Copies 32 bytes (8 four-byte elements) from each of the four source
 * pointers A0, A1, A2, A3 into one contiguous 128-byte area that starts
 * at BO. The rows land back to back: A0 at BO+0, A1 at BO+32, A2 at BO+64
 * and A3 at BO+96.
 *
 * One lxvpx per row fills an even/odd VSX register pair (32 bytes). The
 * pair is written back with two 16-byte stxvx stores. On big-endian the
 * even register is stored first; on little-endian the odd register is
 * stored first, because lxvpx places the lower-addressed 16 bytes in the
 * odd register in little-endian mode.
 *
 * Inputs   : A0..A3 (source pointers), BO (destination pointer).
 * Clobbers : vs32-vs39, T1. BO and A0..A3 are not modified here.
 * NOTE(review): o16/o32/o48 are assumed to be registers preloaded with the
 * byte offsets 16/32/48 by the including kernel; only o0 (defined as 0) is
 * visible in this change - TODO confirm.
 */
#if defined(_AIX)
126+
define(`COPY_4x8', `
127+
#else
128+
.macro COPY_4x8
129+
#endif
130+
/* One 32-byte pair load per source row. */
131+
lxvpx vs32, o0, A0
132+
133+
lxvpx vs34, o0, A1
134+
135+
lxvpx vs36, o0, A2
136+
137+
lxvpx vs38, o0, A3
138+
/* Work on a copy of the destination pointer so BO stays untouched. */
139+
mr T1, BO
140+
/* First 64 bytes: row A0 (vs32/vs33) followed by row A1 (vs34/vs35). */
141+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
142+
stxvx vs32, o0, T1
143+
stxvx vs33, o16, T1
144+
145+
stxvx vs34, o32, T1
146+
stxvx vs35, o48, T1
147+
#else
148+
stxvx vs33, o0, T1
149+
stxvx vs32, o16, T1
150+
151+
stxvx vs35, o32, T1
152+
stxvx vs34, o48, T1
153+
#endif
154+
/* Second 64 bytes: row A2 (vs36/vs37) followed by row A3 (vs38/vs39). */
155+
addi T1, T1, 64
156+
157+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
158+
stxvx vs36, o0, T1
159+
stxvx vs37, o16, T1
160+
161+
stxvx vs38, o32, T1
162+
stxvx vs39, o48, T1
163+
#else
164+
stxvx vs37, o0, T1
165+
stxvx vs36, o16, T1
166+
167+
stxvx vs39, o32, T1
168+
stxvx vs38, o48, T1
169+
#endif
170+
171+
#if defined(_AIX)
172+
')
173+
#else
174+
.endm
175+
#endif
176+
177+
/**********************************************************************************************
178+
* Macros for N=2 and M=16
179+
**********************************************************************************************/
180+
181+
/*
 * COPY_2x16
 *
 * Copies 64 bytes (16 four-byte elements) from each of the two source
 * pointers A0 and A1 into one contiguous 128-byte area that starts at BO:
 * row A0 at BO+0 and row A1 at BO+64.
 *
 * Every lxvpx fills an even/odd VSX register pair (32 bytes), so each row
 * needs two pair loads. The stores are single 16-byte stxvx stores. On
 * big-endian the even register of a pair is stored first; on little-endian
 * the odd register is stored first, because lxvpx places the
 * lower-addressed 16 bytes in the odd register in little-endian mode.
 *
 * Inputs   : A0, A1 (source pointers), BO (destination pointer).
 * Clobbers : vs32-vs39, T1. BO, A0 and A1 are not modified here.
 * NOTE(review): o16/o32/o48 are assumed to be registers preloaded with the
 * byte offsets 16/32/48 by the including kernel; only o0 (defined as 0) is
 * visible in this change - TODO confirm.
 */
#if defined(_AIX)
182+
define(`COPY_2x16', `
183+
#else
184+
.macro COPY_2x16
185+
#endif
186+
/* Load both rows first: two 32-byte pair loads per row. */
187+
lxvpx vs32, o0, A0
188+
lxvpx vs34, o32, A0
189+
190+
lxvpx vs36, o0, A1
191+
lxvpx vs38, o32, A1
192+
/* Work on a copy of the destination pointer so BO stays untouched. */
193+
mr T1, BO
194+
/* Row A0 (vs32-vs35) goes to T1+0..63; pair halves swap on little-endian. */
195+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
196+
stxvx vs32, o0, T1
197+
stxvx vs33, o16, T1
198+
stxvx vs34, o32, T1
199+
stxvx vs35, o48, T1
200+
#else
201+
stxvx vs33, o0, T1
202+
stxvx vs32, o16, T1
203+
stxvx vs35, o32, T1
204+
stxvx vs34, o48, T1
205+
#endif
206+
/* Advance to the next 64-byte destination slot: row A1 (vs36-vs39). */
207+
addi T1, T1, 64
208+
209+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
210+
stxvx vs36, o0, T1
211+
stxvx vs37, o16, T1
212+
stxvx vs38, o32, T1
213+
stxvx vs39, o48, T1
214+
#else
215+
stxvx vs37, o0, T1
216+
stxvx vs36, o16, T1
217+
stxvx vs39, o32, T1
218+
stxvx vs38, o48, T1
219+
#endif
220+
221+
#if defined(_AIX)
222+
')
223+
#else
224+
.endm
225+
#endif
226+
227+
/**********************************************************************************************
228+
* Macros for N=2 and M=8
229+
**********************************************************************************************/
230+
231+
/*
 * COPY_2x8
 *
 * Copies 32 bytes (8 four-byte elements) from each of the two source
 * pointers A0 and A1 into one contiguous 64-byte area that starts at BO:
 * row A0 at BO+0 and row A1 at BO+32.
 *
 * One lxvpx per row fills an even/odd VSX register pair (32 bytes). The
 * pair is written back with two 16-byte stxvx stores. On big-endian the
 * even register is stored first; on little-endian the odd register is
 * stored first, because lxvpx places the lower-addressed 16 bytes in the
 * odd register in little-endian mode.
 *
 * Inputs   : A0, A1 (source pointers), BO (destination pointer).
 * Clobbers : vs32-vs35, T1. BO, A0 and A1 are not modified here.
 * NOTE(review): o16/o32/o48 are assumed to be registers preloaded with the
 * byte offsets 16/32/48 by the including kernel; only o0 (defined as 0) is
 * visible in this change - TODO confirm.
 */
#if defined(_AIX)
232+
define(`COPY_2x8', `
233+
#else
234+
.macro COPY_2x8
235+
#endif
236+
/* One 32-byte pair load per source row. */
237+
lxvpx vs32, o0, A0
238+
239+
lxvpx vs34, o0, A1
240+
/* Work on a copy of the destination pointer so BO stays untouched. */
241+
mr T1, BO
242+
/* Row A0 (vs32/vs33) then row A1 (vs34/vs35); halves swap on little-endian. */
243+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
244+
stxvx vs32, o0, T1
245+
stxvx vs33, o16, T1
246+
247+
stxvx vs34, o32, T1
248+
stxvx vs35, o48, T1
249+
#else
250+
stxvx vs33, o0, T1
251+
stxvx vs32, o16, T1
252+
253+
stxvx vs35, o32, T1
254+
stxvx vs34, o48, T1
255+
#endif
256+
257+
#if defined(_AIX)
258+
')
259+
#else
260+
.endm
261+
#endif
262+
263+
/**********************************************************************************************
264+
* Macros for N=1 and M=16
265+
**********************************************************************************************/
266+
267+
/*
 * COPY_1x16
 *
 * Copies 64 bytes (16 four-byte elements) from the source pointer A0 to
 * the 64-byte area that starts at BO.
 *
 * Each lxvpx fills an even/odd VSX register pair (32 bytes), so the row
 * needs two pair loads. The stores are single 16-byte stxvx stores. On
 * big-endian the even register of a pair is stored first; on little-endian
 * the odd register is stored first, because lxvpx places the
 * lower-addressed 16 bytes in the odd register in little-endian mode.
 *
 * Inputs   : A0 (source pointer), BO (destination pointer).
 * Clobbers : vs32-vs35, T1. BO and A0 are not modified here.
 * NOTE(review): o16/o32/o48 are assumed to be registers preloaded with the
 * byte offsets 16/32/48 by the including kernel; only o0 (defined as 0) is
 * visible in this change - TODO confirm.
 */
#if defined(_AIX)
268+
define(`COPY_1x16', `
269+
#else
270+
.macro COPY_1x16
271+
#endif
272+
/* Two 32-byte pair loads cover the whole 64-byte row. */
273+
lxvpx vs32, o0, A0
274+
lxvpx vs34, o32, A0
275+
/* Work on a copy of the destination pointer so BO stays untouched. */
276+
mr T1, BO
277+
/* Store the four 16-byte quarters in memory order; halves swap on little-endian. */
278+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
279+
stxvx vs32, o0, T1
280+
stxvx vs33, o16, T1
281+
stxvx vs34, o32, T1
282+
stxvx vs35, o48, T1
283+
#else
284+
stxvx vs33, o0, T1
285+
stxvx vs32, o16, T1
286+
stxvx vs35, o32, T1
287+
stxvx vs34, o48, T1
288+
#endif
289+
290+
#if defined(_AIX)
291+
')
292+
#else
293+
.endm
294+
#endif
295+
296+
/**********************************************************************************************
297+
* Macros for N=1 and M=8
298+
**********************************************************************************************/
299+
300+
/*
 * COPY_1x8
 *
 * Copies 32 bytes (8 four-byte elements) from the source pointer A0 to
 * the 32-byte area that starts at BO.
 *
 * A single lxvpx fills the VSX register pair vs32/vs33 (32 bytes). The
 * pair is written back with two 16-byte stxvx stores. On big-endian vs32
 * is stored first; on little-endian vs33 is stored first, because lxvpx
 * places the lower-addressed 16 bytes in the odd register in little-endian
 * mode.
 *
 * Inputs   : A0 (source pointer), BO (destination pointer).
 * Clobbers : vs32, vs33, T1. BO and A0 are not modified here.
 * NOTE(review): o16 is assumed to be a register preloaded with the byte
 * offset 16 by the including kernel; only o0 (defined as 0) is visible in
 * this change - TODO confirm.
 */
#if defined(_AIX)
300+
define(`COPY_1x8', `
301+
#else
302+
.macro COPY_1x8
303+
#endif
304+
/* One 32-byte pair load covers the whole row. */
305+
lxvpx vs32, o0, A0
306+
/* Work on a copy of the destination pointer so BO stays untouched. */
307+
mr T1, BO
308+
/* Store both 16-byte halves in memory order; they swap on little-endian. */
309+
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
310+
stxvx vs32, o0, T1
311+
stxvx vs33, o16, T1
312+
#else
313+
stxvx vs33, o0, T1
314+
stxvx vs32, o16, T1
315+
#endif
316+
317+
#if defined(_AIX)
318+
')
319+
#else
320+
.endm
321+
#endif
323+

kernel/power/sgemm_tcopy_macros_16_power8.S

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3838
* Macros for N=4 and M=16
3939
**********************************************************************************************/
4040

41+
#ifndef POWER10
4142
#if defined(_AIX)
4243
define(`COPY_4x16', `
4344
#else
@@ -141,6 +142,7 @@ define(`COPY_4x8', `
141142
#else
142143
.endm
143144
#endif
145+
#endif
144146

145147
/**********************************************************************************************
146148
* Macros for N=4 and M=4
@@ -264,6 +266,7 @@ define(`COPY_4x1', `
264266
* Macros for N=2 and M=16
265267
**********************************************************************************************/
266268

269+
#ifndef POWER10
267270
#if defined(_AIX)
268271
define(`COPY_2x16', `
269272
#else
@@ -329,6 +332,7 @@ define(`COPY_2x8', `
329332
#else
330333
.endm
331334
#endif
335+
#endif
332336

333337
/**********************************************************************************************
334338
* Macros for N=2 and M=4
@@ -418,6 +422,7 @@ define(`COPY_2x1', `
418422
* Macros for N=1 and M=16
419423
**********************************************************************************************/
420424

425+
#ifndef POWER10
421426
#if defined(_AIX)
422427
define(`COPY_1x16', `
423428
#else
@@ -465,6 +470,7 @@ define(`COPY_1x8', `
465470
#else
466471
.endm
467472
#endif
473+
#endif
468474

469475
/**********************************************************************************************
470476
* Macros for N=1 and M=4

0 commit comments

Comments
 (0)