Skip to content

Commit 8fe794f

Browse files
improved zgemm power9 based on power8
1 parent 47f8921 commit 8fe794f

File tree

7 files changed

+2802
-24
lines changed

7 files changed

+2802
-24
lines changed

kernel/power/KERNEL.POWER9

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy.o
3838
CGEMMINCOPYOBJ = cgemm_incopy.o
3939
CGEMMITCOPYOBJ = cgemm_itcopy.o
4040

41-
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
41+
ZGEMMKERNEL = zgemm_kernel_power9.S
4242
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
4343
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
4444
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c

kernel/power/sgemm_kernel_power9.S

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
168168

169169

170170
/*alpha is stored in f1. convert to single and splat*/
171-
xscvdpspn alpha_r,vs1
171+
xscvdpspn alpha_r,vs1
172172
xxspltw alpha_r,alpha_r,0
173173

174174

kernel/power/sgemm_logic_power9.S

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -53,39 +53,39 @@ LSGEMM_L8x16_BEGIN:
5353
LSGEMM_L8x16_LOOP_START:
5454

5555
LOAD8x16_0 /*we already zeroed */
56-
##OffsetA=64 OffsetB=32
57-
addi AO,AO,2112
58-
addi BO,BO,32
56+
/*##OffsetA=64 OffsetB=32
57+
#addi AO,AO,2112
58+
#addi BO,BO,32 */
5959

6060
mtctr L
6161

6262
MY_ALIGN
6363

6464
LSGEMM_L8x16_LOOP:
6565

66-
KERNEL8x16_I1_L4_2 -2048,0, 0,0
67-
KERNEL8x16_I1_L4_2 -2048,0, 1,0
68-
KERNEL8x16_I1_L4_2 -2048,0, 2,0
69-
KERNEL8x16_I1_L4_2 -2048,0, 3,0
70-
KERNEL8x16_I1_L4_2 -2048,0, 4,0
71-
KERNEL8x16_I1_L4_2 -2048,0, 5,0
72-
KERNEL8x16_I1_L4_2 -2048,0, 6,0
73-
KERNEL8x16_I1_L4_2 -2048,0, 7,0
74-
KERNEL8x16_I1_L4_2 -2048,0, 8,0
75-
KERNEL8x16_I1_L4_2 -2048,0, 9,0
76-
KERNEL8x16_I1_L4_2 -2048,0, 10,0
77-
KERNEL8x16_I1_L4_2 -2048,0, 11,0
78-
KERNEL8x16_I1_L4_2 -2048,0, 12,0
79-
KERNEL8x16_I1_L4_2 -2048,0, 13,0
80-
KERNEL8x16_I1_L4_2 -2048,0, 14,0
81-
KERNEL8x16_I1_L4_2 -2048,0, 15,1
66+
KERNEL8x16_I1_L4_2 64,32, 0,0
67+
KERNEL8x16_I1_L4_2 64,32, 1,0
68+
KERNEL8x16_I1_L4_2 64,32, 2,0
69+
KERNEL8x16_I1_L4_2 64,32, 3,0
70+
KERNEL8x16_I1_L4_2 64,32, 4,0
71+
KERNEL8x16_I1_L4_2 64,32, 5,0
72+
KERNEL8x16_I1_L4_2 64,32, 6,0
73+
KERNEL8x16_I1_L4_2 64,32, 7,0
74+
KERNEL8x16_I1_L4_2 64,32, 8,0
75+
KERNEL8x16_I1_L4_2 64,32, 9,0
76+
KERNEL8x16_I1_L4_2 64,32, 10,0
77+
KERNEL8x16_I1_L4_2 64,32, 11,0
78+
KERNEL8x16_I1_L4_2 64,32, 12,0
79+
KERNEL8x16_I1_L4_2 64,32, 13,0
80+
KERNEL8x16_I1_L4_2 64,32, 14,0
81+
KERNEL8x16_I1_L4_2 64,32, 15,1
8282

8383
bdnz LSGEMM_L8x16_LOOP
8484

8585
MY_ALIGN
8686
LSGEMM_L8x16_LOOP_END:
8787

88-
END8x16 0, AO, BO, -2048, 0
88+
END8x16 0, AO, BO, 64, 32
8989

9090
b LSGEMM_L8x16_SUB1
9191
MY_ALIGN

kernel/power/zgemm_kernel_power9.S

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
/***************************************************************************
2+
Copyright (c) 2013-2019, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
#define ASSEMBLER
28+
#include "common.h"
29+
#include "def_vsx.h"
30+
/* LOAD expands to the ld instruction. */
31+
#define LOAD ld
32+
/* The prologue lowers SP by STACKSIZE four times; the epilogue raises it back four times. */
33+
#define STACKSIZE 32192
34+
/* 312+192 = 504(SP): the prologue stores r0 (just set to 0) into this word with stw. */
35+
#define FZERO 312+192(SP)
36+
37+
/* M, N and K are each compared against 0 on entry; the routine returns early if any is <= 0. */
38+
#define M r3
39+
#define N r4
40+
#define K r5
41+
42+
/* A, B and C are not referenced in this file's visible code.
   NOTE(review): presumably the matrix pointers consumed by zgemm_logic_power9.S - confirm. */
43+
#define A r8
44+
#define B r9
45+
#define C r10
46+
#define LDC r6 /* under linux: loaded from FRAMESLOT(0) of the entry frame, then shifted left by ZBASE_SHIFT */
47+
#define OFFSET r7 /* loaded from FRAMESLOT(1) only when TRMMKERNEL is built for 64-bit linux */
48+
49+
50+
51+
#define o0 0 /* literal zero */
52+
#define alpha_r vs30 /* receives a copy of vs1 in the prologue */
53+
#define alpha_i vs31 /* receives a copy of vs2 in the prologue */
54+
55+
#define VECSAVE r11 /* not referenced in this file's visible code */
56+
57+
#define FRAMEPOINTER r12 /* holds the SP value captured before the frame is allocated */
58+
59+
#define BBUFFER r14 /* set to (SP + 512 + 4096) & -4096 in the prologue */
60+
/* The aliases below without a note are not referenced in this file's visible code.
   NOTE(review): presumably loop counters, pointers and temporaries for the included macro/logic files - confirm. */
61+
#define L r15
62+
#define ALPHA r16 /* set to SP + 296+192 in the prologue */
63+
#define T5 r17
64+
#define T2 r19
65+
#define BBO r20
66+
#define o8 r21 /* loaded with the constant 8 */
67+
#define I r22
68+
#define J r23
69+
#define AO r24
70+
#define BO r25
71+
#define CO r26
72+
#define o16 r27 /* loaded with the constant 16 */
73+
#define T3 r28
74+
#define T4 r29
75+
76+
#define PRE r30 /* loaded with the constant 512 */
77+
#define T1 r31 /* holds the -4096 mask used to align BBUFFER */
78+
79+
#ifndef NEEDPARAM
80+
/*
 * ZGEMM kernel entry point for POWER9.
 *
 * Allocates a 4*STACKSIZE frame, saves f14-f31, r14-r31 and the
 * vector registers v20-v31 (VSR 52-63), sets up constants and the
 * aligned BBUFFER address, then runs the code included from
 * zgemm_logic_power9.S.  Returns 0 in r3.  If M, N or K is <= 0 the
 * routine skips straight to the restore path at L999.
 */
81+
PROLOGUE
82+
PROFCODE
83+
/* Remember the entry SP, then lower SP by 4*STACKSIZE.  Done in four steps
   because the total does not fit addi's signed 16-bit immediate. */
84+
mr FRAMEPOINTER, SP
85+
addi SP, SP, -STACKSIZE
86+
addi SP, SP, -STACKSIZE
87+
addi SP, SP, -STACKSIZE
88+
addi SP, SP, -STACKSIZE
89+
li r0, 0
90+
/* Save f14-f31 at 0..136(SP). */
91+
stfd f14, 0(SP)
92+
stfd f15, 8(SP)
93+
stfd f16, 16(SP)
94+
stfd f17, 24(SP)
95+
96+
stfd f18, 32(SP)
97+
stfd f19, 40(SP)
98+
stfd f20, 48(SP)
99+
stfd f21, 56(SP)
100+
101+
stfd f22, 64(SP)
102+
stfd f23, 72(SP)
103+
stfd f24, 80(SP)
104+
stfd f25, 88(SP)
105+
106+
stfd f26, 96(SP)
107+
stfd f27, 104(SP)
108+
stfd f28, 112(SP)
109+
stfd f29, 120(SP)
110+
111+
stfd f30, 128(SP)
112+
stfd f31, 136(SP)
113+
114+
/* Save r14-r31 at 144..280(SP). */
115+
std r31, 144(SP)
116+
std r30, 152(SP)
117+
std r29, 160(SP)
118+
std r28, 168(SP)
119+
std r27, 176(SP)
120+
std r26, 184(SP)
121+
std r25, 192(SP)
122+
std r24, 200(SP)
123+
std r23, 208(SP)
124+
std r22, 216(SP)
125+
std r21, 224(SP)
126+
std r20, 232(SP)
127+
std r19, 240(SP)
128+
std r18, 248(SP)
129+
std r17, 256(SP)
130+
std r16, 264(SP)
131+
std r15, 272(SP)
132+
std r14, 280(SP)
133+
134+
/* Save vector registers v20-v31 at 288..464(SP), 16 bytes each.
   stxv takes a VSX register number (0-63) and v20-v31 are VSR 52-63, so they
   must be written as vs52-vs63.  Writing "v20" here would select VSR 20, which
   overlays f20 and is already saved above, leaving v20-v31 unprotected. */
135+
stxv vs52, 288(SP)
136+
stxv vs53, 304(SP)
137+
stxv vs54, 320(SP)
138+
stxv vs55, 336(SP)
139+
stxv vs56, 352(SP)
140+
stxv vs57, 368(SP)
141+
stxv vs58, 384(SP)
142+
stxv vs59, 400(SP)
143+
stxv vs60, 416(SP)
144+
stxv vs61, 432(SP)
145+
stxv vs62, 448(SP)
146+
stxv vs63, 464(SP)
147+
148+
/* Store a zero word at FZERO (r0 was cleared above). */
149+
stw r0, FZERO
150+
151+
#ifdef linux
152+
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
153+
#endif
154+
155+
156+
#ifdef TRMMKERNEL
157+
#if defined(linux) && defined(__64BIT__)
158+
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
159+
#endif
160+
#endif
161+
162+
163+
#include "zgemm_macros_power9.S"
164+
/* Nothing to do when any dimension is <= 0. */
165+
cmpwi cr0, M, 0
166+
ble L999
167+
cmpwi cr0, N, 0
168+
ble L999
169+
cmpwi cr0, K, 0
170+
ble L999
171+
172+
slwi LDC, LDC, ZBASE_SHIFT
173+
li PRE, 512
174+
li o8 , 8
175+
li o16 , 16
176+
/* BBUFFER = (SP + 512 + 4096) & -4096, i.e. a 4096-byte aligned address above SP + 512. */
177+
addi BBUFFER, SP, 512+4096
178+
li T1, -4096
179+
and BBUFFER, BBUFFER, T1
180+
181+
/* ALPHA = SP + 488, just past the vector save area that ends at 480(SP). */
182+
addi ALPHA, SP, 296+192
183+
184+
xxlor alpha_r,vs1,vs1 /*copy from register f1 */
185+
xxlor alpha_i,vs2,vs2 /*copy from register f2 */
186+
187+
.align 4
188+
189+
#include "zgemm_logic_power9.S"
190+
/* Common exit: return 0 and restore everything saved in the prologue. */
191+
L999:
192+
addi r3, 0, 0
193+
194+
lfd f14, 0(SP)
195+
lfd f15, 8(SP)
196+
lfd f16, 16(SP)
197+
lfd f17, 24(SP)
198+
199+
lfd f18, 32(SP)
200+
lfd f19, 40(SP)
201+
lfd f20, 48(SP)
202+
lfd f21, 56(SP)
203+
204+
lfd f22, 64(SP)
205+
lfd f23, 72(SP)
206+
lfd f24, 80(SP)
207+
lfd f25, 88(SP)
208+
209+
lfd f26, 96(SP)
210+
lfd f27, 104(SP)
211+
lfd f28, 112(SP)
212+
lfd f29, 120(SP)
213+
214+
lfd f30, 128(SP)
215+
lfd f31, 136(SP)
216+
217+
218+
ld r31, 144(SP)
219+
ld r30, 152(SP)
220+
ld r29, 160(SP)
221+
ld r28, 168(SP)
222+
ld r27, 176(SP)
223+
ld r26, 184(SP)
224+
ld r25, 192(SP)
225+
ld r24, 200(SP)
226+
ld r23, 208(SP)
227+
ld r22, 216(SP)
228+
ld r21, 224(SP)
229+
ld r20, 232(SP)
230+
ld r19, 240(SP)
231+
ld r18, 248(SP)
232+
ld r17, 256(SP)
233+
ld r16, 264(SP)
234+
ld r15, 272(SP)
235+
ld r14, 280(SP)
236+
/* Restore v20-v31 (VSR 52-63); must mirror the stxv sequence in the prologue. */
237+
lxv vs52, 288(SP)
238+
lxv vs53, 304(SP)
239+
lxv vs54, 320(SP)
240+
lxv vs55, 336(SP)
241+
lxv vs56, 352(SP)
242+
lxv vs57, 368(SP)
243+
lxv vs58, 384(SP)
244+
lxv vs59, 400(SP)
245+
lxv vs60, 416(SP)
246+
lxv vs61, 432(SP)
247+
lxv vs62, 448(SP)
248+
lxv vs63, 464(SP)
249+
/* Release the 4*STACKSIZE frame. */
250+
addi SP, SP, STACKSIZE
251+
addi SP, SP, STACKSIZE
252+
addi SP, SP, STACKSIZE
253+
addi SP, SP, STACKSIZE
254+
blr
255+
256+
EPILOGUE
257+
#endif

0 commit comments

Comments
 (0)