Skip to content

Commit 0833a48

Browse files
committed
Use arm neon instructions to optimize sgemm_beta operation
1 parent c45b7ae commit 0833a48

File tree

2 files changed

+260
-0
lines changed

2 files changed

+260
-0
lines changed

kernel/arm64/KERNEL.ARMV8

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ ZDOTKERNEL = zdot.S
103103
DSDOTKERNEL = dot.S
104104

105105
DGEMM_BETA = dgemm_beta.S
106+
SGEMM_BETA = sgemm_beta.S
106107

107108
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
108109
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S

kernel/arm64/sgemm_beta.S

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
/***************************************************************************
2+
Copyright (c) 2016, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define ASSEMBLER
29+
#include "common.h"
30+
31+
/* Inputs. M and N are the inner and outer loop trip counts of the kernel;
   BETA is the scale factor. LDC is loaded from [sp] at entry and then
   shifted left by 2 before being added to C00 after every outer pass. */
#define M x0
32+
#define N x1
33+
#define BETA s0
34+
#define LDC x6
35+
#define C00 x7
36+
37+
/* Working pointers. A01 is set from C00 at the start of each outer pass;
   A02, A03 and A04 are set to A01 + 32, + 64 and + 96 bytes. I is the
   inner loop counter. */
#define A01 x8
38+
#define A02 x9
39+
#define A03 x10
40+
#define A04 x11
41+
#define I x12
42+
43+
/* Copy of BETA made at entry: beta0 is the scalar view, betaV0 is lane 0
   of the same register, used as the by-element fmul operand. */
#define beta0 s11
44+
#define betaV0 v11.s[0]
45+
46+
/* prfm_size: byte offset from A01 used by the prfm instructions.
   calc_size: bytes each pointer advances per unrolled iteration. */
#define prfm_size 640
47+
#define calc_size 128
48+
49+
/**************************************************************************************
50+
* Macro definitions
51+
**************************************************************************************/
52+
53+
/*
 * SAVE_REGS / RESTORE_REGS
 *
 * Spill and reload the callee-saved registers this kernel overwrites.
 * Under AAPCS64 the callee-saved set is x19-x28 and the low 64 bits of
 * v8-v15. The kernel only writes s11 (beta0 / betaV0) and s12 (scalar
 * tail temporary) from that set; every other register it touches
 * (x0, x1, x6-x12, v0-v7) is caller-saved, so one 16-byte pair suffices.
 *
 * x18 is deliberately not touched: it is the platform register and must
 * be left alone on targets that reserve it.
 *
 * The two macros must stay in step: SAVE_REGS lowers sp by 16 (keeping
 * it 16-byte aligned) and RESTORE_REGS raises it by the same amount.
 * If the kernel starts using another callee-saved register, add it to
 * BOTH macros.
 */
.macro SAVE_REGS
	stp	d11, d12, [sp, #-16]!		// push d11/d12, sp -= 16
.endm

.macro RESTORE_REGS
	ldp	d11, d12, [sp], #16		// pop d11/d12, sp += 16
.endm
82+
83+
/*
 * INIT_ZERO
 *
 * Set all four single-precision lanes of v0-v7 to +0.0.
 *
 * The registers are cleared with an immediate move rather than by
 * multiplying their current contents by a zero scalar: v1-v7 hold
 * whatever the caller left in them, and 0 * NaN or 0 * Inf is NaN,
 * which would then be stored instead of zero.
 *
 * Clobbers: v0-v7 (all 128 bits). Reads nothing.
 */
.macro INIT_ZERO
	movi	v0.4s, #0
	movi	v1.4s, #0
	movi	v2.4s, #0
	movi	v3.4s, #0
	movi	v4.4s, #0
	movi	v5.4s, #0
	movi	v6.4s, #0
	movi	v7.4s, #0
.endm
93+
94+
/**************************************************************************************
95+
* End of macro definitions
96+
**************************************************************************************/
97+
98+
/**************************************************************************************
* sgemm_beta kernel: scale single-precision data in place by BETA.
*
* Inputs, as read by the code below:
*   x0    M     number of contiguous floats in one run (inner trip count)
*   x1    N     number of runs (outer trip count)
*   s0    BETA  scale factor
*   x7    C00   address of the first run
*   [sp]  LDC   distance between the starts of consecutive runs, in floats
* Returns 0 in x0.
*
* Each run is handled as M / 32 blocks of 32 floats (eight q registers, addressed
* through four pointers 32 bytes apart) followed by M % 32 single floats.
* If BETA compares equal to zero the destination is overwritten without being read.
* Nothing is written when M <= 0 or N <= 0.
*
* Clobbers: x0, x1, x6-x12, v0-v7, flags; s11/s12 are preserved via
* SAVE_REGS / RESTORE_REGS.
**************************************************************************************/

	PROLOGUE

	.align 5

	ldr	LDC, [sp]			// must be read before SAVE_REGS moves sp
	SAVE_REGS

.Lgemm_beta_BEGIN:

	fmov	beta0, BETA			// s0 is overwritten by the block loads below
	cmp	N, #0
	ble	.Lgemm_beta_L999
	cmp	M, #0				// without this, a negative M whose low five
	ble	.Lgemm_beta_L999		// bits are non-zero would run the scalar tail

	fcmp	BETA, #0.0
	beq	.Lgemm_beta_zero_01

/* ---------------------------- BETA != 0: load, scale, store ----------------------- */

.Lgemm_beta_01:

	lsl	LDC, LDC, #2			// stride in floats -> stride in bytes

	.align 5
.Lgemm_beta_02:					// once per run

	mov	A01, C00
	add	C00, C00, LDC			// C00 now addresses the next run
	asr	I, M, #5			// I = number of 32-float blocks (M > 0 here)
	cbz	I, .Lgemm_beta_04
	add	A02, A01, #32
	add	A03, A02, #32
	add	A04, A03, #32

	.align 5
.Lgemm_beta_03:					// 32 floats per iteration

	prfm	PLDL1KEEP, [A01, prfm_size]

	ldp	q0, q1, [A01]
	ldp	q2, q3, [A02]
	ldp	q4, q5, [A03]
	ldp	q6, q7, [A04]

	fmul	v0.4s, v0.4s, betaV0
	fmul	v1.4s, v1.4s, betaV0

	fmul	v2.4s, v2.4s, betaV0
	fmul	v3.4s, v3.4s, betaV0

	fmul	v4.4s, v4.4s, betaV0
	fmul	v5.4s, v5.4s, betaV0

	fmul	v6.4s, v6.4s, betaV0
	fmul	v7.4s, v7.4s, betaV0

	prfm	PLDL1KEEP, [A01, prfm_size + 64]

	st1	{v0.4s, v1.4s}, [A01]
	add	A01, A01, calc_size
	st1	{v2.4s, v3.4s}, [A02]
	add	A02, A02, calc_size
	st1	{v4.4s, v5.4s}, [A03]
	add	A03, A03, calc_size
	st1	{v6.4s, v7.4s}, [A04]
	add	A04, A04, calc_size

	subs	I, I, #1
	bne	.Lgemm_beta_03

	.align 5
.Lgemm_beta_04:					// A01 points just past the last full block

	and	I, M, #31			// I = leftover floats in this run
	cbz	I, .Lgemm_beta_06

	.align 5
.Lgemm_beta_05:					// one float per iteration

	ldr	s12, [A01]
	fmul	s12, s12, beta0
	str	s12, [A01], #4			// store, then A01 += 4

	subs	I, I, #1
	bne	.Lgemm_beta_05

	.align 5
.Lgemm_beta_06:

	subs	N, N, #1			// N--
	bne	.Lgemm_beta_02

	.align 5
.Lgemm_beta_L999:				// common exit for every path

	mov	x0, #0
	RESTORE_REGS
	ret

/* ---------------------------- BETA == 0: store only ------------------------------- */

	.align 5
.Lgemm_beta_zero_01:

	INIT_ZERO				// sets v0-v7, the values stored by the block loop
	lsl	LDC, LDC, #2			// stride in floats -> stride in bytes

	.align 5
.Lgemm_beta_zero_02:				// once per run

	mov	A01, C00
	add	C00, C00, LDC			// C00 now addresses the next run

	asr	I, M, #5			// I = number of 32-float blocks (M > 0 here)
	cbz	I, .Lgemm_beta_zero_04
	add	A02, A01, #32
	add	A03, A02, #32
	add	A04, A03, #32

	.align 5
.Lgemm_beta_zero_03:				// 32 floats per iteration

	st1	{v0.4s, v1.4s}, [A01]
	add	A01, A01, calc_size
	st1	{v2.4s, v3.4s}, [A02]
	add	A02, A02, calc_size
	st1	{v4.4s, v5.4s}, [A03]
	add	A03, A03, calc_size
	st1	{v6.4s, v7.4s}, [A04]
	add	A04, A04, calc_size

	subs	I, I, #1
	bne	.Lgemm_beta_zero_03

	.align 5
.Lgemm_beta_zero_04:				// A01 points just past the last full block

	and	I, M, #31			// I = leftover floats in this run
	cbz	I, .Lgemm_beta_zero_06

	.align 5
.Lgemm_beta_zero_05:				// one float per iteration

	str	wzr, [A01], #4			// store +0.0 (not beta's bits, which may be -0.0)

	subs	I, I, #1
	bne	.Lgemm_beta_zero_05

	.align 5
.Lgemm_beta_zero_06:

	subs	N, N, #1			// N--
	bne	.Lgemm_beta_zero_02

	b	.Lgemm_beta_L999

	EPILOGUE

0 commit comments

Comments
 (0)