Skip to content

Commit aeef942

Browse files
committed
use arm neon instructions to optimize gemm beta operation
1 parent 445ca2f commit aeef942

File tree

2 files changed

+179
-1
lines changed

2 files changed

+179
-1
lines changed

kernel/arm64/KERNEL

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ ifndef SGEMM_BETA
3434
SGEMM_BETA = ../generic/gemm_beta.c
3535
endif
3636
ifndef DGEMM_BETA
37-
DGEMM_BETA = ../generic/gemm_beta.c
37+
DGEMM_BETA = ../arm64/dgemm_beta.S
3838
endif
3939
ifndef CGEMM_BETA
4040
CGEMM_BETA = ../generic/zgemm_beta.c

kernel/arm64/dgemm_beta.S

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
/***************************************************************************
2+
Copyright (c) 2016, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define ASSEMBLER
29+
#include "common.h"
30+
31+
#define M x0 /* inner-loop length: number of consecutive doubles scaled per outer iteration */
31+
#define N x1 /* outer-loop count; decremented once per pass until it reaches zero */
32+
#define BETA d0 /* incoming scale factor; copied to beta0 because q0 is reused as a data register */
33+
#define LDC x6 /* loaded from [sp] on entry, then shifted left by 3 so it is a byte stride */
34+
#define C00 x7 /* base pointer of the current outer iteration; advanced by LDC each pass */
35+
36+
#define A01 x8 /* working pointer: first 32-byte chunk of the 128-byte group (also used by the scalar tail) */
37+
#define A02 x9 /* working pointer: A01 + 32 */
38+
#define A03 x10 /* working pointer: A02 + 32 */
39+
#define A04 x11 /* working pointer: A03 + 32 */
40+
41+
#define beta0 d11 /* private copy of BETA used by the scalar tail multiply */
42+
#define betaV0 v11.d[0] /* lane 0 of the same register, used as the by-element operand of the vector fmul */
43+
#define I x16 /* inner loop counter (M / 16 for the vector loop, M % 16 for the tail) */
44+
45+
#define size 128 /* bytes each working pointer advances per vector iteration (16 doubles) */
47+
48+
/**************************************************************************************
49+
* Macro definitions
50+
**************************************************************************************/
51+
52+
.macro SAVE_REGS
	/*
	 * Open an 11 * 16 = 176-byte frame and spill d8-d17 and x18-x28 into it.
	 * The first store uses pre-index writeback, so it both lowers sp by the
	 * frame size and fills slot 0; every later store addresses the new sp.
	 * Slot layout (16 bytes each) must stay in step with RESTORE_REGS.
	 */
	stp	d8,  d9,  [sp, #-(11 * 16)]!	/* slot 0, allocates the frame */
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28,      [sp, #(10 * 16)]	/* slot 10, upper 8 bytes unused */
.endm
66+
67+
.macro RESTORE_REGS
	/*
	 * Reload d8-d17 and x18-x28 from the 176-byte frame written by SAVE_REGS
	 * and release the frame. Slots are read from the top down; the final
	 * load uses post-index writeback, so it reads slot 0 and then raises sp
	 * by the full frame size.
	 */
	ldr	x28,      [sp, #(10 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d8,  d9,  [sp], #(11 * 16)	/* slot 0, then frees the frame */
.endm
81+
82+
/**************************************************************************************
83+
* End of macro definitions
84+
**************************************************************************************/
85+
86+
/**************************************************************************************
* Double-precision "beta" pass: C := beta * C on an M x N block of doubles.
*
* Register contract used by this routine:
*   x0  (M)    number of consecutive doubles in each column
*   x1  (N)    number of columns
*   d0  (BETA) scale factor
*   x7  (C00)  pointer to the first element of the block
*   [sp]       distance between columns, in elements (loaded into LDC / x6)
*   NOTE(review): presumably this matches the generic gemm_beta C prototype,
*   where x2-x6 carry unused dummy arguments; confirm against the caller.
*
* Returns 0 in x0.
*
* Behaviour:
*   - M <= 0 or N <= 0 : nothing is touched.
*   - beta == 0        : C is overwritten with +0.0 WITHOUT being read, so NaN/Inf
*                        left in an uninitialised C cannot leak into the result.
*   - otherwise        : every element is multiplied by beta, 16 doubles per
*                        vector iteration plus a scalar tail of M % 16.
*
* Scratch: x6-x11, x16, v0-v7, d11, d12, flags. d11/d12 are preserved through
* SAVE_REGS / RESTORE_REGS.
**************************************************************************************/

	PROLOGUE

	.align 5

	ldr	LDC, [sp]			// must be read before SAVE_REGS moves sp
	SAVE_REGS

.Lgemm_beta_BEGIN:

	fmov	beta0, BETA			// d0 is reused as data (q0) in the vector loop
	cmp	N, #0
	ble	.Lgemm_beta_L999
	cmp	M, #0				// a negative M would give a bogus M%16 tail count
	ble	.Lgemm_beta_L999

.Lgemm_beta_01:

	lsl	LDC, LDC, #3			// elements -> bytes (8 bytes per double)

	fcmp	BETA, #0.0			// +0.0 and -0.0 compare equal; NaN does not
	beq	.Lgemm_beta_zero_02

/* ---------------------------- beta != 0 : scale ---------------------------- */

	.align 5
.Lgemm_beta_02:

	mov	A01, C00			// A01 = start of this column
	add	C00, C00, LDC			// C00 = start of the next column
	asr	I, M, #4			// I = M / 16 (M > 0 here)
	cbz	I, .Lgemm_beta_04
	add	A02, A01, #32
	add	A03, A02, #32
	add	A04, A03, #32

	.align 5
.Lgemm_beta_03:

	ldp	q0, q1, [A01]
	ldp	q2, q3, [A02]
	ldp	q4, q5, [A03]
	ldp	q6, q7, [A04]

	fmul	v0.2d, v0.2d, betaV0
	fmul	v1.2d, v1.2d, betaV0

	fmul	v2.2d, v2.2d, betaV0
	fmul	v3.2d, v3.2d, betaV0

	fmul	v4.2d, v4.2d, betaV0
	fmul	v5.2d, v5.2d, betaV0

	fmul	v6.2d, v6.2d, betaV0
	fmul	v7.2d, v7.2d, betaV0

	st1	{v0.2d, v1.2d}, [A01]
	add	A01, A01, size
	st1	{v2.2d, v3.2d}, [A02]
	add	A02, A02, size
	st1	{v4.2d, v5.2d}, [A03]
	add	A03, A03, size
	st1	{v6.2d, v7.2d}, [A04]
	add	A04, A04, size

	subs	I, I, #1
	bne	.Lgemm_beta_03

.Lgemm_beta_04:

	ands	I, M, #15			// I = M % 16; A01 already points at the tail
	beq	.Lgemm_beta_06

	.align 5
.Lgemm_beta_05:

	ldr	d12, [A01]
	fmul	d12, d12, beta0
	str	d12, [A01]
	add	A01, A01, #8

	subs	I, I, #1
	bne	.Lgemm_beta_05

.Lgemm_beta_06:

	subs	N, N, #1			// N--
	bne	.Lgemm_beta_02
	b	.Lgemm_beta_L999

/* ------------------------- beta == 0 : store zeros ------------------------- */

	.align 5
.Lgemm_beta_zero_02:

	mov	A01, C00			// A01 = start of this column
	add	C00, C00, LDC			// C00 = start of the next column
	asr	I, M, #4			// I = M / 16
	cbz	I, .Lgemm_beta_zero_04

	.align 5
.Lgemm_beta_zero_03:

	stp	xzr, xzr, [A01, #(0 * 16)]	// all-zero bits == +0.0; C is never loaded
	stp	xzr, xzr, [A01, #(1 * 16)]
	stp	xzr, xzr, [A01, #(2 * 16)]
	stp	xzr, xzr, [A01, #(3 * 16)]
	stp	xzr, xzr, [A01, #(4 * 16)]
	stp	xzr, xzr, [A01, #(5 * 16)]
	stp	xzr, xzr, [A01, #(6 * 16)]
	stp	xzr, xzr, [A01, #(7 * 16)]
	add	A01, A01, size

	subs	I, I, #1
	bne	.Lgemm_beta_zero_03

.Lgemm_beta_zero_04:

	ands	I, M, #15			// I = M % 16
	beq	.Lgemm_beta_zero_06

.Lgemm_beta_zero_05:

	str	xzr, [A01], #8
	subs	I, I, #1
	bne	.Lgemm_beta_zero_05

.Lgemm_beta_zero_06:

	subs	N, N, #1			// N--
	bne	.Lgemm_beta_zero_02

/* --------------------------------- exit ----------------------------------- */

	.align 5
.Lgemm_beta_L999:

	mov	x0, #0
	RESTORE_REGS
	ret

	EPILOGUE

0 commit comments

Comments
 (0)