Skip to content

Commit 9998f8e

Browse files
authored
Merge pull request #2356 from shengyang-3390/develop
Use arm neon instructions to optimize ncopy operation (and enable 7th column in float+complex cblas3 test drivers)
2 parents 4402858 + 80db5f1 commit 9998f8e

File tree

3 files changed

+353
-4
lines changed

3 files changed

+353
-4
lines changed

kernel/arm64/KERNEL.ARMV8

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,21 +108,29 @@ SGEMM_BETA = sgemm_beta.S
108108
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
109109
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
110110
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
111-
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
112111
ifeq ($(SGEMM_UNROLL_M), 16)
113112
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
114113
else
115114
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
116115
endif
116+
ifeq ($(SGEMM_UNROLL_M), 4)
117+
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
118+
else
119+
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
120+
endif
117121
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
118122
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
119123
endif
120-
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
121124
ifeq ($(SGEMM_UNROLL_N), 16)
122125
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
123126
else
124127
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
125128
endif
129+
ifeq ($(SGEMM_UNROLL_N), 4)
130+
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
131+
else
132+
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
133+
endif
126134
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
127135
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
128136

kernel/arm64/KERNEL.TSV110

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,21 +109,29 @@ ZGEMVTKERNEL = zgemv_t.S
109109
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
110110
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
111111
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
112-
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
113112
ifeq ($(SGEMM_UNROLL_M), 16)
114113
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
115114
else
116115
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
117116
endif
117+
ifeq ($(SGEMM_UNROLL_M), 4)
118+
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
119+
else
120+
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
121+
endif
118122
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
119123
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
120124
endif
121-
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
122125
ifeq ($(SGEMM_UNROLL_N), 16)
123126
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
124127
else
125128
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
126129
endif
130+
ifeq ($(SGEMM_UNROLL_N), 4)
131+
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
132+
else
133+
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
134+
endif
127135
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
128136
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
129137

kernel/arm64/sgemm_ncopy_4.S

Lines changed: 333 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
/***************************************************************************
2+
Copyright (c) 2016, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define ASSEMBLER
29+
#include "common.h"
30+
31+
/* Values the routine body reads without initialising them first (x0-x4):
 * M and N are the two loop extents, A00 is the source cursor, LDA is the
 * distance between consecutive source lines (multiplied by 4 on entry),
 * and B00 is the destination cursor. */
#define M x0
32+
#define N x1
33+
#define A00 x2
34+
#define LDA x3
35+
#define B00 x4
36+
37+
/* Per-group source cursors; the routine body sets each one LDA bytes after the previous. */
#define A01 x5
38+
#define A02 x6
39+
#define A03 x7
40+
#define A04 x8
41+
42+
/* Loop counters: I counts iterations derived from M, J counts iterations derived from N. */
#define I x9
43+
#define J x10
44+
45+
/* Scratch aliases; not referenced anywhere in the code visible in this file. */
#define TEMP1 x11
46+
#define TEMP2 x12
47+
48+
/* Byte offset added to each source cursor in the prfm prefetch hints of the COPY macros. */
#define A_PREFETCH 2560
49+
50+
/**************************************************************************************
51+
* Macro definitions
52+
**************************************************************************************/
53+
54+
// SAVE_REGS: open a 176-byte (11 * 16) stack frame and spill d8-d17 and x18-x28 into it.
// Must be paired with RESTORE_REGS, which uses the same slot layout.
// NOTE(review): among the saved registers, the code visible in this file only writes v8-v11;
// the wider save set looks conservative - confirm before trimming it.
.macro SAVE_REGS
55+
add sp, sp, #-(11 * 16) // move sp down by eleven 16-byte slots
56+
stp d8, d9, [sp, #(0 * 16)]
57+
stp d10, d11, [sp, #(1 * 16)]
58+
stp d12, d13, [sp, #(2 * 16)]
59+
stp d14, d15, [sp, #(3 * 16)]
60+
stp d16, d17, [sp, #(4 * 16)]
61+
stp x18, x19, [sp, #(5 * 16)]
62+
stp x20, x21, [sp, #(6 * 16)]
63+
stp x22, x23, [sp, #(7 * 16)]
64+
stp x24, x25, [sp, #(8 * 16)]
65+
stp x26, x27, [sp, #(9 * 16)]
66+
str x28, [sp, #(10 * 16)] // single register: uses only the first 8 bytes of the last slot
67+
.endm
68+
69+
// RESTORE_REGS: reload d8-d17 and x18-x28 from the frame laid out by SAVE_REGS
// (same slot offsets), then release the 176-byte frame.
.macro RESTORE_REGS
70+
ldp d8, d9, [sp, #(0 * 16)]
71+
ldp d10, d11, [sp, #(1 * 16)]
72+
ldp d12, d13, [sp, #(2 * 16)]
73+
ldp d14, d15, [sp, #(3 * 16)]
74+
ldp d16, d17, [sp, #(4 * 16)]
75+
ldp x18, x19, [sp, #(5 * 16)]
76+
ldp x20, x21, [sp, #(6 * 16)]
77+
ldp x22, x23, [sp, #(7 * 16)]
78+
ldp x24, x25, [sp, #(8 * 16)]
79+
ldp x26, x27, [sp, #(9 * 16)]
80+
ldr x28, [sp, #(10 * 16)]
81+
add sp, sp, #(11*16) // pop the frame; must match the adjustment in SAVE_REGS
82+
.endm
83+
84+
// COPY4x4: read four 32-bit elements from each of A01..A04 and store them to B00
// transposed: output register v8 gets element 0 of A01, A02, A03, A04 (in that lane
// order), v9 gets element 1, v10 element 2, v11 element 3.
// Advances A01..A04 by 16 bytes each and B00 by 64 bytes. Clobbers v0-v3 and v8-v11.
.macro COPY4x4
85+
prfm PLDL1KEEP, [A01, #A_PREFETCH] // prefetch hints A_PREFETCH bytes ahead of each source cursor
86+
prfm PLDL1KEEP, [A02, #A_PREFETCH]
87+
prfm PLDL1KEEP, [A03, #A_PREFETCH]
88+
prfm PLDL1KEEP, [A04, #A_PREFETCH]
89+
90+
ldr q0, [A01], #16 // four elements from A01, post-increment; they become lane 0 of v8..v11
91+
ins v8.s[0], v0.s[0]
92+
ins v9.s[0], v0.s[1]
93+
ins v10.s[0], v0.s[2]
94+
ins v11.s[0], v0.s[3]
95+
96+
ldr q1, [A02], #16 // four elements from A02 become lane 1 of v8..v11
97+
ins v8.s[1], v1.s[0]
98+
ins v9.s[1], v1.s[1]
99+
ins v10.s[1], v1.s[2]
100+
ins v11.s[1], v1.s[3]
101+
102+
ldr q2, [A03], #16 // four elements from A03 become lane 2 of v8..v11
103+
ins v8.s[2], v2.s[0]
104+
ins v9.s[2], v2.s[1]
105+
ins v10.s[2], v2.s[2]
106+
ins v11.s[2], v2.s[3]
107+
108+
ldr q3, [A04], #16 // four elements from A04 become lane 3 of v8..v11
109+
ins v8.s[3], v3.s[0]
110+
ins v9.s[3], v3.s[1]
111+
ins v10.s[3], v3.s[2]
112+
ins v11.s[3], v3.s[3]
113+
114+
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] // registers are written back to back: 4 * 16 = 64 bytes
115+
add B00, B00, #64
116+
117+
.endm
118+
119+
// COPY1x4: read one 32-bit element from each of A01..A04 and store the four values to
// B00 in that order. Advances A01..A04 by 4 bytes each and B00 by 16 bytes in total.
// Clobbers v0-v3 (written through their s0-s3 views).
.macro COPY1x4
120+
prfm PLDL1KEEP, [A01, #A_PREFETCH] // prefetch hints A_PREFETCH bytes ahead of each source cursor
121+
prfm PLDL1KEEP, [A02, #A_PREFETCH]
122+
prfm PLDL1KEEP, [A03, #A_PREFETCH]
123+
prfm PLDL1KEEP, [A04, #A_PREFETCH]
124+
125+
ldr s0, [A01], #4
126+
ldr s1, [A02], #4
127+
ldr s2, [A03], #4
128+
ldr s3, [A04], #4
129+
130+
stp s0, s1, [B00] // first pair: elements from A01 and A02
131+
add B00, B00, #8
132+
stp s2, s3, [B00] // second pair: elements from A03 and A04
133+
add B00, B00, #8
134+
.endm
135+
136+
// COPY4x2: read four 32-bit elements from each of A01 and A02 and store them to B00 as
// four pairs: (A01[k], A02[k]) for k = 0..3. Advances A01 and A02 by 16 bytes each and
// B00 by 32 bytes. Clobbers v0, v1 and lanes 0-1 of v8-v11; lanes 2-3 of v8-v11 are not
// written here and are not stored.
.macro COPY4x2
137+
prfm PLDL1KEEP, [A01, #A_PREFETCH] // prefetch hints A_PREFETCH bytes ahead of each source cursor
138+
prfm PLDL1KEEP, [A02, #A_PREFETCH]
139+
140+
ldr q0, [A01], #16 // four elements from A01 become lane 0 of v8..v11
141+
ins v8.s[0], v0.s[0]
142+
ins v9.s[0], v0.s[1]
143+
ins v10.s[0], v0.s[2]
144+
ins v11.s[0], v0.s[3]
145+
146+
ldr q1, [A02], #16 // four elements from A02 become lane 1 of v8..v11
147+
ins v8.s[1], v1.s[0]
148+
ins v9.s[1], v1.s[1]
149+
ins v10.s[1], v1.s[2]
150+
ins v11.s[1], v1.s[3]
151+
152+
st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] // .2s form stores only the low two lanes of each register: 4 * 8 = 32 bytes
153+
add B00, B00, #32
154+
.endm
155+
156+
157+
// COPY1x2: read one 32-bit element from each of A01 and A02 and store the pair to B00.
// Advances A01 and A02 by 4 bytes each and B00 by 8 bytes. Clobbers v0 and v1
// (written through their s0/s1 views).
.macro COPY1x2
158+
prfm PLDL1KEEP, [A01, #A_PREFETCH] // prefetch hints A_PREFETCH bytes ahead of each source cursor
159+
prfm PLDL1KEEP, [A02, #A_PREFETCH]
160+
161+
ldr s0, [A01], #4
162+
ldr s1, [A02], #4
163+
164+
stp s0, s1, [B00]
165+
add B00, B00, #8
166+
.endm
167+
168+
// COPY4x1: copy 16 contiguous bytes from A01 to B00 unchanged.
// Advances both A01 and B00 by 16 bytes (post-increment). Clobbers v0.
.macro COPY4x1
169+
prfm PLDL1KEEP, [A01, #A_PREFETCH] // prefetch hint A_PREFETCH bytes ahead of the source cursor
170+
171+
ldr q0, [A01], #16
172+
str q0, [B00], #16
173+
.endm
174+
175+
176+
// COPY1x1: copy one 32-bit element from A01 to B00.
// Advances both A01 and B00 by 4 bytes (post-increment). Clobbers v0 (through s0).
.macro COPY1x1
177+
prfm PLDL1KEEP, [A01, #A_PREFETCH] // prefetch hint A_PREFETCH bytes ahead of the source cursor
178+
179+
ldr s0, [A01], #4
180+
str s0, [B00], #4
181+
.endm
182+
183+
/**************************************************************************************
184+
* End of macro definitions
185+
**************************************************************************************/
186+
187+
/* Routine body. PROLOGUE and EPILOGUE come from common.h and are not visible here;
 * presumably they emit the entry symbol and its closing directives - confirm there.
 *
 * Packs 32-bit elements from the source at A00 into the destination at B00:
 *   1. N / 4 groups of four source lines (A01..A04, each LDA bytes after the previous),
 *   2. then one group of two lines when bit 1 of N is set,
 *   3. then one single line when bit 0 of N is set.
 * Inside every group, M / 4 iterations handle four elements per line, then M & 3
 * iterations handle one element per line. B00 only ever moves forward.
 * Returns 0 in x0.
 *
 * NOTE(review): the local labels carry a .Ldgemm_ prefix although the file is named
 * sgemm_ncopy_4.S; they are file-local, so this looks cosmetic - confirm. */
PROLOGUE
188+
189+
.align 5
190+
191+
SAVE_REGS
192+
193+
lsl LDA, LDA, #2 // LDA = LDA * SIZE
194+
195+
.Ldgemm_ncopy_L4_BEGIN:
196+
197+
asr J, N, #2 // J = N / 4
198+
cmp J, #0
199+
ble .Ldgemm_ncopy_L2_BEGIN // no complete group of four: go to the remainder handling
200+
201+
.align 5
202+
.Ldgemm_ncopy_L4_M4_BEGIN:
203+
204+
mov A01, A00 // claim four consecutive source lines for this group
205+
add A02, A01, LDA
206+
add A03, A02, LDA
207+
add A04, A03, LDA
208+
add A00, A04, LDA // A00 now points past the group, ready for the next J iteration
209+
210+
asr I, M, #2 // I = M / 4
211+
cmp I, #0
212+
ble .Ldgemm_ncopy_L4_M4_40 // fewer than four elements per line: only the tail loop runs
213+
214+
.align 5
215+
.Ldgemm_ncopy_L4_M4_20:
216+
217+
COPY4x4
218+
219+
subs I , I , #1
220+
bne .Ldgemm_ncopy_L4_M4_20
221+
222+
.Ldgemm_ncopy_L4_M4_40:
223+
224+
and I, M , #3 // leftover elements per line after the 4-wide loop
225+
cmp I, #0
226+
ble .Ldgemm_ncopy_L4_M4_END
227+
228+
.align 5
229+
.Ldgemm_ncopy_L4_M4_60:
230+
231+
COPY1x4
232+
233+
subs I , I , #1
234+
bne .Ldgemm_ncopy_L4_M4_60
235+
236+
.Ldgemm_ncopy_L4_M4_END:
237+
238+
subs J , J, #1 // j--
239+
bne .Ldgemm_ncopy_L4_M4_BEGIN
240+
241+
/*********************************************************************************************/
242+
243+
.Ldgemm_ncopy_L2_BEGIN:
244+
245+
tst N, #3 // tst clears V and the 2-bit masked result cannot be negative,
246+
ble .Ldgemm_ncopy_L999 // so ble is taken only when the masked value is zero: nothing left to copy
247+
248+
tst N, #2
249+
ble .Ldgemm_ncopy_L1_BEGIN // bit 1 clear: skip the two-line group
250+
251+
.Ldgemm_ncopy_L2_M4_BEGIN:
252+
mov A01, A00 // claim two consecutive source lines
253+
add A02, A01, LDA
254+
add A00, A02, LDA // A00 now points past them, for the single-line stage
255+
256+
asr I, M, #2 // I = M / 4
257+
cmp I, #0
258+
ble .Ldgemm_ncopy_L2_M4_40
259+
260+
.align 5
261+
.Ldgemm_ncopy_L2_M4_20:
262+
263+
COPY4x2
264+
265+
subs I , I , #1
266+
bne .Ldgemm_ncopy_L2_M4_20
267+
268+
.Ldgemm_ncopy_L2_M4_40:
269+
270+
and I, M , #3 // leftover elements per line after the 4-wide loop
271+
cmp I, #0
272+
ble .Ldgemm_ncopy_L2_M4_END
273+
274+
.align 5
275+
.Ldgemm_ncopy_L2_M4_60:
276+
277+
COPY1x2
278+
279+
subs I , I , #1
280+
bne .Ldgemm_ncopy_L2_M4_60
281+
282+
.Ldgemm_ncopy_L2_M4_END:
283+
284+
285+
/*********************************************************************************************/
286+
287+
.Ldgemm_ncopy_L1_BEGIN:
288+
289+
tst N, #1
290+
ble .Ldgemm_ncopy_L999 // bit 0 clear: no single trailing line
291+
292+
.Ldgemm_ncopy_L1_M4_BEGIN:
293+
294+
mov A01, A00 // last line; A00 is not advanced again
295+
296+
asr I, M, #2 // I = M / 4
297+
cmp I, #0
298+
ble .Ldgemm_ncopy_L1_M4_40
299+
300+
.align 5
301+
.Ldgemm_ncopy_L1_M4_20:
302+
303+
COPY4x1
304+
305+
subs I , I , #1
306+
bne .Ldgemm_ncopy_L1_M4_20
307+
308+
309+
.Ldgemm_ncopy_L1_M4_40:
310+
311+
and I, M , #3 // leftover elements after the 4-wide loop
312+
cmp I, #0
313+
ble .Ldgemm_ncopy_L1_M4_END
314+
315+
.align 5
316+
.Ldgemm_ncopy_L1_M4_60:
317+
318+
COPY1x1
319+
320+
subs I , I , #1
321+
bne .Ldgemm_ncopy_L1_M4_60
322+
323+
324+
.Ldgemm_ncopy_L1_M4_END:
325+
326+
.Ldgemm_ncopy_L999:
327+
328+
mov x0, #0 // return value 0
329+
RESTORE_REGS
330+
ret
331+
332+
EPILOGUE
333+

0 commit comments

Comments
 (0)