
Commit 84a268b

Use SVE zgemm/cgemm on Arm(R) Neoverse(TM) V1 core
This patch removes the prefetches from cgemm/zgemm, which improves performance in the same way #3868 did for sgemm/dgemm, so I'm happy to enable this on any applicable cores. I also replicated the unrolling of the copies from sgemm and dgemm.
1 parent 9ba9c8b commit 84a268b
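
As a rough sketch of the unrolling mentioned in the message above (the same shape appears in the new copy source at the end of this diff); copy_rows_unrolled and copy_one_row are hypothetical names, not code from the commit:

/* Rough illustration of the 4x unrolling pattern used in the copy routines:
 * handle blocks of four rows, then mop up the remaining two and one rows
 * using the low bits of m. copy_one_row is a placeholder callback. */
static void copy_rows_unrolled(long m, void (*copy_one_row)(long row)) {
    long i = 0;
    for (long cnt = m >> 2; cnt > 0; cnt--, i += 4) { /* four rows per pass */
        copy_one_row(i);
        copy_one_row(i + 1);
        copy_one_row(i + 2);
        copy_one_row(i + 3);
    }
    if (m & 2) { copy_one_row(i); copy_one_row(i + 1); i += 2; } /* two left */
    if (m & 1) { copy_one_row(i); }                              /* one left */
}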

7 files changed: +244 / -151 lines


kernel/arm64/KERNEL.ARMV8SVE

Lines changed: 4 additions & 4 deletions
@@ -160,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c
 CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 
-CGEMMINCOPY = cgemm_ncopy_sve_v1.c
-CGEMMITCOPY = cgemm_tcopy_sve_v1.c
+CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
+CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
 CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 
@@ -184,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c
 ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 
-ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
+ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
+ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
 ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 

kernel/arm64/KERNEL.NEOVERSEV1

Lines changed: 0 additions & 65 deletions
@@ -1,66 +1 @@
 include $(KERNELDIR)/KERNEL.ARMV8SVE
-
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-CTRMMUNCOPY_M =
-CTRMMLNCOPY_M =
-CTRMMUTCOPY_M =
-CTRMMLTCOPY_M =
-CHEMMLTCOPY_M =
-CHEMMUTCOPY_M =
-CSYMMUCOPY_M =
-CSYMMLCOPY_M =
-
-CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-else
-CGEMMINCOPYOBJ =
-CGEMMITCOPYOBJ =
-endif
-CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMCOPYLN_M =
-ZTRSMCOPYLT_M =
-ZTRSMCOPYUN_M =
-ZTRSMCOPYUT_M =
-
-ZTRMMUNCOPY_M =
-ZTRMMLNCOPY_M =
-ZTRMMUTCOPY_M =
-ZTRMMLTCOPY_M =
-ZHEMMLTCOPY_M =
-ZHEMMUTCOPY_M =
-ZSYMMUCOPY_M =
-ZSYMMLCOPY_M =
-
-ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-else
-ZGEMMINCOPYOBJ =
-ZGEMMITCOPYOBJ =
-endif
-ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

kernel/arm64/cgemm_kernel_sve_v1x4.S

Lines changed: 0 additions & 39 deletions
@@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     add pB, pB, 32
 
-    prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M1
@@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ld1rw z15.s, p0/z, [pB, 28]
 
     add pB, pB, 32
-    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
-
-    prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M2
@@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     OP_ri z23.s, p1/m, z2.s, z15.s
     ld1rw z15.s, p0/z, [pB, 28]
 
-    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
-
     add pB, pB, 32
-
-    prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_E
@@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     OP_ii z22.s, p1/m, z3.s, z15.s
     OP_ri z23.s, p1/m, z2.s, z15.s
 
-    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
-
-    prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-
 .endm
 
 .macro KERNELv1x4_SUB
@@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     OP_ii z22.s, p1/m, z1.s, z15.s
     OP_ri z23.s, p1/m, z0.s, z15.s
 
-    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
-    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
 .endm
 
 .macro SAVEv1x4
-    prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
     ld2w {z24.s, z25.s}, p1/z, [pCRow0]
     fmla z24.s, p1/m, z16.s, alphaz_R
     fmls z24.s, p1/m, z17.s, alphaz_I
@@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     st2w {z26.s, z27.s}, p1, [pCRow1]
 
     add pCRow1, pCRow1, lanes, lsl #3
-    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
     ld2w {z28.s, z29.s}, p1/z, [pCRow2]
     fmla z28.s, p1/m, z20.s, alphaz_R
     fmls z28.s, p1/m, z21.s, alphaz_I
@@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fmla z31.s, p1/m, z23.s, alphaz_R
     st2w {z30.s, z31.s}, p1, [pCRow3]
 
-    prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
     add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4
 
-    prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x2
-    prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
     ld2w {z24.s, z25.s}, p1/z, [pCRow0]
     fmla z24.s, p1/m, z16.s, alphaz_R
     fmls z24.s, p1/m, z17.s, alphaz_I
@@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     st2w {z26.s, z27.s}, p1, [pCRow1]
 
     add pCRow1, pCRow1, lanes, lsl #3
-    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
-    prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x1
-    prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
     ld2w {z24.s, z25.s}, p1/z, [pCRow0]
     fmla z24.s, p1/m, z16.s, alphaz_R
     fmls z24.s, p1/m, z17.s, alphaz_I
@@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4
 
-    prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     stp x26, x27, [sp, #(9 * 16)]
     str x28, [sp, #(10 * 16)]
 
-    prfm PLDL1KEEP, [origPB]
-    prfm PLDL1KEEP, [origPA]
-
     fmov alphaR, s0
     dup alphaz_R, alphaR
     fmov alphaI, s1
@@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     bne .Lcgemm_kernel_L4_Mv1_46
 
 .Lcgemm_kernel_L4_Mv1_100:
-    prfm PLDL1KEEP, [pA]
-    prfm PLDL1KEEP, [pA, #64]
-    prfm PLDL1KEEP, [origPB]
-
     SAVEv1x4
 
 .Lcgemm_kernel_L4_Mv1_END:
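
For context on the SAVEv1xN changes above: with the prefetches dropped, the macros still perform the complex alpha update on C via the fmla/fmls pairs with alphaz_R/alphaz_I. A minimal scalar sketch of that per-lane update, assuming the non-conjugated case; the function and its names are illustrative, not part of the kernel:

#include <complex.h>

/* Hypothetical scalar view of the C update done by the SAVEv1xN macros:
 * each fmla/fmls pair applies one half of the complex multiply
 * C[k] += alpha * acc[k], where acc is the accumulated A*B product. */
static void save_v1_scalar(float complex *c, const float complex *acc,
                           float complex alpha, int lanes) {
    for (int k = 0; k < lanes; k++)
        c[k] += alpha * acc[k];
}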
Lines changed: 121 additions & 0 deletions
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>

#include "common.h"

#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_INDEX svuint64_t
#define SV_INDEXER svindex_u64
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_INDEX svuint32_t
#define SV_INDEXER svindex_u32
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif

/* Gather one vector of real parts and one of imaginary parts (lanes are lda
   complex elements apart), store them interleaved into the packed buffer,
   then advance a by one complex element and b by one packed row. */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
  a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); \
  a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \
  svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); \
  a_offset_inner += 2; \
  b_offset += active * 2;

int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
  uint64_t sve_size;
  asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); /* elements per SVE vector */

  IFLOAT *a_offset, *a_offset_inner, *b_offset;
  a_offset = a;
  b_offset = b;

  SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);
  SV_TYPE a_vec_real;
  SV_TYPE a_vec_imag;
  svbool_t pg_true = SV_TRUE();

  BLASLONG single_vectors_n = n & -sve_size;
  for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
    a_offset_inner = a_offset;

    svbool_t pg = pg_true;
    uint64_t active = sve_size;
    uint64_t i_cnt = m >> 2; /* main loop unrolled by four */
    while (i_cnt--) {
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
    }

    if (m & 2) {
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
    }

    if (m & 1) {
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
    }

    a_offset += sve_size * lda * 2;
  }

  BLASLONG remaining_n = n - single_vectors_n;
  if (remaining_n) {
    a_offset_inner = a_offset;
    svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); /* partial last panel */
    uint64_t active = remaining_n;
    uint64_t i_cnt = m >> 2;
    while (i_cnt--) {
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
    }

    if (m & 2) {
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
    }

    if (m & 1) {
      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
    }
  }

  return 0;
}
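
To make the new copy routine easier to follow, here is a plain-C sketch of the layout it appears to produce, assuming single precision and column-major A with leading dimension lda counted in complex elements; ncopy_complex_ref and panel are hypothetical names, with panel standing in for the SVE vector length used above:

/* Hypothetical scalar reference: pack A into panels of `panel` columns,
 * written out row by row, with real and imaginary parts kept interleaved.
 * A final narrower panel covers the leftover n % panel columns. */
static void ncopy_complex_ref(long m, long n, const float *a, long lda,
                              float *b, long panel) {
    long j;
    for (j = 0; j + panel <= n; j += panel)        /* full-width panels */
        for (long i = 0; i < m; i++)               /* one row of the panel */
            for (long k = 0; k < panel; k++) {
                *b++ = a[2 * ((j + k) * lda + i)];     /* real part */
                *b++ = a[2 * ((j + k) * lda + i) + 1]; /* imaginary part */
            }
    for (long i = 0; i < m && j < n; i++)          /* tail panel, width n - j */
        for (long k = 0; j + k < n; k++) {
            *b++ = a[2 * ((j + k) * lda + i)];
            *b++ = a[2 * ((j + k) * lda + i) + 1];
        }
}

Each packed row of a panel holds `panel` (or fewer, for the tail) consecutive columns with real/imaginary interleaved, which appears to match what the SVE v1xN kernels read back with ld2w.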

0 commit comments
