Skip to content

Commit 4787a55

Browse files
author
pengxu
committed
Optimized cgemm kernel 16x4 LASX for LoongArch
1 parent ebbf5b3 commit 4787a55

20 files changed

+19828
-7
lines changed

kernel/generic/zhemm_ltcopy_16.c

Lines changed: 1170 additions & 0 deletions
Large diffs are not rendered by default.

kernel/generic/zhemm_utcopy_16.c

Lines changed: 1168 additions & 0 deletions
Large diffs are not rendered by default.

kernel/generic/zneg_tcopy_16.c

Lines changed: 587 additions & 0 deletions
Large diffs are not rendered by default.

kernel/generic/zsymm_lcopy_16.c

Lines changed: 333 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
/*******************************************************************************
2+
Copyright (c) 2024, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*******************************************************************************/
27+
28+
#include <stdio.h>
29+
#include "common.h"
30+
31+
/*
 * Pack one panel of `width` adjacent lanes into b and return the advanced b.
 *
 * Lane k (0 <= k < width) is tied to the index X = posX + k. A running index
 * Y starts at posY and grows by one on each of the m steps. At every step the
 * lane reads the two consecutive FLOATs located at
 *
 *     a[max(X, Y) * 2 + min(X, Y) * lda]
 *
 * i.e. while X > Y the lane walks with stride lda, and from X == Y onwards it
 * walks with stride 2. (Presumably this is "read only the lower triangle of a
 * column-major complex matrix and mirror it" -- confirm against the callers.)
 *
 * width : number of lanes; callers in this file pass 16, 8, 4, 2 or 1
 * m     : number of steps; nothing is written when m <= 0
 * a     : source data, two FLOATs per element
 * lda   : stride in FLOAT units (the caller has already doubled it)
 * posX  : index of lane 0
 * posY  : starting value of the running index
 * b     : destination; each step appends 2 * width FLOATs, lanes in order
 */
static inline FLOAT *symm_lcopy_panel(BLASLONG width, BLASLONG m, FLOAT *a, BLASLONG lda,
                                      BLASLONG posX, BLASLONG posY, FLOAT *b){

  FLOAT   *lane[16];
  FLOAT    staged[32];
  BLASLONG diag = posX - posY;
  BLASLONG k, step;

  /* Choose the starting address of every lane from the side of the
     diagonal it begins on. */
  for (k = 0; k < width; k++) {
    if (diag > -k) {
      lane[k] = a + (posX + k) * 2 + posY * lda;
    } else {
      lane[k] = a + posY * 2 + (posX + k) * lda;
    }
  }

  for (step = m; step > 0; step--) {

    /* Read every lane first, then write: the stores to b happen only after
       all loads of this step have been performed. */
    for (k = 0; k < width; k++) {
      staged[2 * k + 0] = lane[k][0];
      staged[2 * k + 1] = lane[k][1];

      /* The stride is decided with the value of diag *before* it is
         decremented for the next step. */
      lane[k] += (diag > -k) ? lda : 2;
    }

    for (k = 0; k < 2 * width; k++) {
      b[k] = staged[k];
    }

    b += 2 * width;
    diag--;
  }

  return b;
}

/*
 * Pack n lanes starting at posX, for m steps starting at posY, from a into b.
 *
 * The lanes are processed in panels: first as many 16-wide panels as fit
 * (n >> 4), then at most one panel each of width 8, 4, 2 and 1 according to
 * the low bits of n. Panels are written to b back to back, and posX moves
 * forward by the panel width after each one.
 *
 * lda is given in elements of two FLOATs and is converted to FLOAT units
 * here. Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG remaining;

  lda *= 2;

  for (remaining = (n >> 4); remaining > 0; remaining--) {
    b = symm_lcopy_panel(16, m, a, lda, posX, posY, b);
    posX += 16;
  }

  if (n & 8) {
    b = symm_lcopy_panel(8, m, a, lda, posX, posY, b);
    posX += 8;
  }

  if (n & 4) {
    b = symm_lcopy_panel(4, m, a, lda, posX, posY, b);
    posX += 4;
  }

  if (n & 2) {
    b = symm_lcopy_panel(2, m, a, lda, posX, posY, b);
    posX += 2;
  }

  if (n & 1) {
    b = symm_lcopy_panel(1, m, a, lda, posX, posY, b);
  }

  return 0;
}

0 commit comments

Comments
 (0)