Skip to content

Commit a978ad3

Browse files
author
pengxu
committed
Loongarch64: add C functions of zgemm_ncopy_16
1 parent 0ccb050 commit a978ad3

File tree

1 file changed

+332
-0
lines changed

1 file changed

+332
-0
lines changed

kernel/generic/zgemm_ncopy_16.c

Lines changed: 332 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,332 @@
1+
/*********************************************************************/
2+
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* All rights reserved. */
4+
/* */
5+
/* Redistribution and use in source and binary forms, with or */
6+
/* without modification, are permitted provided that the following */
7+
/* conditions are met: */
8+
/* */
9+
/* 1. Redistributions of source code must retain the above */
10+
/* copyright notice, this list of conditions and the following */
11+
/* disclaimer. */
12+
/* */
13+
/* 2. Redistributions in binary form must reproduce the above */
14+
/* copyright notice, this list of conditions and the following */
15+
/* disclaimer in the documentation and/or other materials */
16+
/* provided with the distribution. */
17+
/* */
18+
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19+
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20+
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21+
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22+
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23+
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24+
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25+
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26+
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27+
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28+
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29+
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30+
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31+
/* POSSIBILITY OF SUCH DAMAGE. */
32+
/* */
33+
/* The views and conclusions contained in the software and */
34+
/* documentation are those of the authors and should not be */
35+
/* interpreted as representing official policies, either expressed */
36+
/* or implied, of The University of Texas at Austin. */
37+
/*********************************************************************/
38+
39+
#include <stdio.h>
40+
#include "common.h"
41+
42+
/*
 * Pack an m-by-n block of two-scalar elements from `a` into the contiguous
 * buffer `b` (presumably complex values stored as adjacent real/imaginary
 * scalars, going by the file name zgemm_ncopy_16 -- confirm against callers).
 *
 * Source layout: `a` is read as n streams ("columns") whose starting points
 * are `lda` elements apart; `lda` is doubled internally to turn it into a
 * stride in scalars.  From each column, m consecutive elements are read.
 *
 * Destination layout: the columns are consumed in panels, widest first:
 * as many 16-column panels as fit (n >> 4), then at most one panel each of
 * width 8, 4, 2 and 1, selected by the corresponding bit of n.  Within a
 * panel of width w, each of the m rows contributes the w elements of that
 * row (2 * w scalars), column after column, before the next row begins.
 *
 * Parameters:
 *   m    number of elements read from every column (rows)
 *   n    number of columns
 *   a    source buffer
 *   lda  distance between consecutive columns of `a`, in elements
 *   b    destination buffer; 2 * m * n scalars are written when m, n > 0
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
  IFLOAT *panel;       /* first column of the panel currently being packed */
  IFLOAT *src;         /* element being read from the current column       */
  IFLOAT *dst;         /* next free slot in the packed output              */
  IFLOAT  staged[32];  /* one row of the widest panel: 16 columns * 2      */
  BLASLONG width, panels, row, col;

  panel = a;
  dst   = b;
  lda  *= 2;           /* column stride in scalars: two scalars per element */

  for (width = 16; width >= 1; width >>= 1){

    /* The widest panel repeats n / 16 times; every narrower width occurs */
    /* at most once, exactly when its bit is set in n.                    */
    if (width == 16){
      panels = (n >> 4);
    } else {
      panels = (n & width) ? 1 : 0;
    }

    for (; panels > 0; panels--){

      for (row = 0; row < m; row++){

        /* Gather the whole row first, then emit it, so that every load */
        /* of a row is performed before any store of that row.          */
        for (col = 0; col < width; col++){
          src = panel + col * lda + 2 * row;
          staged[2 * col + 0] = *(src + 0);
          staged[2 * col + 1] = *(src + 1);
        }

        for (col = 0; col < 2 * width; col++){
          *(dst + col) = staged[col];
        }

        dst += 2 * width;
      }

      /* Step over the columns that were just packed. */
      panel += width * lda;
    }
  }

  return 0;
}

0 commit comments

Comments
 (0)