Skip to content

Commit 76bb74f

Browse files
authored
Merge pull request #2012 from maamountki/z14
[ZARCH] Many improvements
2 parents 63d7bad + 0a54c98 commit 76bb74f

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

67 files changed

+13486
-14601
lines changed

kernel/zarch/camax.c

Lines changed: 172 additions & 198 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/***************************************************************************
2-
Copyright (c) 2013-2017, The OpenBLAS Project
2+
Copyright (c) 2013-2019, The OpenBLAS Project
33
All rights reserved.
44
Redistribution and use in source and binary forms, with or without
55
modification, are permitted provided that the following conditions are
@@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828
#include "common.h"
2929
#include <math.h>
3030

31-
#if defined(DOUBLE)
32-
#define ABS fabs
33-
#else
34-
#define ABS fabsf
35-
#endif
36-
37-
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
38-
39-
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
40-
{
41-
FLOAT amax;
42-
43-
__asm__ volatile (
44-
"vlef %%v0,0(%2),0 \n\t"
45-
"vlef %%v16,4(%2),0 \n\t"
46-
"vlef %%v0,8(%2),1 \n\t"
47-
"vlef %%v16,12(%2),1 \n\t"
48-
"vlef %%v0,16(%2),2 \n\t"
49-
"vlef %%v16,20(%2),2 \n\t"
50-
"vlef %%v0,24(%2),3 \n\t"
51-
"vlef %%v16,28(%2),3 \n\t"
52-
"vflpsb %%v0,%%v0 \n\t"
53-
"vflpsb %%v16,%%v16 \n\t"
54-
"vfasb %%v0,%%v0,%%v16 \n\t"
55-
"vleib %%v1,0,0 \n\t"
56-
"vleib %%v1,1,1 \n\t"
57-
"vleib %%v1,2,2 \n\t"
58-
"vleib %%v1,3,3 \n\t"
59-
"vleib %%v1,8,4 \n\t"
60-
"vleib %%v1,9,5 \n\t"
61-
"vleib %%v1,10,6 \n\t"
62-
"vleib %%v1,11,7 \n\t"
63-
"vleib %%v1,16,8 \n\t"
64-
"vleib %%v1,17,9 \n\t"
65-
"vleib %%v1,18,10 \n\t"
66-
"vleib %%v1,19,11 \n\t"
67-
"vleib %%v1,24,12 \n\t"
68-
"vleib %%v1,25,13 \n\t"
69-
"vleib %%v1,26,14 \n\t"
70-
"vleib %%v1,27,15 \n\t"
71-
"srlg %%r0,%1,5 \n\t"
72-
"xgr %%r1,%%r1 \n\t"
73-
"0: \n\t"
74-
"pfd 1, 1024(%%r1,%2) \n\t"
75-
76-
"vl %%v16,0(%%r1,%2) \n\t"
77-
"vl %%v2,16(%%r1,%2) \n\t"
78-
"vpkg %%v17,%%v16,%%v2 \n\t"
79-
"vperm %%v16,%%v16,%%v2,%%v1 \n\t"
80-
81-
"vl %%v18,32(%%r1,%2) \n\t"
82-
"vl %%v2,48(%%r1,%2) \n\t"
83-
"vpkg %%v19,%%v18,%%v2 \n\t"
84-
"vperm %%v18,%%v18,%%v2,%%v1 \n\t"
85-
86-
"vl %%v20,64(%%r1,%2) \n\t"
87-
"vl %%v2,80(%%r1,%2) \n\t"
88-
"vpkg %%v21,%%v20,%%v2 \n\t"
89-
"vperm %%v20,%%v20,%%v2,%%v1 \n\t"
90-
91-
"vl %%v22,96(%%r1,%2) \n\t"
92-
"vl %%v2,112(%%r1,%2) \n\t"
93-
"vpkg %%v23,%%v22,%%v2 \n\t"
94-
"vperm %%v22,%%v22,%%v2,%%v1 \n\t"
95-
96-
"vl %%v24,128(%%r1,%2) \n\t"
97-
"vl %%v2,144(%%r1,%2) \n\t"
98-
"vpkg %%v25,%%v24,%%v2 \n\t"
99-
"vperm %%v24,%%v24,%%v2,%%v1 \n\t"
100-
101-
"vl %%v26,160(%%r1,%2) \n\t"
102-
"vl %%v2,176(%%r1,%2) \n\t"
103-
"vpkg %%v27,%%v26,%%v2 \n\t"
104-
"vperm %%v26,%%v26,%%v2,%%v1 \n\t"
105-
106-
"vl %%v28,192(%%r1,%2) \n\t"
107-
"vl %%v2,208(%%r1,%2) \n\t"
108-
"vpkg %%v29,%%v28,%%v2 \n\t"
109-
"vperm %%v28,%%v28,%%v2,%%v1 \n\t"
110-
111-
"vl %%v30,224(%%r1,%2) \n\t"
112-
"vl %%v2,240(%%r1,%2) \n\t"
113-
"vpkg %%v31,%%v30,%%v2 \n\t"
114-
"vperm %%v30,%%v30,%%v2,%%v1 \n\t"
115-
116-
"vflpsb %%v16,%%v16 \n\t"
117-
"vflpsb %%v17,%%v17 \n\t"
118-
"vflpsb %%v18,%%v18 \n\t"
119-
"vflpsb %%v19,%%v19 \n\t"
120-
"vflpsb %%v20,%%v20 \n\t"
121-
"vflpsb %%v21,%%v21 \n\t"
122-
"vflpsb %%v22,%%v22 \n\t"
123-
"vflpsb %%v23,%%v23 \n\t"
124-
"vflpsb %%v24,%%v24 \n\t"
125-
"vflpsb %%v25,%%v25 \n\t"
126-
"vflpsb %%v26,%%v26 \n\t"
127-
"vflpsb %%v27,%%v27 \n\t"
128-
"vflpsb %%v28,%%v28 \n\t"
129-
"vflpsb %%v29,%%v29 \n\t"
130-
"vflpsb %%v30,%%v30 \n\t"
131-
"vflpsb %%v31,%%v31 \n\t"
132-
133-
"vfasb %%v16,%%v16,%%v17 \n\t"
134-
"vfasb %%v18,%%v18,%%v19 \n\t"
135-
"vfasb %%v20,%%v20,%%v21 \n\t"
136-
"vfasb %%v22,%%v22,%%v23 \n\t"
137-
"vfasb %%v24,%%v24,%%v25 \n\t"
138-
"vfasb %%v26,%%v26,%%v27 \n\t"
139-
"vfasb %%v28,%%v28,%%v29 \n\t"
140-
"vfasb %%v30,%%v30,%%v31 \n\t"
141-
142-
"vfmaxsb %%v16,%%v16,%%v24,0 \n\t"
143-
"vfmaxsb %%v18,%%v18,%%v26,0 \n\t"
144-
"vfmaxsb %%v20,%%v20,%%v28,0 \n\t"
145-
"vfmaxsb %%v22,%%v22,%%v30,0 \n\t"
146-
147-
"vfmaxsb %%v16,%%v16,%%v20,0 \n\t"
148-
"vfmaxsb %%v18,%%v18,%%v22,0 \n\t"
149-
150-
"vfmaxsb %%v16,%%v16,%%v18,0 \n\t"
151-
152-
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
153-
154-
"agfi %%r1, 256 \n\t"
155-
"brctg %%r0, 0b \n\t"
156-
157-
"veslg %%v16,%%v0,32 \n\t"
158-
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
159-
160-
"vrepf %%v16,%%v0,2 \n\t"
161-
"wfmaxsb %%v0,%%v0,%%v16,0 \n\t"
162-
"ler %0,%%f0 "
163-
:"=f"(amax)
164-
:"r"(n),"ZR"((const FLOAT (*)[n])x)
165-
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
166-
);
167-
168-
return amax;
31+
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
32+
33+
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) {
34+
FLOAT amax;
35+
36+
__asm__("vlef %%v0,0(%[x]),0\n\t"
37+
"vlef %%v16,4(%[x]),0\n\t"
38+
"vlef %%v0,8(%[x]),1\n\t"
39+
"vlef %%v16,12(%[x]),1\n\t"
40+
"vlef %%v0,16(%[x]),2\n\t"
41+
"vlef %%v16,20(%[x]),2\n\t"
42+
"vlef %%v0,24(%[x]),3\n\t"
43+
"vlef %%v16,28(%[x]),3\n\t"
44+
"vflpsb %%v0,%%v0\n\t"
45+
"vflpsb %%v16,%%v16\n\t"
46+
"vfasb %%v0,%%v0,%%v16\n\t"
47+
"vleib %%v1,0,0\n\t"
48+
"vleib %%v1,1,1\n\t"
49+
"vleib %%v1,2,2\n\t"
50+
"vleib %%v1,3,3\n\t"
51+
"vleib %%v1,8,4\n\t"
52+
"vleib %%v1,9,5\n\t"
53+
"vleib %%v1,10,6\n\t"
54+
"vleib %%v1,11,7\n\t"
55+
"vleib %%v1,16,8\n\t"
56+
"vleib %%v1,17,9\n\t"
57+
"vleib %%v1,18,10\n\t"
58+
"vleib %%v1,19,11\n\t"
59+
"vleib %%v1,24,12\n\t"
60+
"vleib %%v1,25,13\n\t"
61+
"vleib %%v1,26,14\n\t"
62+
"vleib %%v1,27,15\n\t"
63+
"srlg %[n],%[n],5\n\t"
64+
"xgr %%r1,%%r1\n\t"
65+
"0:\n\t"
66+
"pfd 1, 1024(%%r1,%[x])\n\t"
67+
"vl %%v16,0(%%r1,%[x])\n\t"
68+
"vl %%v2,16(%%r1,%[x])\n\t"
69+
"vpkg %%v17,%%v16,%%v2\n\t"
70+
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
71+
"vl %%v18,32(%%r1,%[x])\n\t"
72+
"vl %%v2,48(%%r1,%[x])\n\t"
73+
"vpkg %%v19,%%v18,%%v2\n\t"
74+
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
75+
"vl %%v20,64(%%r1,%[x])\n\t"
76+
"vl %%v2,80(%%r1,%[x])\n\t"
77+
"vpkg %%v21,%%v20,%%v2\n\t"
78+
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
79+
"vl %%v22,96(%%r1,%[x])\n\t"
80+
"vl %%v2,112(%%r1,%[x])\n\t"
81+
"vpkg %%v23,%%v22,%%v2\n\t"
82+
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
83+
"vl %%v24,128(%%r1,%[x])\n\t"
84+
"vl %%v2,144(%%r1,%[x])\n\t"
85+
"vpkg %%v25,%%v24,%%v2\n\t"
86+
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
87+
"vl %%v26,160(%%r1,%[x])\n\t"
88+
"vl %%v2,176(%%r1,%[x])\n\t"
89+
"vpkg %%v27,%%v26,%%v2\n\t"
90+
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
91+
"vl %%v28,192(%%r1,%[x])\n\t"
92+
"vl %%v2,208(%%r1,%[x])\n\t"
93+
"vpkg %%v29,%%v28,%%v2\n\t"
94+
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
95+
"vl %%v30,224(%%r1,%[x])\n\t"
96+
"vl %%v2,240(%%r1,%[x])\n\t"
97+
"vpkg %%v31,%%v30,%%v2\n\t"
98+
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
99+
"vflpsb %%v16,%%v16\n\t"
100+
"vflpsb %%v17,%%v17\n\t"
101+
"vflpsb %%v18,%%v18\n\t"
102+
"vflpsb %%v19,%%v19\n\t"
103+
"vflpsb %%v20,%%v20\n\t"
104+
"vflpsb %%v21,%%v21\n\t"
105+
"vflpsb %%v22,%%v22\n\t"
106+
"vflpsb %%v23,%%v23\n\t"
107+
"vflpsb %%v24,%%v24\n\t"
108+
"vflpsb %%v25,%%v25\n\t"
109+
"vflpsb %%v26,%%v26\n\t"
110+
"vflpsb %%v27,%%v27\n\t"
111+
"vflpsb %%v28,%%v28\n\t"
112+
"vflpsb %%v29,%%v29\n\t"
113+
"vflpsb %%v30,%%v30\n\t"
114+
"vflpsb %%v31,%%v31\n\t"
115+
"vfasb %%v16,%%v16,%%v17\n\t"
116+
"vfasb %%v18,%%v18,%%v19\n\t"
117+
"vfasb %%v20,%%v20,%%v21\n\t"
118+
"vfasb %%v22,%%v22,%%v23\n\t"
119+
"vfasb %%v24,%%v24,%%v25\n\t"
120+
"vfasb %%v26,%%v26,%%v27\n\t"
121+
"vfasb %%v28,%%v28,%%v29\n\t"
122+
"vfasb %%v30,%%v30,%%v31\n\t"
123+
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
124+
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
125+
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
126+
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
127+
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
128+
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
129+
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
130+
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
131+
"agfi %%r1, 256\n\t"
132+
"brctg %[n], 0b\n\t"
133+
"veslg %%v16,%%v0,32\n\t"
134+
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
135+
"vrepf %%v16,%%v0,2\n\t"
136+
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
137+
"ler %[amax],%%f0"
138+
: [amax] "=f"(amax),[n] "+&r"(n)
139+
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
140+
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
141+
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
142+
"v31");
143+
144+
return amax;
169145
}
170-
146+
171147
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
172-
BLASLONG i = 0;
173-
BLASLONG ix = 0;
174-
FLOAT maxf = 0.0;
175-
BLASLONG inc_x2;
176-
177-
if (n <= 0 || inc_x <= 0) return (maxf);
178-
179-
if (inc_x == 1) {
180-
181-
BLASLONG n1 = n & -32;
182-
if (n1 > 0) {
183-
184-
maxf = camax_kernel_32(n1, x);
185-
ix = n1 * 2;
186-
i = n1;
187-
}
188-
else
189-
{
190-
maxf=CABS1(x,0);
191-
ix += 2;
192-
i++;
193-
}
194-
195-
while (i < n) {
196-
if (CABS1(x,ix) > maxf) {
197-
maxf = CABS1(x,ix);
198-
}
199-
ix += 2;
200-
i++;
201-
}
202-
return (maxf);
148+
BLASLONG i = 0;
149+
BLASLONG ix = 0;
150+
FLOAT maxf = 0.0;
151+
BLASLONG inc_x2;
152+
153+
if (n <= 0 || inc_x <= 0)
154+
return (maxf);
155+
156+
if (inc_x == 1) {
157+
158+
BLASLONG n1 = n & -32;
159+
if (n1 > 0) {
203160

161+
maxf = camax_kernel_32(n1, x);
162+
ix = n1 * 2;
163+
i = n1;
204164
} else {
165+
maxf = CABS1(x, 0);
166+
ix += 2;
167+
i++;
168+
}
169+
170+
while (i < n) {
171+
if (CABS1(x, ix) > maxf) {
172+
maxf = CABS1(x, ix);
173+
}
174+
ix += 2;
175+
i++;
176+
}
177+
return (maxf);
205178

206-
maxf=CABS1(x,0);
207-
inc_x2 = 2 * inc_x;
179+
} else {
208180

209-
BLASLONG n1 = n & -4;
210-
while (i < n1) {
181+
maxf = CABS1(x, 0);
182+
inc_x2 = 2 * inc_x;
211183

212-
if (CABS1(x,ix) > maxf) {
213-
maxf = CABS1(x,ix);
214-
}
215-
if (CABS1(x,ix+inc_x2) > maxf) {
216-
maxf = CABS1(x,ix+inc_x2);
217-
}
218-
if (CABS1(x,ix+inc_x2*2) > maxf) {
219-
maxf = CABS1(x,ix+inc_x2*2);
220-
}
221-
if (CABS1(x,ix+inc_x2*3) > maxf) {
222-
maxf = CABS1(x,ix+inc_x2*3);
223-
}
184+
BLASLONG n1 = n & -4;
185+
while (i < n1) {
224186

225-
ix += inc_x2 * 4;
187+
if (CABS1(x, ix) > maxf) {
188+
maxf = CABS1(x, ix);
189+
}
190+
if (CABS1(x, ix + inc_x2) > maxf) {
191+
maxf = CABS1(x, ix + inc_x2);
192+
}
193+
if (CABS1(x, ix + inc_x2 * 2) > maxf) {
194+
maxf = CABS1(x, ix + inc_x2 * 2);
195+
}
196+
if (CABS1(x, ix + inc_x2 * 3) > maxf) {
197+
maxf = CABS1(x, ix + inc_x2 * 3);
198+
}
226199

227-
i += 4;
200+
ix += inc_x2 * 4;
228201

229-
}
202+
i += 4;
230203

204+
}
231205

232-
while (i < n) {
233-
if (CABS1(x,ix) > maxf) {
234-
maxf = CABS1(x,ix);
235-
}
236-
ix += inc_x2;
237-
i++;
238-
}
239-
return (maxf);
206+
while (i < n) {
207+
if (CABS1(x, ix) > maxf) {
208+
maxf = CABS1(x, ix);
209+
}
210+
ix += inc_x2;
211+
i++;
240212
}
213+
return (maxf);
214+
}
241215
}

0 commit comments

Comments
 (0)