|
1 | 1 | /***************************************************************************
|
2 |
| -Copyright (c) 2013-2017, The OpenBLAS Project |
| 2 | +Copyright (c) 2013-2019, The OpenBLAS Project |
3 | 3 | All rights reserved.
|
4 | 4 | Redistribution and use in source and binary forms, with or without
|
5 | 5 | modification, are permitted provided that the following conditions are
|
@@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
28 | 28 | #include "common.h"
|
29 | 29 | #include <math.h>
|
30 | 30 |
|
31 |
| -#if defined(DOUBLE) |
32 |
| -#define ABS fabs |
33 |
| -#else |
34 |
| -#define ABS fabsf |
35 |
| -#endif |
36 |
| - |
37 |
| -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) |
38 |
| - |
39 |
| -static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) |
40 |
| -{ |
41 |
| - FLOAT amax; |
42 |
| - |
43 |
| - __asm__ volatile ( |
44 |
| - "vlef %%v0,0(%2),0 \n\t" |
45 |
| - "vlef %%v16,4(%2),0 \n\t" |
46 |
| - "vlef %%v0,8(%2),1 \n\t" |
47 |
| - "vlef %%v16,12(%2),1 \n\t" |
48 |
| - "vlef %%v0,16(%2),2 \n\t" |
49 |
| - "vlef %%v16,20(%2),2 \n\t" |
50 |
| - "vlef %%v0,24(%2),3 \n\t" |
51 |
| - "vlef %%v16,28(%2),3 \n\t" |
52 |
| - "vflpsb %%v0,%%v0 \n\t" |
53 |
| - "vflpsb %%v16,%%v16 \n\t" |
54 |
| - "vfasb %%v0,%%v0,%%v16 \n\t" |
55 |
| - "vleib %%v1,0,0 \n\t" |
56 |
| - "vleib %%v1,1,1 \n\t" |
57 |
| - "vleib %%v1,2,2 \n\t" |
58 |
| - "vleib %%v1,3,3 \n\t" |
59 |
| - "vleib %%v1,8,4 \n\t" |
60 |
| - "vleib %%v1,9,5 \n\t" |
61 |
| - "vleib %%v1,10,6 \n\t" |
62 |
| - "vleib %%v1,11,7 \n\t" |
63 |
| - "vleib %%v1,16,8 \n\t" |
64 |
| - "vleib %%v1,17,9 \n\t" |
65 |
| - "vleib %%v1,18,10 \n\t" |
66 |
| - "vleib %%v1,19,11 \n\t" |
67 |
| - "vleib %%v1,24,12 \n\t" |
68 |
| - "vleib %%v1,25,13 \n\t" |
69 |
| - "vleib %%v1,26,14 \n\t" |
70 |
| - "vleib %%v1,27,15 \n\t" |
71 |
| - "srlg %%r0,%1,5 \n\t" |
72 |
| - "xgr %%r1,%%r1 \n\t" |
73 |
| - "0: \n\t" |
74 |
| - "pfd 1, 1024(%%r1,%2) \n\t" |
75 |
| - |
76 |
| - "vl %%v16,0(%%r1,%2) \n\t" |
77 |
| - "vl %%v2,16(%%r1,%2) \n\t" |
78 |
| - "vpkg %%v17,%%v16,%%v2 \n\t" |
79 |
| - "vperm %%v16,%%v16,%%v2,%%v1 \n\t" |
80 |
| - |
81 |
| - "vl %%v18,32(%%r1,%2) \n\t" |
82 |
| - "vl %%v2,48(%%r1,%2) \n\t" |
83 |
| - "vpkg %%v19,%%v18,%%v2 \n\t" |
84 |
| - "vperm %%v18,%%v18,%%v2,%%v1 \n\t" |
85 |
| - |
86 |
| - "vl %%v20,64(%%r1,%2) \n\t" |
87 |
| - "vl %%v2,80(%%r1,%2) \n\t" |
88 |
| - "vpkg %%v21,%%v20,%%v2 \n\t" |
89 |
| - "vperm %%v20,%%v20,%%v2,%%v1 \n\t" |
90 |
| - |
91 |
| - "vl %%v22,96(%%r1,%2) \n\t" |
92 |
| - "vl %%v2,112(%%r1,%2) \n\t" |
93 |
| - "vpkg %%v23,%%v22,%%v2 \n\t" |
94 |
| - "vperm %%v22,%%v22,%%v2,%%v1 \n\t" |
95 |
| - |
96 |
| - "vl %%v24,128(%%r1,%2) \n\t" |
97 |
| - "vl %%v2,144(%%r1,%2) \n\t" |
98 |
| - "vpkg %%v25,%%v24,%%v2 \n\t" |
99 |
| - "vperm %%v24,%%v24,%%v2,%%v1 \n\t" |
100 |
| - |
101 |
| - "vl %%v26,160(%%r1,%2) \n\t" |
102 |
| - "vl %%v2,176(%%r1,%2) \n\t" |
103 |
| - "vpkg %%v27,%%v26,%%v2 \n\t" |
104 |
| - "vperm %%v26,%%v26,%%v2,%%v1 \n\t" |
105 |
| - |
106 |
| - "vl %%v28,192(%%r1,%2) \n\t" |
107 |
| - "vl %%v2,208(%%r1,%2) \n\t" |
108 |
| - "vpkg %%v29,%%v28,%%v2 \n\t" |
109 |
| - "vperm %%v28,%%v28,%%v2,%%v1 \n\t" |
110 |
| - |
111 |
| - "vl %%v30,224(%%r1,%2) \n\t" |
112 |
| - "vl %%v2,240(%%r1,%2) \n\t" |
113 |
| - "vpkg %%v31,%%v30,%%v2 \n\t" |
114 |
| - "vperm %%v30,%%v30,%%v2,%%v1 \n\t" |
115 |
| - |
116 |
| - "vflpsb %%v16,%%v16 \n\t" |
117 |
| - "vflpsb %%v17,%%v17 \n\t" |
118 |
| - "vflpsb %%v18,%%v18 \n\t" |
119 |
| - "vflpsb %%v19,%%v19 \n\t" |
120 |
| - "vflpsb %%v20,%%v20 \n\t" |
121 |
| - "vflpsb %%v21,%%v21 \n\t" |
122 |
| - "vflpsb %%v22,%%v22 \n\t" |
123 |
| - "vflpsb %%v23,%%v23 \n\t" |
124 |
| - "vflpsb %%v24,%%v24 \n\t" |
125 |
| - "vflpsb %%v25,%%v25 \n\t" |
126 |
| - "vflpsb %%v26,%%v26 \n\t" |
127 |
| - "vflpsb %%v27,%%v27 \n\t" |
128 |
| - "vflpsb %%v28,%%v28 \n\t" |
129 |
| - "vflpsb %%v29,%%v29 \n\t" |
130 |
| - "vflpsb %%v30,%%v30 \n\t" |
131 |
| - "vflpsb %%v31,%%v31 \n\t" |
132 |
| - |
133 |
| - "vfasb %%v16,%%v16,%%v17 \n\t" |
134 |
| - "vfasb %%v18,%%v18,%%v19 \n\t" |
135 |
| - "vfasb %%v20,%%v20,%%v21 \n\t" |
136 |
| - "vfasb %%v22,%%v22,%%v23 \n\t" |
137 |
| - "vfasb %%v24,%%v24,%%v25 \n\t" |
138 |
| - "vfasb %%v26,%%v26,%%v27 \n\t" |
139 |
| - "vfasb %%v28,%%v28,%%v29 \n\t" |
140 |
| - "vfasb %%v30,%%v30,%%v31 \n\t" |
141 |
| - |
142 |
| - "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" |
143 |
| - "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" |
144 |
| - "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" |
145 |
| - "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" |
146 |
| - |
147 |
| - "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" |
148 |
| - "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" |
149 |
| - |
150 |
| - "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" |
151 |
| - |
152 |
| - "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" |
153 |
| - |
154 |
| - "agfi %%r1, 256 \n\t" |
155 |
| - "brctg %%r0, 0b \n\t" |
156 |
| - |
157 |
| - "veslg %%v16,%%v0,32 \n\t" |
158 |
| - "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" |
159 |
| - |
160 |
| - "vrepf %%v16,%%v0,2 \n\t" |
161 |
| - "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" |
162 |
| - "ler %0,%%f0 " |
163 |
| - :"=f"(amax) |
164 |
| - :"r"(n),"ZR"((const FLOAT (*)[n])x) |
165 |
| - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" |
166 |
| - ); |
167 |
| - |
168 |
| - return amax; |
| 31 | +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) |
| 32 | + |
| 33 | +static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { |
| 34 | + FLOAT amax; |
| 35 | + |
| 36 | + __asm__("vlef %%v0,0(%[x]),0\n\t" |
| 37 | + "vlef %%v16,4(%[x]),0\n\t" |
| 38 | + "vlef %%v0,8(%[x]),1\n\t" |
| 39 | + "vlef %%v16,12(%[x]),1\n\t" |
| 40 | + "vlef %%v0,16(%[x]),2\n\t" |
| 41 | + "vlef %%v16,20(%[x]),2\n\t" |
| 42 | + "vlef %%v0,24(%[x]),3\n\t" |
| 43 | + "vlef %%v16,28(%[x]),3\n\t" |
| 44 | + "vflpsb %%v0,%%v0\n\t" |
| 45 | + "vflpsb %%v16,%%v16\n\t" |
| 46 | + "vfasb %%v0,%%v0,%%v16\n\t" |
| 47 | + "vleib %%v1,0,0\n\t" |
| 48 | + "vleib %%v1,1,1\n\t" |
| 49 | + "vleib %%v1,2,2\n\t" |
| 50 | + "vleib %%v1,3,3\n\t" |
| 51 | + "vleib %%v1,8,4\n\t" |
| 52 | + "vleib %%v1,9,5\n\t" |
| 53 | + "vleib %%v1,10,6\n\t" |
| 54 | + "vleib %%v1,11,7\n\t" |
| 55 | + "vleib %%v1,16,8\n\t" |
| 56 | + "vleib %%v1,17,9\n\t" |
| 57 | + "vleib %%v1,18,10\n\t" |
| 58 | + "vleib %%v1,19,11\n\t" |
| 59 | + "vleib %%v1,24,12\n\t" |
| 60 | + "vleib %%v1,25,13\n\t" |
| 61 | + "vleib %%v1,26,14\n\t" |
| 62 | + "vleib %%v1,27,15\n\t" |
| 63 | + "srlg %[n],%[n],5\n\t" |
| 64 | + "xgr %%r1,%%r1\n\t" |
| 65 | + "0:\n\t" |
| 66 | + "pfd 1, 1024(%%r1,%[x])\n\t" |
| 67 | + "vl %%v16,0(%%r1,%[x])\n\t" |
| 68 | + "vl %%v2,16(%%r1,%[x])\n\t" |
| 69 | + "vpkg %%v17,%%v16,%%v2\n\t" |
| 70 | + "vperm %%v16,%%v16,%%v2,%%v1\n\t" |
| 71 | + "vl %%v18,32(%%r1,%[x])\n\t" |
| 72 | + "vl %%v2,48(%%r1,%[x])\n\t" |
| 73 | + "vpkg %%v19,%%v18,%%v2\n\t" |
| 74 | + "vperm %%v18,%%v18,%%v2,%%v1\n\t" |
| 75 | + "vl %%v20,64(%%r1,%[x])\n\t" |
| 76 | + "vl %%v2,80(%%r1,%[x])\n\t" |
| 77 | + "vpkg %%v21,%%v20,%%v2\n\t" |
| 78 | + "vperm %%v20,%%v20,%%v2,%%v1\n\t" |
| 79 | + "vl %%v22,96(%%r1,%[x])\n\t" |
| 80 | + "vl %%v2,112(%%r1,%[x])\n\t" |
| 81 | + "vpkg %%v23,%%v22,%%v2\n\t" |
| 82 | + "vperm %%v22,%%v22,%%v2,%%v1\n\t" |
| 83 | + "vl %%v24,128(%%r1,%[x])\n\t" |
| 84 | + "vl %%v2,144(%%r1,%[x])\n\t" |
| 85 | + "vpkg %%v25,%%v24,%%v2\n\t" |
| 86 | + "vperm %%v24,%%v24,%%v2,%%v1\n\t" |
| 87 | + "vl %%v26,160(%%r1,%[x])\n\t" |
| 88 | + "vl %%v2,176(%%r1,%[x])\n\t" |
| 89 | + "vpkg %%v27,%%v26,%%v2\n\t" |
| 90 | + "vperm %%v26,%%v26,%%v2,%%v1\n\t" |
| 91 | + "vl %%v28,192(%%r1,%[x])\n\t" |
| 92 | + "vl %%v2,208(%%r1,%[x])\n\t" |
| 93 | + "vpkg %%v29,%%v28,%%v2\n\t" |
| 94 | + "vperm %%v28,%%v28,%%v2,%%v1\n\t" |
| 95 | + "vl %%v30,224(%%r1,%[x])\n\t" |
| 96 | + "vl %%v2,240(%%r1,%[x])\n\t" |
| 97 | + "vpkg %%v31,%%v30,%%v2\n\t" |
| 98 | + "vperm %%v30,%%v30,%%v2,%%v1\n\t" |
| 99 | + "vflpsb %%v16,%%v16\n\t" |
| 100 | + "vflpsb %%v17,%%v17\n\t" |
| 101 | + "vflpsb %%v18,%%v18\n\t" |
| 102 | + "vflpsb %%v19,%%v19\n\t" |
| 103 | + "vflpsb %%v20,%%v20\n\t" |
| 104 | + "vflpsb %%v21,%%v21\n\t" |
| 105 | + "vflpsb %%v22,%%v22\n\t" |
| 106 | + "vflpsb %%v23,%%v23\n\t" |
| 107 | + "vflpsb %%v24,%%v24\n\t" |
| 108 | + "vflpsb %%v25,%%v25\n\t" |
| 109 | + "vflpsb %%v26,%%v26\n\t" |
| 110 | + "vflpsb %%v27,%%v27\n\t" |
| 111 | + "vflpsb %%v28,%%v28\n\t" |
| 112 | + "vflpsb %%v29,%%v29\n\t" |
| 113 | + "vflpsb %%v30,%%v30\n\t" |
| 114 | + "vflpsb %%v31,%%v31\n\t" |
| 115 | + "vfasb %%v16,%%v16,%%v17\n\t" |
| 116 | + "vfasb %%v18,%%v18,%%v19\n\t" |
| 117 | + "vfasb %%v20,%%v20,%%v21\n\t" |
| 118 | + "vfasb %%v22,%%v22,%%v23\n\t" |
| 119 | + "vfasb %%v24,%%v24,%%v25\n\t" |
| 120 | + "vfasb %%v26,%%v26,%%v27\n\t" |
| 121 | + "vfasb %%v28,%%v28,%%v29\n\t" |
| 122 | + "vfasb %%v30,%%v30,%%v31\n\t" |
| 123 | + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" |
| 124 | + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" |
| 125 | + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" |
| 126 | + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" |
| 127 | + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" |
| 128 | + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" |
| 129 | + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" |
| 130 | + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" |
| 131 | + "agfi %%r1, 256\n\t" |
| 132 | + "brctg %[n], 0b\n\t" |
| 133 | + "veslg %%v16,%%v0,32\n\t" |
| 134 | + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" |
| 135 | + "vrepf %%v16,%%v0,2\n\t" |
| 136 | + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" |
| 137 | + "ler %[amax],%%f0" |
| 138 | + : [amax] "=f"(amax),[n] "+&r"(n) |
| 139 | + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) |
| 140 | + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", |
| 141 | + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", |
| 142 | + "v31"); |
| 143 | + |
| 144 | + return amax; |
169 | 145 | }
|
170 |
| - |
| 146 | + |
171 | 147 | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
172 |
| - BLASLONG i = 0; |
173 |
| - BLASLONG ix = 0; |
174 |
| - FLOAT maxf = 0.0; |
175 |
| - BLASLONG inc_x2; |
176 |
| - |
177 |
| - if (n <= 0 || inc_x <= 0) return (maxf); |
178 |
| - |
179 |
| - if (inc_x == 1) { |
180 |
| - |
181 |
| - BLASLONG n1 = n & -32; |
182 |
| - if (n1 > 0) { |
183 |
| - |
184 |
| - maxf = camax_kernel_32(n1, x); |
185 |
| - ix = n1 * 2; |
186 |
| - i = n1; |
187 |
| - } |
188 |
| - else |
189 |
| - { |
190 |
| - maxf=CABS1(x,0); |
191 |
| - ix += 2; |
192 |
| - i++; |
193 |
| - } |
194 |
| - |
195 |
| - while (i < n) { |
196 |
| - if (CABS1(x,ix) > maxf) { |
197 |
| - maxf = CABS1(x,ix); |
198 |
| - } |
199 |
| - ix += 2; |
200 |
| - i++; |
201 |
| - } |
202 |
| - return (maxf); |
| 148 | + BLASLONG i = 0; |
| 149 | + BLASLONG ix = 0; |
| 150 | + FLOAT maxf = 0.0; |
| 151 | + BLASLONG inc_x2; |
| 152 | + |
| 153 | + if (n <= 0 || inc_x <= 0) |
| 154 | + return (maxf); |
| 155 | + |
| 156 | + if (inc_x == 1) { |
| 157 | + |
| 158 | + BLASLONG n1 = n & -32; |
| 159 | + if (n1 > 0) { |
203 | 160 |
|
| 161 | + maxf = camax_kernel_32(n1, x); |
| 162 | + ix = n1 * 2; |
| 163 | + i = n1; |
204 | 164 | } else {
|
| 165 | + maxf = CABS1(x, 0); |
| 166 | + ix += 2; |
| 167 | + i++; |
| 168 | + } |
| 169 | + |
| 170 | + while (i < n) { |
| 171 | + if (CABS1(x, ix) > maxf) { |
| 172 | + maxf = CABS1(x, ix); |
| 173 | + } |
| 174 | + ix += 2; |
| 175 | + i++; |
| 176 | + } |
| 177 | + return (maxf); |
205 | 178 |
|
206 |
| - maxf=CABS1(x,0); |
207 |
| - inc_x2 = 2 * inc_x; |
| 179 | + } else { |
208 | 180 |
|
209 |
| - BLASLONG n1 = n & -4; |
210 |
| - while (i < n1) { |
| 181 | + maxf = CABS1(x, 0); |
| 182 | + inc_x2 = 2 * inc_x; |
211 | 183 |
|
212 |
| - if (CABS1(x,ix) > maxf) { |
213 |
| - maxf = CABS1(x,ix); |
214 |
| - } |
215 |
| - if (CABS1(x,ix+inc_x2) > maxf) { |
216 |
| - maxf = CABS1(x,ix+inc_x2); |
217 |
| - } |
218 |
| - if (CABS1(x,ix+inc_x2*2) > maxf) { |
219 |
| - maxf = CABS1(x,ix+inc_x2*2); |
220 |
| - } |
221 |
| - if (CABS1(x,ix+inc_x2*3) > maxf) { |
222 |
| - maxf = CABS1(x,ix+inc_x2*3); |
223 |
| - } |
| 184 | + BLASLONG n1 = n & -4; |
| 185 | + while (i < n1) { |
224 | 186 |
|
225 |
| - ix += inc_x2 * 4; |
| 187 | + if (CABS1(x, ix) > maxf) { |
| 188 | + maxf = CABS1(x, ix); |
| 189 | + } |
| 190 | + if (CABS1(x, ix + inc_x2) > maxf) { |
| 191 | + maxf = CABS1(x, ix + inc_x2); |
| 192 | + } |
| 193 | + if (CABS1(x, ix + inc_x2 * 2) > maxf) { |
| 194 | + maxf = CABS1(x, ix + inc_x2 * 2); |
| 195 | + } |
| 196 | + if (CABS1(x, ix + inc_x2 * 3) > maxf) { |
| 197 | + maxf = CABS1(x, ix + inc_x2 * 3); |
| 198 | + } |
226 | 199 |
|
227 |
| - i += 4; |
| 200 | + ix += inc_x2 * 4; |
228 | 201 |
|
229 |
| - } |
| 202 | + i += 4; |
230 | 203 |
|
| 204 | + } |
231 | 205 |
|
232 |
| - while (i < n) { |
233 |
| - if (CABS1(x,ix) > maxf) { |
234 |
| - maxf = CABS1(x,ix); |
235 |
| - } |
236 |
| - ix += inc_x2; |
237 |
| - i++; |
238 |
| - } |
239 |
| - return (maxf); |
| 206 | + while (i < n) { |
| 207 | + if (CABS1(x, ix) > maxf) { |
| 208 | + maxf = CABS1(x, ix); |
| 209 | + } |
| 210 | + ix += inc_x2; |
| 211 | + i++; |
240 | 212 | }
|
| 213 | + return (maxf); |
| 214 | + } |
241 | 215 | }
|
0 commit comments