@@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
27
28
28
29
29
#include "common.h"
30
- #define Z13_D 1
30
+
31
31
#define PREFETCH_INS 1
32
32
#if defined(Z13_A )
33
33
#include <vecintrin.h>
34
34
35
- static void daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * alpha )
35
+ static void daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT alpha )
36
36
{
37
37
BLASLONG i = 0 ;
38
- __vector double v_a = {* alpha ,* alpha };
38
+ __vector double v_a = {alpha ,alpha };
39
39
__vector double * v_y = (__vector double * )y ;
40
40
__vector double * v_x = (__vector double * )x ;
41
41
@@ -60,256 +60,53 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
60
60
}
61
61
62
62
}
63
- #elif defined(Z13_B )
64
- static void __attribute__ ((noinline )) daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * alpha )
65
- {
66
-
67
-
68
- __asm__ volatile (
69
- #if defined(PREFETCH_INS )
70
- "pfd 1, 0(%1) \n\t"
71
- "pfd 2, 0(%2) \n\t"
72
- #endif
73
- "vlrepg %%v0 , 0(%3) \n\t"
74
- "srlg %3,%0,5 \n\t"
75
- "xgr %%r1,%%r1 \n\t"
76
- "vlr %%v1,%%v0 \n\t"
77
- ".align 16 \n\t"
78
- "1: \n\t"
79
- #if defined(PREFETCH_INS )
80
- "pfd 1, 256(%%r1,%1) \n\t"
81
- "pfd 2, 256(%%r1,%2) \n\t"
82
- #endif
83
-
84
- "vl %%v24, 0(%%r1,%2) \n\t"
85
- "vl %%v16, 0(%%r1,%1) \n\t"
86
- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
87
- "vst %%v16, 0(%%r1,%2) \n\t"
88
- "vl %%v25, 16(%%r1,%2) \n\t"
89
- "vl %%v17, 16(%%r1,%1) \n\t"
90
- "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
91
- "vst %%v17, 16(%%r1,%2) \n\t"
92
- "vl %%v26, 32(%%r1,%2) \n\t"
93
- "vl %%v18, 32(%%r1,%1) \n\t"
94
- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
95
- "vst %%v18, 32(%%r1,%2) \n\t"
96
- "vl %%v27, 48(%%r1,%2) \n\t"
97
- "vl %%v19, 48(%%r1,%1) \n\t"
98
- "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
99
- "vst %%v19, 48(%%r1,%2) \n\t"
100
-
101
- "vl %%v24,( 0+64)(%%r1,%2) \n\t"
102
- "vl %%v16,( 0+64)(%%r1,%1) \n\t"
103
- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
104
- "vst %%v16,( 0+64)(%%r1,%2) \n\t"
105
- "vl %%v25, (16+64)(%%r1,%2) \n\t"
106
- "vl %%v17, (16+64)(%%r1,%1) \n\t"
107
- "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
108
- "vst %%v17, (16+64)(%%r1,%2) \n\t"
109
- "vl %%v26, (32+64)(%%r1,%2) \n\t"
110
- "vl %%v18, (32+64)(%%r1,%1) \n\t"
111
- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
112
- "vst %%v18, (32+64)(%%r1,%2) \n\t"
113
- "vl %%v27, (48+64)(%%r1,%2) \n\t"
114
- "vl %%v19, (48+64)(%%r1,%1) \n\t"
115
- "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
116
- "vst %%v19, (48+64)(%%r1,%2) \n\t"
117
-
118
- "vl %%v24,( 0+128)(%%r1,%2) \n\t"
119
- "vl %%v16,( 0+128)(%%r1,%1) \n\t"
120
- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
121
- "vst %%v16,( 0+128)(%%r1,%2) \n\t"
122
- "vl %%v25, (16+128)(%%r1,%2) \n\t"
123
- "vl %%v17, (16+128)(%%r1,%1) \n\t"
124
- "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
125
- "vst %%v17, (16+128)(%%r1,%2) \n\t"
126
- "vl %%v26, (32+128)(%%r1,%2) \n\t"
127
- "vl %%v18, (32+128)(%%r1,%1) \n\t"
128
- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
129
- "vst %%v18, (32+128)(%%r1,%2) \n\t"
130
- "vl %%v27, (48+128)(%%r1,%2) \n\t"
131
- "vl %%v19, (48+128)(%%r1,%1) \n\t"
132
- "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
133
- "vst %%v19, (48+128)(%%r1,%2) \n\t"
134
-
135
- "vl %%v24,( 0+192)(%%r1,%2) \n\t"
136
- "vl %%v16,( 0+192)(%%r1,%1) \n\t"
137
- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
138
- "vst %%v16,( 0+192)(%%r1,%2) \n\t"
139
- "vl %%v25, (16+192)(%%r1,%2) \n\t"
140
- "vl %%v17, (16+192)(%%r1,%1) \n\t"
141
- "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
142
- "vst %%v17, (16+192)(%%r1,%2) \n\t"
143
- "vl %%v26, (32+192)(%%r1,%2) \n\t"
144
- "vl %%v18, (32+192)(%%r1,%1) \n\t"
145
- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
146
- "vst %%v18, (32+192)(%%r1,%2) \n\t"
147
- "vl %%v27, (48+192)(%%r1,%2) \n\t"
148
- "vl %%v19, (48+192)(%%r1,%1) \n\t"
149
- "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
150
- "vst %%v19, (48+192)(%%r1,%2) \n\t"
151
-
152
-
153
- "la %%r1,256(%%r1) \n\t"
154
- "brctg %3,1b"
155
- :
156
- :"r "(n ),"a" (x ),"a" (y ),"a" (alpha )
157
- :"cc ", "memory ", "r1 " ,"v0 " ,"v16 ","v17 ","v18 ","v19 ", "v24 ","v25 ","v26 ","v27 "
158
- );
159
- }
160
-
161
- #elif defined(Z13_C )
162
- static void __attribute__ ((noinline )) daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * alpha )
163
- {
164
-
165
- __asm__ volatile (
166
- #if defined(PREFETCH_INS )
167
- "pfd 1, 0(%1) \n\t"
168
- "pfd 2, 0(%2) \n\t"
169
- #endif
170
- "vlrepg %%v0 , 0(%3) \n\t"
171
- "srlg %3,%0,5 \n\t"
172
- "xgr %%r1,%%r1 \n\t"
173
- "vlr %%v1,%%v0 \n\t"
174
- ".align 16 \n\t"
175
- "1: \n\t"
176
- #if defined(PREFETCH_INS )
177
- "pfd 1, 256(%%r1,%1) \n\t"
178
- "pfd 2, 256(%%r1,%2) \n\t"
179
- #endif
180
- "vl %%v16, 0(%%r1,%1) \n\t"
181
- "vl %%v17, 16(%%r1,%1) \n\t"
182
- "vl %%v18, 32(%%r1,%1) \n\t"
183
- "vl %%v19, 48(%%r1,%1) \n\t"
184
-
185
- "vl %%v24, 0(%%r1,%2) \n\t"
186
- "vl %%v25, 16(%%r1,%2) \n\t"
187
- "vl %%v26, 32(%%r1,%2) \n\t"
188
- "vl %%v27, 48(%%r1,%2) \n\t"
189
- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
190
- "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
191
- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
192
- "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
193
- "vst %%v16, 0(%%r1,%2) \n\t"
194
- "vst %%v17, 16(%%r1,%2) \n\t"
195
- "vst %%v18, 32(%%r1,%2) \n\t"
196
- "vst %%v19, 48(%%r1,%2) \n\t"
197
-
198
- "vl %%v24, 64(%%r1,%1) \n\t"
199
- "vl %%v25, 80(%%r1,%1) \n\t"
200
- "vl %%v26, 96(%%r1,%1) \n\t"
201
- "vl %%v27, 112(%%r1,%1) \n\t"
202
-
203
- "vl %%v16, 64(%%r1,%2) \n\t"
204
- "vl %%v17, 80(%%r1,%2) \n\t"
205
- "vl %%v18, 96(%%r1,%2) \n\t"
206
- "vl %%v19, 112(%%r1,%2) \n\t"
207
-
208
-
209
- "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
210
- "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
211
- "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
212
- "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
213
-
214
- "vst %%v24, 64(%%r1,%2) \n\t"
215
- "vst %%v25, 80(%%r1,%2) \n\t"
216
- "vst %%v26, 96(%%r1,%2) \n\t"
217
- "vst %%v27, 112(%%r1,%2) \n\t"
218
-
219
- "vl %%v16, (0+128)(%%r1,%1) \n\t"
220
- "vl %%v17, (16+128)(%%r1,%1) \n\t"
221
- "vl %%v18, (32+128)(%%r1,%1) \n\t"
222
- "vl %%v19, (48+128)(%%r1,%1) \n\t"
223
-
224
- "vl %%v24, (0+128)(%%r1,%2) \n\t"
225
- "vl %%v25, (16+128)(%%r1,%2) \n\t"
226
- "vl %%v26, (32+128)(%%r1,%2) \n\t"
227
- "vl %%v27, (48+128)(%%r1,%2) \n\t"
228
-
229
- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
230
- "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
231
- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
232
- "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
233
- "vst %%v16, (0+128)(%%r1,%2) \n\t"
234
- "vst %%v17, (16+128)(%%r1,%2) \n\t"
235
- "vst %%v18, (32+128)(%%r1,%2) \n\t"
236
- "vst %%v19, (48+128)(%%r1,%2) \n\t"
237
-
238
- "vl %%v24, (64+128)(%%r1,%1) \n\t"
239
- "vl %%v25, (80+128)(%%r1,%1) \n\t"
240
- "vl %%v26, (96+128)(%%r1,%1) \n\t"
241
- "vl %%v27, (112+128)(%%r1,%1) \n\t"
242
-
243
- "vl %%v16, (64+128)(%%r1,%2) \n\t"
244
- "vl %%v17, (80+128)(%%r1,%2) \n\t"
245
- "vl %%v18, (96+128)(%%r1,%2) \n\t"
246
- "vl %%v19, (112+128)(%%r1,%2) \n\t"
247
-
248
- "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
249
- "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
250
- "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
251
- "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
252
-
253
- "vst %%v24, (64+128)(%%r1,%2) \n\t"
254
- "vst %%v25, (80+128)(%%r1,%2) \n\t"
255
- "vst %%v26, (96+128)(%%r1,%2) \n\t"
256
- "vst %%v27, (112+128)(%%r1,%2) \n\t"
257
-
258
- "la %%r1,256(%%r1) \n\t"
259
- "brctg %3,1b"
260
- :
261
- :"r "(n ),"a" (x ),"a" (y ),"a" (alpha )
262
- :"cc ", "memory ", "r1 " ,"v0 ","v1 ","v16 ","v17 ","v18 ","v19 ", "v24 ","v25 ","v26 ","v27 "
263
- );
264
- }
265
-
266
-
267
- #elif defined(Z13_D )
268
- static void __attribute__ ((noinline )) daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * alpha )
63
+ #else
64
+ static void daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT alpha )
269
65
{
270
66
271
67
__asm__ volatile (
272
68
#if defined(PREFETCH_INS )
273
- "pfd 1, 0(%1) \n\t"
274
- "pfd 2, 0(%2) \n\t"
275
- #endif
276
- "vlrepg %%v0 , 0(%3) \n\t"
277
- "srlg %3,%0,5 \n\t"
278
- "vlr %%v1,%%v0 \n\t"
69
+ "pfd 1, 0(%[x_tmp]) \n\t"
70
+ "pfd 2, 0(%[y_tmp]) \n\t"
71
+ #endif
72
+ "lgdr %%r0,%[alpha] \n\t"
73
+ "vlvgp %%v0,%%r0,%%r0 \n\t"
74
+ "srlg %%r0,%[n],5 \n\t"
75
+ "vlr %%v1,%%v0 \n\t"
279
76
".align 16 \n\t"
280
77
"1: \n\t"
281
78
#if defined(PREFETCH_INS )
282
- "pfd 1, 256(%1 ) \n\t"
283
- "pfd 2, 256(%2 ) \n\t"
79
+ "pfd 1, 256(%[x_tmp] ) \n\t"
80
+ "pfd 2, 256(%[y_tmp] ) \n\t"
284
81
#endif
285
- "vlm %%v16,%%v23, 0(%1) \n\t"
286
- "vlm %%v24, %%v31, 0(%2) \n\t"
287
- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
288
- "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
289
- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
290
- "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
291
- "vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
292
- "vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
293
- "vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
294
- "vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
295
- "vstm %%v16,%%v23, 0(%2) \n\t"
296
- "vlm %%v24,%%v31, 128(%1) \n\t"
297
- "vlm %%v16,%%v23, 128(%2 ) \n\t"
298
- "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
299
- "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
300
- "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
301
- "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
302
- "vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
303
- "vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
304
- "vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
305
- "vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
306
- "la %1,256(%1) \n\t"
307
- "vstm %%v24, %%v31, 128(%2) \n\t"
308
- "la %2,256(%2) \n\t"
309
- "brctg %3 ,1b"
310
- :
311
- :" r "( n ), "a" ( x ),"a" ( y ), "a " (alpha )
312
- :"cc" , "memory" , "v0" ,"v1" ,"v16" ,"v17" ,"v18" ,"v19" ,"v20" ,"v21" ,
82
+ "vlm %%v16,%%v23, 0(%[x_tmp]) \n\t"
83
+ "vlm %%v24, %%v31, 0(%[y_tmp]) \n\t"
84
+ "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
85
+ "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
86
+ "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
87
+ "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
88
+ "vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
89
+ "vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
90
+ "vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
91
+ "vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
92
+ "vstm %%v16,%%v23, 0(%[y_tmp]) \n\t"
93
+ "vlm %%v24,%%v31, 128(%[x_tmp]) \n\t"
94
+ "vlm %%v16,%%v23, 128(%[y_tmp] ) \n\t"
95
+ "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
96
+ "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
97
+ "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
98
+ "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
99
+ "vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
100
+ "vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
101
+ "vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
102
+ "vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
103
+ "la %[x_tmp],256(%[x_tmp]) \n\t"
104
+ "vstm %%v24, %%v31, 128(%[y_tmp]) \n\t"
105
+ "la %[y_tmp],256(%[y_tmp]) \n\t"
106
+ "brctg %%r0 ,1b"
107
+ : [ mem_y ] " + m " (*(double (*)[n])y), [x_tmp] " + & a "(x), [y_tmp] " + & a "(y)
108
+ : [ mem_x ] "m" ( * ( const double ( * )[ n ]) x ), [ n ] " r "(n), [alpha] " f "(alpha)
109
+ :" cc ", " r0 ", "v0 "," v1 "," v16 "," v17 "," v18 "," v19 "," v20 "," v21 ",
313
110
" v22 "," v23 "," v24 "," v25 "," v26 "," v27 "," v28 "," v29 "," v30 "," v31 "
314
111
);
315
112
@@ -334,7 +131,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
334
131
BLASLONG n1 = n & -32 ;
335
132
336
133
if ( n1 )
337
- daxpy_kernel_32 (n1 , x , y , & da );
134
+ daxpy_kernel_32 (n1 , x , y , da );
338
135
339
136
i = n1 ;
340
137
while (i < n )
0 commit comments