@@ -35,327 +35,72 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.



 #if defined(HAVE_KERNEL4x8_ASM)
-static void dgemv_kernel_4x8 (BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
-
+typedef __vector unsigned char vec_t;
+static void dgemv_kernel_4x8 (BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
+    BLASLONG i;

     FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
-    BLASLONG off2;
-    BLASLONG tempR;
-    __asm__(
-
-        "sldi %[temp],%[off], 4 \n\t" // lda * sizeof(double) * 2
-        "sldi %[off], %[off], 3 \n\t" // lda * sizeof(double)
-        "xxlxor 34,34,34 \n\t"
-        "xxlxor 35,34,34 \n\t"
-        "add %[a2], %[a0], %[temp] \n\t"
-        "add %[a1], %[a0], %[off] \n\t"
-        "xxlxor 4,34,34 \n\t"
-        "xxlxor 5,34,34 \n\t"
-        "xxlxor 6,34,34 \n\t"
-        "xxlxor 7,34,34 \n\t"
-        "add %[a3], %[a2], %[off] \n\t"
-        "add %[a4], %[a2], %[temp] \n\t"
-
-        "xxlxor 8,34,34 \n\t"
-        "xxlxor 9,34,34 \n\t"
-        "add %[a5], %[a3], %[temp] \n\t"
-        "li %[off],0 \n\t"
-        "li %[off2],16 \n\t"
-
-        "add %[a6], %[a4], %[temp] \n\t"
-        "add %[a7], %[a5], %[temp] \n\t"
-
-
-
-
-        "lxvp 32, 0(%[x]) \n\t"
-        "lxvp 36, 0(%[a0]) \n\t"
-        "lxvp 38, 0(%[a1]) \n\t"
-        "lxvp 40, 0(%[a2]) \n\t"
-        "lxvp 42, 0(%[a3]) \n\t"
-        "lxvp 44, 0(%[a4]) \n\t"
-        "lxvp 46, 0(%[a5]) \n\t"
-        "lxvp 48, 0(%[a6]) \n\t"
-        "lxvp 50, 0(%[a7]) \n\t"
-#if defined(PREFETCH)
-        "li %[temp],896 \n\t"
-#endif
-        "addic. %[n],%[n],-4 \n\t"
-
-        "li %[off],32 \n\t"
-
-
-        "ble- two%= \n\t"
-
-        //--------------------------------------------------
-        ".align 5 \n\t"
-        "one%=: \n\t"
-        "xvmaddadp 34,36,32 \n\t"
-        "xvmaddadp 35,38,32 \n\t"
-        "addi %[off2], %[off2],32 \n\t"
-        "xvmaddadp 4,40,32 \n\t"
-        "xvmaddadp 5,42,32 \n\t"
-        "xvmaddadp 6,44,32 \n\t"
-        "xvmaddadp 7,46,32 \n\t"
-        "xvmaddadp 8,48,32 \n\t"
-        "xvmaddadp 9,50,32 \n\t"
-
-        "xvmaddadp 34,37,33 \n\t"
-        "xvmaddadp 35,39,33 \n\t"
-        "lxvp 36, 32(%[a0]) \n\t"
-        "lxvp 38, 32(%[a1]) \n\t"
-        "xvmaddadp 4,41,33 \n\t"
-        "xvmaddadp 5,43,33 \n\t"
-        "addi %[off], %[off],32 \n\t"
-        "lxvp 40, 32(%[a2]) \n\t"
-        "lxvp 42, 32(%[a3]) \n\t"
-        "xvmaddadp 6,45,33 \n\t"
-        "xvmaddadp 7,47,33 \n\t"
-        "lxvp 44, 32(%[a4]) \n\t"
-        "lxvp 46, 32(%[a5]) \n\t"
-        "xvmaddadp 8,49,33 \n\t"
-        "xvmaddadp 9,51,33 \n\t"
-
-        "addic. %[n],%[n],-4 \n\t"
-        "lxvp 48, 32(%[a6]) \n\t"
-        "lxvp 50, 32(%[a7]) \n\t"
-        "lxvp 32, 32(%[x]) \n\t"
-        "ble- two%= \n\t"
-        "xvmaddadp 34,36,32 \n\t"
-        "xvmaddadp 35,38,32 \n\t"
-        "addi %[off2], %[off2],32 \n\t"
-        "xvmaddadp 4,40,32 \n\t"
-        "xvmaddadp 5,42,32 \n\t"
-        "xvmaddadp 6,44,32 \n\t"
-        "xvmaddadp 7,46,32 \n\t"
-        "xvmaddadp 8,48,32 \n\t"
-        "xvmaddadp 9,50,32 \n\t"
-
-        "xvmaddadp 34,37,33 \n\t"
-        "xvmaddadp 35,39,33 \n\t"
-        "lxvp 36, 64(%[a0]) \n\t"
-        "lxvp 38, 64(%[a1]) \n\t"
-        "xvmaddadp 4,41,33 \n\t"
-        "xvmaddadp 5,43,33 \n\t"
-        "addi %[off], %[off],32 \n\t"
-        "lxvp 40, 64(%[a2]) \n\t"
-        "lxvp 42, 64(%[a3]) \n\t"
-        "xvmaddadp 6,45,33 \n\t"
-        "xvmaddadp 7,47,33 \n\t"
-        "lxvp 44, 64(%[a4]) \n\t"
-        "lxvp 46, 64(%[a5]) \n\t"
-        "xvmaddadp 8,49,33 \n\t"
-        "xvmaddadp 9,51,33 \n\t"
-
-        "addic. %[n],%[n],-4 \n\t"
-        "lxvp 48, 64(%[a6]) \n\t"
-        "lxvp 50, 64(%[a7]) \n\t"
-        "lxvp 32, 64(%[x]) \n\t"
-        "ble- two%= \n\t"
-        "xvmaddadp 34,36,32 \n\t"
-        "xvmaddadp 35,38,32 \n\t"
-#if defined(PREFETCH)
-        "addi %[temp],%[temp],128 \n\t"
-#endif
-        "addi %[off2], %[off2],32 \n\t"
-        "xvmaddadp 4,40,32 \n\t"
-        "xvmaddadp 5,42,32 \n\t"
-        "xvmaddadp 6,44,32 \n\t"
-        "xvmaddadp 7,46,32 \n\t"
-        "xvmaddadp 8,48,32 \n\t"
-        "xvmaddadp 9,50,32 \n\t"
-#if defined(PREFETCH)
-        "dcbt %[temp],%[a0] \n\t"
-#endif
-
-        "xvmaddadp 34,37,33 \n\t"
-        "xvmaddadp 35,39,33 \n\t"
-        "lxvp 36, 96(%[a0]) \n\t"
-        "lxvp 38, 96(%[a1]) \n\t"
-        "xvmaddadp 4,41,33 \n\t"
-        "xvmaddadp 5,43,33 \n\t"
-#if defined(PREFETCH)
-        "dcbt %[temp],%[a1] \n\t"
-#endif
-        "lxvp 40, 96(%[a2]) \n\t"
-        "lxvp 42, 96(%[a3]) \n\t"
-        "addi %[off], %[off],32 \n\t"
-        "xvmaddadp 6,45,33 \n\t"
-        "xvmaddadp 7,47,33 \n\t"
-        "lxvp 44, 96(%[a4]) \n\t"
-        "lxvp 46, 96(%[a5]) \n\t"
-        "xvmaddadp 8,49,33 \n\t"
-        "xvmaddadp 9,51,33 \n\t"
-#if defined(PREFETCH)
-        "dcbt %[temp],%[a3] \n\t"
-#endif
-        "lxvp 48, 96(%[a6]) \n\t"
-        "lxvp 50, 96(%[a7]) \n\t"
-        "lxvp 32, 96(%[x]) \n\t"
-
-        "addic. %[n],%[n],-4 \n\t"
-        "ble- two%= \n\t"
-
-        "addi %[off2], %[off2],32 \n\t"
-#if defined(PREFETCH)
-        "dcbt %[temp],%[a2] \n\t"
-#endif
-        "xvmaddadp 34,36,32 \n\t"
-        "xvmaddadp 35,38,32 \n\t"
-        "xvmaddadp 4,40,32 \n\t"
-        "xvmaddadp 5,42,32 \n\t"
-#if defined(PREFETCH)
-        "dcbt %[temp],%[a4] \n\t"
-#endif
-        "xvmaddadp 6,44,32 \n\t"
-        "xvmaddadp 7,46,32 \n\t"
-        "xvmaddadp 8,48,32 \n\t"
-        "xvmaddadp 9,50,32 \n\t"
-
-#if defined(PREFETCH)
-        "dcbt %[temp],%[a5] \n\t"
-#endif
-        "xvmaddadp 34,37,33 \n\t"
-        "xvmaddadp 35,39,33 \n\t"
-        "lxvp 36, 128(%[a0]) \n\t"
-        "lxvp 38, 128(%[a1]) \n\t"
-        "xvmaddadp 4,41,33 \n\t"
-        "xvmaddadp 5,43,33 \n\t"
-        "addi %[off], %[off],32 \n\t"
-        "lxvp 40, 128(%[a2]) \n\t"
-        "lxvp 42, 128(%[a3]) \n\t"
-#if defined(PREFETCH)
-        "dcbt %[temp],%[a6] \n\t"
-#endif
-        "xvmaddadp 6,45,33 \n\t"
-        "xvmaddadp 7,47,33 \n\t"
-        "lxvp 44, 128(%[a4]) \n\t"
-        "lxvp 46, 128(%[a5]) \n\t"
-        "xvmaddadp 8,49,33 \n\t"
-        "xvmaddadp 9,51,33 \n\t"
-
-#if defined(PREFETCH)
-        "dcbt %[temp],%[a7] \n\t"
-#endif
-        "addic. %[n],%[n],-4 \n\t"
-        "lxvp 48, 128(%[a6]) \n\t"
-        "lxvp 50, 128(%[a7]) \n\t"
-        "lxvp 32, 128(%[x]) \n\t"
-#if defined(PREFETCH)
-        "dcbt %[temp],%[x] \n\t"
-#endif
-        "addi %[a0], %[a0], 128 \n\t"
-        "addi %[a1], %[a1], 128 \n\t"
-        "addi %[a2], %[a2], 128 \n\t"
-        "addi %[a3], %[a3], 128 \n\t"
-        "addi %[a4], %[a4], 128 \n\t"
-        "addi %[a5], %[a5], 128 \n\t"
-        "addi %[a6], %[a6], 128 \n\t"
-        "addi %[a7], %[a7], 128 \n\t"
-        "addi %[x], %[x], 128 \n\t"
-        "bgt+ one%= \n\t"
-        ".align 5 \n\t"
-        "two%=: \n\t"
-        //--------------------------------------------
-
-        "xvmaddadp 34,36,32 \n\t"
-        "xvmaddadp 35,38,32 \n\t"
-        "xvmaddadp 4,40,32 \n\t"
-        "xvmaddadp 5,42,32 \n\t"
-        "xvmaddadp 6,44,32 \n\t"
-        "xvmaddadp 7,46,32 \n\t"
-        "xvmaddadp 8,48,32 \n\t"
-        "xvmaddadp 9,50,32 \n\t"
-        XXSPLTD_S(36,%x[alpha],0)
-        "xvmaddadp 34,37,33 \n\t"
-        "xvmaddadp 35,39,33 \n\t"
-        "xvmaddadp 4,41,33 \n\t"
-        "xvmaddadp 5,43,33 \n\t"
-        "xvmaddadp 6,45,33 \n\t"
-        "xvmaddadp 7,47,33 \n\t"
-        "xvmaddadp 8,49,33 \n\t"
-        "xvmaddadp 9,51,33 \n\t"
-
-        "lxvp 38, 0(%[y]) \n\t"
-        "lxvp 40, 32(%[y]) \n\t"
-
-
-#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-        XXMRGHD_S(42,34,35)
-        XXMRGLD_S(43,34,35)
-
-        XXMRGHD_S(44,4,5)
-        XXMRGLD_S(45,4,5)
-#else
-        XXMRGLD_S(42,35,34)
-        XXMRGHD_S(43,35,34)
-
-        XXMRGLD_S(44,5,4)
-        XXMRGHD_S(45,5,4)
-#endif
-
-        "xvadddp 42,42,43 \n\t"
-
-#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-        XXMRGHD_S(46,6,7)
-        XXMRGLD_S(47,6,7)
-#else
-        XXMRGLD_S(46,7,6)
-        XXMRGHD_S(47,7,6)
-#endif
-        "xvadddp 44,44,45 \n\t"
-
-#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-        XXMRGHD_S(48,8,9)
-        XXMRGLD_S(49,8,9)
-#else
-        XXMRGLD_S(48,9,8)
-        XXMRGHD_S(49,9,8)
-#endif
-        "xvadddp 46,46,47 \n\t"
-#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-        "xvmaddadp 38,42,36 \n\t"
-        "xvmaddadp 39,44,36 \n\t"
-#else
-        "xvmaddadp 39,42,36 \n\t"
-        "xvmaddadp 38,44,36 \n\t"
-#endif
-        "xvadddp 48,48,49 \n\t"
-#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-        "xvmaddadp 41,48,36 \n\t"
-#else
-        "xvmaddadp 41,46,36 \n\t"
-#endif
-        "stxvp 38, 0(%[y]) \n\t"
-#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-        "xvmaddadp 40,46,36 \n\t"
-#else
-        "xvmaddadp 40,48,36 \n\t"
-#endif
-        "stxvp 40, 32(%[y]) \n\t"
-
-        : [memy] "+m" (*(double (*)[8])y),
-          [n] "+&r" (n),
-          [a0] "=b" (a0),
-          [a1] "=&b" (a1),
-          [a2] "=&b" (a2),
-          [a3] "=&b" (a3),
-          [a4] "=&b" (a4),
-          [a5] "=&b" (a5),
-          [a6] "=&b" (a6),
-          [a7] "=&b" (a7),
-          [off] "+&b" (lda),
-          [off2] "=&b" (off2),
-          [temp] "=&b" (tempR)
-        : [memx] "m" (*(const double (*)[n])x),
-          [mem_ap] "m" (*(const double (*)[n*8])ap),
-          [alpha] "d" (alpha),
-          "[a0]" (ap),
-          [x] "b" (x),
-          [y] "b" (y)
-        : "cc", "vs4", "vs5", "vs6", "vs7", "vs8", "vs9", "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39",
-          "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
-        );
-    return;
+    __vector_pair vx, vp;
+    vec_t res[2], res1[2];
+    register __vector double temp0 = {0, 0};
+    register __vector double temp1 = {0, 0};
+    register __vector double temp2 = {0, 0};
+    register __vector double temp3 = {0, 0};
+    register __vector double temp4 = {0, 0};
+    register __vector double temp5 = {0, 0};
+    register __vector double temp6 = {0, 0};
+    register __vector double temp7 = {0, 0};
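+    // a0..a7 point at eight vectors of A spaced lda elements apart,
+    // one per output element produced by this kernel.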
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    a4 = a3 + lda;
+    a5 = a4 + lda;
+    a6 = a5 + lda;
+    a7 = a6 + lda;
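+    // Each iteration consumes four doubles of x and of each a-vector:
+    // one 256-bit __vector_pair load is split into two 2-double halves,
+    // then two fused multiply-adds update the 2-lane accumulator.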
+    for (i = 0; i < n/2; i += 2) {
+        vp = *((__vector_pair *)((void *)&a0[i*2]));
+        vx = *((__vector_pair *)((void *)&x[i*2]));
+        __builtin_vsx_disassemble_pair(res, &vx);
+        __builtin_vsx_disassemble_pair(res1, &vp);
+        temp0 = vec_madd((__vector double)res[0], (__vector double)res1[0], temp0);
+        temp0 = vec_madd((__vector double)res[1], (__vector double)res1[1], temp0);
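+        // The same load/disassemble/multiply-add pattern repeats for
+        // a1..a7, accumulating into temp1..temp7.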
+        vp = *((__vector_pair *)((void *)&a1[i*2]));
+        __builtin_vsx_disassemble_pair(res1, &vp);
+        temp1 = vec_madd((__vector double)res[0], (__vector double)res1[0], temp1);
+        temp1 = vec_madd((__vector double)res[1], (__vector double)res1[1], temp1);
+        vp = *((__vector_pair *)((void *)&a2[i*2]));
+        __builtin_vsx_disassemble_pair(res1, &vp);
+        temp2 = vec_madd((__vector double)res[0], (__vector double)res1[0], temp2);
+        temp2 = vec_madd((__vector double)res[1], (__vector double)res1[1], temp2);
+        vp = *((__vector_pair *)((void *)&a3[i*2]));
+        __builtin_vsx_disassemble_pair(res1, &vp);
+        temp3 = vec_madd((__vector double)res[0], (__vector double)res1[0], temp3);
+        temp3 = vec_madd((__vector double)res[1], (__vector double)res1[1], temp3);
+        vp = *((__vector_pair *)((void *)&a4[i*2]));
+        __builtin_vsx_disassemble_pair(res1, &vp);
+        temp4 = vec_madd((__vector double)res[0], (__vector double)res1[0], temp4);
+        temp4 = vec_madd((__vector double)res[1], (__vector double)res1[1], temp4);
+        vp = *((__vector_pair *)((void *)&a5[i*2]));
+        __builtin_vsx_disassemble_pair(res1, &vp);
+        temp5 = vec_madd((__vector double)res[0], (__vector double)res1[0], temp5);
+        temp5 = vec_madd((__vector double)res[1], (__vector double)res1[1], temp5);
+        vp = *((__vector_pair *)((void *)&a6[i*2]));
+        __builtin_vsx_disassemble_pair(res1, &vp);
+        temp6 = vec_madd((__vector double)res[0], (__vector double)res1[0], temp6);
+        temp6 = vec_madd((__vector double)res[1], (__vector double)res1[1], temp6);
+        vp = *((__vector_pair *)((void *)&a7[i*2]));
+        __builtin_vsx_disassemble_pair(res1, &vp);
+        temp7 = vec_madd((__vector double)res[0], (__vector double)res1[0], temp7);
+        temp7 = vec_madd((__vector double)res[1], (__vector double)res1[1], temp7);
+    }
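+    // Each temp holds two partial sums; add the two lanes, scale by
+    // alpha, and accumulate into the eight output elements of y.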
+    y[0] += alpha * (temp0[0] + temp0[1]);
+    y[1] += alpha * (temp1[0] + temp1[1]);
+    y[2] += alpha * (temp2[0] + temp2[1]);
+    y[3] += alpha * (temp3[0] + temp3[1]);
+    y[4] += alpha * (temp4[0] + temp4[1]);
+    y[5] += alpha * (temp5[0] + temp5[1]);
+    y[6] += alpha * (temp6[0] + temp6[1]);
+    y[7] += alpha * (temp7[0] + temp7[1]);
 }
 #else
 static void dgemv_kernel_4x8 (BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {