@@ -52,82 +52,66 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
52
52
"vflpsb %%v0,%%v0 \n\t"
53
53
"vflpsb %%v16,%%v16 \n\t"
54
54
"vfasb %%v0,%%v0,%%v16 \n\t"
55
+ "vleib %%v1,0,0 \n\t"
56
+ "vleib %%v1,1,1 \n\t"
57
+ "vleib %%v1,2,2 \n\t"
58
+ "vleib %%v1,3,3 \n\t"
59
+ "vleib %%v1,8,4 \n\t"
60
+ "vleib %%v1,9,5 \n\t"
61
+ "vleib %%v1,10,6 \n\t"
62
+ "vleib %%v1,11,7 \n\t"
63
+ "vleib %%v1,16,8 \n\t"
64
+ "vleib %%v1,17,9 \n\t"
65
+ "vleib %%v1,18,10 \n\t"
66
+ "vleib %%v1,19,11 \n\t"
67
+ "vleib %%v1,24,12 \n\t"
68
+ "vleib %%v1,25,13 \n\t"
69
+ "vleib %%v1,26,14 \n\t"
70
+ "vleib %%v1,27,15 \n\t"
55
71
"srlg %%r0,%1,5 \n\t"
56
72
"xgr %%r1,%%r1 \n\t"
57
73
"0: \n\t"
58
74
"pfd 1, 1024(%%r1,%2) \n\t"
59
75
60
- "vlef %%v16,0(%%r1,%2),0 \n\t"
61
- "vlef %%v17,4(%%r1,%2),0 \n\t"
62
- "vlef %%v16,8(%%r1,%2),1 \n\t"
63
- "vlef %%v17,12(%%r1,%2),1 \n\t"
64
- "vlef %%v16,16(%%r1,%2),2 \n\t"
65
- "vlef %%v17,20(%%r1,%2),2 \n\t"
66
- "vlef %%v16,24(%%r1,%2),3 \n\t"
67
- "vlef %%v17,28(%%r1,%2),3 \n\t"
68
-
69
- "vlef %%v18,32(%%r1,%2),0 \n\t"
70
- "vlef %%v19,36(%%r1,%2),0 \n\t"
71
- "vlef %%v18,40(%%r1,%2),1 \n\t"
72
- "vlef %%v19,44(%%r1,%2),1 \n\t"
73
- "vlef %%v18,48(%%r1,%2),2 \n\t"
74
- "vlef %%v19,52(%%r1,%2),2 \n\t"
75
- "vlef %%v18,56(%%r1,%2),3 \n\t"
76
- "vlef %%v19,30(%%r1,%2),3 \n\t"
77
-
78
- "vlef %%v20,64(%%r1,%2),0 \n\t"
79
- "vlef %%v21,68(%%r1,%2),0 \n\t"
80
- "vlef %%v20,72(%%r1,%2),1 \n\t"
81
- "vlef %%v21,76(%%r1,%2),1 \n\t"
82
- "vlef %%v20,80(%%r1,%2),2 \n\t"
83
- "vlef %%v21,84(%%r1,%2),2 \n\t"
84
- "vlef %%v20,88(%%r1,%2),3 \n\t"
85
- "vlef %%v21,92(%%r1,%2),3 \n\t"
86
-
87
- "vlef %%v22,96(%%r1,%2),0 \n\t"
88
- "vlef %%v23,100(%%r1,%2),0 \n\t"
89
- "vlef %%v22,104(%%r1,%2),1 \n\t"
90
- "vlef %%v23,108(%%r1,%2),1 \n\t"
91
- "vlef %%v22,112(%%r1,%2),2 \n\t"
92
- "vlef %%v23,116(%%r1,%2),2 \n\t"
93
- "vlef %%v22,120(%%r1,%2),3 \n\t"
94
- "vlef %%v23,124(%%r1,%2),3 \n\t"
95
-
96
- "vlef %%v24,128(%%r1,%2),0 \n\t"
97
- "vlef %%v25,132(%%r1,%2),0 \n\t"
98
- "vlef %%v24,136(%%r1,%2),1 \n\t"
99
- "vlef %%v25,140(%%r1,%2),1 \n\t"
100
- "vlef %%v24,144(%%r1,%2),2 \n\t"
101
- "vlef %%v25,148(%%r1,%2),2 \n\t"
102
- "vlef %%v24,152(%%r1,%2),3 \n\t"
103
- "vlef %%v25,156(%%r1,%2),3 \n\t"
104
-
105
- "vlef %%v26,160(%%r1,%2),0 \n\t"
106
- "vlef %%v27,164(%%r1,%2),0 \n\t"
107
- "vlef %%v26,168(%%r1,%2),1 \n\t"
108
- "vlef %%v27,172(%%r1,%2),1 \n\t"
109
- "vlef %%v26,176(%%r1,%2),2 \n\t"
110
- "vlef %%v27,180(%%r1,%2),2 \n\t"
111
- "vlef %%v26,184(%%r1,%2),3 \n\t"
112
- "vlef %%v27,188(%%r1,%2),3 \n\t"
113
-
114
- "vlef %%v28,192(%%r1,%2),0 \n\t"
115
- "vlef %%v29,196(%%r1,%2),0 \n\t"
116
- "vlef %%v28,200(%%r1,%2),1 \n\t"
117
- "vlef %%v29,204(%%r1,%2),1 \n\t"
118
- "vlef %%v28,208(%%r1,%2),2 \n\t"
119
- "vlef %%v29,212(%%r1,%2),2 \n\t"
120
- "vlef %%v28,216(%%r1,%2),3 \n\t"
121
- "vlef %%v29,220(%%r1,%2),3 \n\t"
122
-
123
- "vlef %%v30,224(%%r1,%2),0 \n\t"
124
- "vlef %%v31,228(%%r1,%2),0 \n\t"
125
- "vlef %%v30,232(%%r1,%2),1 \n\t"
126
- "vlef %%v31,236(%%r1,%2),1 \n\t"
127
- "vlef %%v30,240(%%r1,%2),2 \n\t"
128
- "vlef %%v31,244(%%r1,%2),2 \n\t"
129
- "vlef %%v30,248(%%r1,%2),3 \n\t"
130
- "vlef %%v31,252(%%r1,%2),3 \n\t"
76
+ "vl %%v16,0(%%r1,%2) \n\t"
77
+ "vl %%v2,16(%%r1,%2) \n\t"
78
+ "vpkg %%v17,%%v16,%%v2 \n\t"
79
+ "vperm %%v16,%%v16,%%v2,%%v1 \n\t"
80
+
81
+ "vl %%v18,32(%%r1,%2) \n\t"
82
+ "vl %%v2,48(%%r1,%2) \n\t"
83
+ "vpkg %%v19,%%v18,%%v2 \n\t"
84
+ "vperm %%v18,%%v18,%%v2,%%v1 \n\t"
85
+
86
+ "vl %%v20,64(%%r1,%2) \n\t"
87
+ "vl %%v2,80(%%r1,%2) \n\t"
88
+ "vpkg %%v21,%%v20,%%v2 \n\t"
89
+ "vperm %%v20,%%v20,%%v2,%%v1 \n\t"
90
+
91
+ "vl %%v22,96(%%r1,%2) \n\t"
92
+ "vl %%v2,112(%%r1,%2) \n\t"
93
+ "vpkg %%v23,%%v22,%%v2 \n\t"
94
+ "vperm %%v22,%%v22,%%v2,%%v1 \n\t"
95
+
96
+ "vl %%v24,128(%%r1,%2) \n\t"
97
+ "vl %%v2,144(%%r1,%2) \n\t"
98
+ "vpkg %%v25,%%v24,%%v2 \n\t"
99
+ "vperm %%v24,%%v24,%%v2,%%v1 \n\t"
100
+
101
+ "vl %%v26,160(%%r1,%2) \n\t"
102
+ "vl %%v2,176(%%r1,%2) \n\t"
103
+ "vpkg %%v27,%%v26,%%v2 \n\t"
104
+ "vperm %%v26,%%v26,%%v2,%%v1 \n\t"
105
+
106
+ "vl %%v28,192(%%r1,%2) \n\t"
107
+ "vl %%v2,208(%%r1,%2) \n\t"
108
+ "vpkg %%v29,%%v28,%%v2 \n\t"
109
+ "vperm %%v28,%%v28,%%v2,%%v1 \n\t"
110
+
111
+ "vl %%v30,224(%%r1,%2) \n\t"
112
+ "vl %%v2,240(%%r1,%2) \n\t"
113
+ "vpkg %%v31,%%v30,%%v2 \n\t"
114
+ "vperm %%v30,%%v30,%%v2,%%v1 \n\t"
131
115
132
116
"vflpsb %%v16,%%v16 \n\t"
133
117
"vflpsb %%v17,%%v17 \n\t"
@@ -178,7 +162,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
178
162
"ler %0,%%f0 "
179
163
:"=f" (amax )
180
164
:"r" (n ),"ZR" ((const FLOAT (* )[n ])x )
181
- :"memory" ,"cc" ,"r0" ,"r1" ,"v0" ,"v16" ,"v17" ,"v18" ,"v19" ,"v20" ,"v21" ,"v22" ,"v23" ,"v24" ,"v25" ,"v26" ,"v27"
165
+ :"memory" ,"cc" ,"r0" ,"r1" ,"v0" ,"v1" , "v2" , " v16" ,"v17" ,"v18" ,"v19" ,"v20" ,"v21" ,"v22" ,"v23" ,"v24" ,"v25" ,"v26" ,"v27" , "v28" , "v29" , "v30" , "v31 "
182
166
);
183
167
184
168
return amax ;
0 commit comments