@@ -34,112 +34,112 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) {
34
34
FLOAT amax ;
35
35
36
36
__asm__("vlef %%v0,0(%[x]),0\n\t"
37
- "vlef %%v16,4(%[x]),0\n\t"
38
- "vlef %%v0,8(%[x]),1\n\t"
39
- "vlef %%v16,12(%[x]),1\n\t"
40
- "vlef %%v0,16(%[x]),2\n\t"
41
- "vlef %%v16,20(%[x]),2\n\t"
42
- "vlef %%v0,24(%[x]),3\n\t"
43
- "vlef %%v16,28(%[x]),3\n\t"
44
- "vflpsb %%v0,%%v0\n\t"
45
- "vflpsb %%v16,%%v16\n\t"
46
- "vfasb %%v0,%%v0,%%v16\n\t"
47
- "vleib %%v1,0,0\n\t"
48
- "vleib %%v1,1,1\n\t"
49
- "vleib %%v1,2,2\n\t"
50
- "vleib %%v1,3,3\n\t"
51
- "vleib %%v1,8,4\n\t"
52
- "vleib %%v1,9,5\n\t"
53
- "vleib %%v1,10,6\n\t"
54
- "vleib %%v1,11,7\n\t"
55
- "vleib %%v1,16,8\n\t"
56
- "vleib %%v1,17,9\n\t"
57
- "vleib %%v1,18,10\n\t"
58
- "vleib %%v1,19,11\n\t"
59
- "vleib %%v1,24,12\n\t"
60
- "vleib %%v1,25,13\n\t"
61
- "vleib %%v1,26,14\n\t"
62
- "vleib %%v1,27,15\n\t"
63
- "srlg %[n],%[n],5\n\t"
64
- "xgr %%r1,%%r1\n\t"
65
- "0:\n\t"
66
- "pfd 1, 1024(%%r1,%[x])\n\t"
67
- "vl %%v16,0(%%r1,%[x])\n\t"
68
- "vl %%v2,16(%%r1,%[x])\n\t"
69
- "vpkg %%v17,%%v16,%%v2\n\t"
70
- "vperm %%v16,%%v16,%%v2,%%v1\n\t"
71
- "vl %%v18,32(%%r1,%[x])\n\t"
72
- "vl %%v2,48(%%r1,%[x])\n\t"
73
- "vpkg %%v19,%%v18,%%v2\n\t"
74
- "vperm %%v18,%%v18,%%v2,%%v1\n\t"
75
- "vl %%v20,64(%%r1,%[x])\n\t"
76
- "vl %%v2,80(%%r1,%[x])\n\t"
77
- "vpkg %%v21,%%v20,%%v2\n\t"
78
- "vperm %%v20,%%v20,%%v2,%%v1\n\t"
79
- "vl %%v22,96(%%r1,%[x])\n\t"
80
- "vl %%v2,112(%%r1,%[x])\n\t"
81
- "vpkg %%v23,%%v22,%%v2\n\t"
82
- "vperm %%v22,%%v22,%%v2,%%v1\n\t"
83
- "vl %%v24,128(%%r1,%[x])\n\t"
84
- "vl %%v2,144(%%r1,%[x])\n\t"
85
- "vpkg %%v25,%%v24,%%v2\n\t"
86
- "vperm %%v24,%%v24,%%v2,%%v1\n\t"
87
- "vl %%v26,160(%%r1,%[x])\n\t"
88
- "vl %%v2,176(%%r1,%[x])\n\t"
89
- "vpkg %%v27,%%v26,%%v2\n\t"
90
- "vperm %%v26,%%v26,%%v2,%%v1\n\t"
91
- "vl %%v28,192(%%r1,%[x])\n\t"
92
- "vl %%v2,208(%%r1,%[x])\n\t"
93
- "vpkg %%v29,%%v28,%%v2\n\t"
94
- "vperm %%v28,%%v28,%%v2,%%v1\n\t"
95
- "vl %%v30,224(%%r1,%[x])\n\t"
96
- "vl %%v2,240(%%r1,%[x])\n\t"
97
- "vpkg %%v31,%%v30,%%v2\n\t"
98
- "vperm %%v30,%%v30,%%v2,%%v1\n\t"
99
- "vflpsb %%v16,%%v16\n\t"
100
- "vflpsb %%v17,%%v17\n\t"
101
- "vflpsb %%v18,%%v18\n\t"
102
- "vflpsb %%v19,%%v19\n\t"
103
- "vflpsb %%v20,%%v20\n\t"
104
- "vflpsb %%v21,%%v21\n\t"
105
- "vflpsb %%v22,%%v22\n\t"
106
- "vflpsb %%v23,%%v23\n\t"
107
- "vflpsb %%v24,%%v24\n\t"
108
- "vflpsb %%v25,%%v25\n\t"
109
- "vflpsb %%v26,%%v26\n\t"
110
- "vflpsb %%v27,%%v27\n\t"
111
- "vflpsb %%v28,%%v28\n\t"
112
- "vflpsb %%v29,%%v29\n\t"
113
- "vflpsb %%v30,%%v30\n\t"
114
- "vflpsb %%v31,%%v31\n\t"
115
- "vfasb %%v16,%%v16,%%v17\n\t"
116
- "vfasb %%v18,%%v18,%%v19\n\t"
117
- "vfasb %%v20,%%v20,%%v21\n\t"
118
- "vfasb %%v22,%%v22,%%v23\n\t"
119
- "vfasb %%v24,%%v24,%%v25\n\t"
120
- "vfasb %%v26,%%v26,%%v27\n\t"
121
- "vfasb %%v28,%%v28,%%v29\n\t"
122
- "vfasb %%v30,%%v30,%%v31\n\t"
123
- "vfmaxsb %%v16,%%v16,%%v24,0\n\t"
124
- "vfmaxsb %%v18,%%v18,%%v26,0\n\t"
125
- "vfmaxsb %%v20,%%v20,%%v28,0\n\t"
126
- "vfmaxsb %%v22,%%v22,%%v30,0\n\t"
127
- "vfmaxsb %%v16,%%v16,%%v20,0\n\t"
128
- "vfmaxsb %%v18,%%v18,%%v22,0\n\t"
129
- "vfmaxsb %%v16,%%v16,%%v18,0\n\t"
130
- "vfmaxsb %%v0,%%v0,%%v16,0\n\t"
131
- "agfi %%r1, 256\n\t"
132
- "brctg %[n], 0b\n\t"
133
- "veslg %%v16,%%v0,32\n\t"
134
- "vfmaxsb %%v0,%%v0,%%v16,0\n\t"
135
- "vrepf %%v16,%%v0,2\n\t"
136
- "wfmaxsb %%v0,%%v0,%%v16,0\n\t"
137
- "ler %[amax],%%f0"
138
- : [amax ] "=f" (amax ),[n ] "+&r" (n )
139
- : "m" (* (const FLOAT ( * ) [n * 2 ]) x ),[x ] "a" (x )
140
- : "cc" , "r1" , "v0" , "v1" , "v2" , "v16" , "v17" , "v18" , "v19" , "v20" ,
141
- "v21" , "v22" , "v23" , "v24" , "v25" , "v26" , "v27" , "v28" , "v29" , "v30" ,
142
- "v31" );
37
+ "vlef %%v16,4(%[x]),0\n\t"
38
+ "vlef %%v0,8(%[x]),1\n\t"
39
+ "vlef %%v16,12(%[x]),1\n\t"
40
+ "vlef %%v0,16(%[x]),2\n\t"
41
+ "vlef %%v16,20(%[x]),2\n\t"
42
+ "vlef %%v0,24(%[x]),3\n\t"
43
+ "vlef %%v16,28(%[x]),3\n\t"
44
+ "vflpsb %%v0,%%v0\n\t"
45
+ "vflpsb %%v16,%%v16\n\t"
46
+ "vfasb %%v0,%%v0,%%v16\n\t"
47
+ "vleib %%v1,0,0\n\t"
48
+ "vleib %%v1,1,1\n\t"
49
+ "vleib %%v1,2,2\n\t"
50
+ "vleib %%v1,3,3\n\t"
51
+ "vleib %%v1,8,4\n\t"
52
+ "vleib %%v1,9,5\n\t"
53
+ "vleib %%v1,10,6\n\t"
54
+ "vleib %%v1,11,7\n\t"
55
+ "vleib %%v1,16,8\n\t"
56
+ "vleib %%v1,17,9\n\t"
57
+ "vleib %%v1,18,10\n\t"
58
+ "vleib %%v1,19,11\n\t"
59
+ "vleib %%v1,24,12\n\t"
60
+ "vleib %%v1,25,13\n\t"
61
+ "vleib %%v1,26,14\n\t"
62
+ "vleib %%v1,27,15\n\t"
63
+ "srlg %[n],%[n],5\n\t"
64
+ "xgr %%r1,%%r1\n\t"
65
+ "0:\n\t"
66
+ "pfd 1, 1024(%%r1,%[x])\n\t"
67
+ "vl %%v16,0(%%r1,%[x])\n\t"
68
+ "vl %%v2,16(%%r1,%[x])\n\t"
69
+ "vpkg %%v17,%%v16,%%v2\n\t"
70
+ "vperm %%v16,%%v16,%%v2,%%v1\n\t"
71
+ "vl %%v18,32(%%r1,%[x])\n\t"
72
+ "vl %%v2,48(%%r1,%[x])\n\t"
73
+ "vpkg %%v19,%%v18,%%v2\n\t"
74
+ "vperm %%v18,%%v18,%%v2,%%v1\n\t"
75
+ "vl %%v20,64(%%r1,%[x])\n\t"
76
+ "vl %%v2,80(%%r1,%[x])\n\t"
77
+ "vpkg %%v21,%%v20,%%v2\n\t"
78
+ "vperm %%v20,%%v20,%%v2,%%v1\n\t"
79
+ "vl %%v22,96(%%r1,%[x])\n\t"
80
+ "vl %%v2,112(%%r1,%[x])\n\t"
81
+ "vpkg %%v23,%%v22,%%v2\n\t"
82
+ "vperm %%v22,%%v22,%%v2,%%v1\n\t"
83
+ "vl %%v24,128(%%r1,%[x])\n\t"
84
+ "vl %%v2,144(%%r1,%[x])\n\t"
85
+ "vpkg %%v25,%%v24,%%v2\n\t"
86
+ "vperm %%v24,%%v24,%%v2,%%v1\n\t"
87
+ "vl %%v26,160(%%r1,%[x])\n\t"
88
+ "vl %%v2,176(%%r1,%[x])\n\t"
89
+ "vpkg %%v27,%%v26,%%v2\n\t"
90
+ "vperm %%v26,%%v26,%%v2,%%v1\n\t"
91
+ "vl %%v28,192(%%r1,%[x])\n\t"
92
+ "vl %%v2,208(%%r1,%[x])\n\t"
93
+ "vpkg %%v29,%%v28,%%v2\n\t"
94
+ "vperm %%v28,%%v28,%%v2,%%v1\n\t"
95
+ "vl %%v30,224(%%r1,%[x])\n\t"
96
+ "vl %%v2,240(%%r1,%[x])\n\t"
97
+ "vpkg %%v31,%%v30,%%v2\n\t"
98
+ "vperm %%v30,%%v30,%%v2,%%v1\n\t"
99
+ "vflpsb %%v16,%%v16\n\t"
100
+ "vflpsb %%v17,%%v17\n\t"
101
+ "vflpsb %%v18,%%v18\n\t"
102
+ "vflpsb %%v19,%%v19\n\t"
103
+ "vflpsb %%v20,%%v20\n\t"
104
+ "vflpsb %%v21,%%v21\n\t"
105
+ "vflpsb %%v22,%%v22\n\t"
106
+ "vflpsb %%v23,%%v23\n\t"
107
+ "vflpsb %%v24,%%v24\n\t"
108
+ "vflpsb %%v25,%%v25\n\t"
109
+ "vflpsb %%v26,%%v26\n\t"
110
+ "vflpsb %%v27,%%v27\n\t"
111
+ "vflpsb %%v28,%%v28\n\t"
112
+ "vflpsb %%v29,%%v29\n\t"
113
+ "vflpsb %%v30,%%v30\n\t"
114
+ "vflpsb %%v31,%%v31\n\t"
115
+ "vfasb %%v16,%%v16,%%v17\n\t"
116
+ "vfasb %%v18,%%v18,%%v19\n\t"
117
+ "vfasb %%v20,%%v20,%%v21\n\t"
118
+ "vfasb %%v22,%%v22,%%v23\n\t"
119
+ "vfasb %%v24,%%v24,%%v25\n\t"
120
+ "vfasb %%v26,%%v26,%%v27\n\t"
121
+ "vfasb %%v28,%%v28,%%v29\n\t"
122
+ "vfasb %%v30,%%v30,%%v31\n\t"
123
+ "vfmaxsb %%v16,%%v16,%%v24,0\n\t"
124
+ "vfmaxsb %%v18,%%v18,%%v26,0\n\t"
125
+ "vfmaxsb %%v20,%%v20,%%v28,0\n\t"
126
+ "vfmaxsb %%v22,%%v22,%%v30,0\n\t"
127
+ "vfmaxsb %%v16,%%v16,%%v20,0\n\t"
128
+ "vfmaxsb %%v18,%%v18,%%v22,0\n\t"
129
+ "vfmaxsb %%v16,%%v16,%%v18,0\n\t"
130
+ "vfmaxsb %%v0,%%v0,%%v16,0\n\t"
131
+ "agfi %%r1, 256\n\t"
132
+ "brctg %[n], 0b\n\t"
133
+ "veslg %%v16,%%v0,32\n\t"
134
+ "vfmaxsb %%v0,%%v0,%%v16,0\n\t"
135
+ "vrepf %%v16,%%v0,2\n\t"
136
+ "wfmaxsb %%v0,%%v0,%%v16,0\n\t"
137
+ "ler %[amax],%%f0"
138
+ : [amax ] "=f" (amax ),[n ] "+&r" (n )
139
+ : "m" (* (const struct { FLOAT x [n * 2 ]; } * ) x ),[x ] "a" (x )
140
+ : "cc" , "r1" , "v0" , "v1" , "v2" , "v16" , "v17" , "v18" , "v19" , "v20" ,
141
+ "v21" , "v22" , "v23" , "v24" , "v25" , "v26" , "v27" , "v28" , "v29" , "v30" ,
142
+ "v31" );
143
143
144
144
return amax ;
145
145
}
0 commit comments