Skip to content

Commit 48b9b94

Browse files
authored
[ZARCH] Improve loading performance for camax/icamax
1 parent eaf20f0 commit 48b9b94

File tree

6 files changed, with 212 additions and 276 deletions.

kernel/zarch/camax.c

Lines changed: 56 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -52,82 +52,66 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
52 52
"vflpsb %%v0,%%v0 \n\t"
53 53
"vflpsb %%v16,%%v16 \n\t"
54 54
"vfasb %%v0,%%v0,%%v16 \n\t"
55+
"vleib %%v1,0,0 \n\t"
56+
"vleib %%v1,1,1 \n\t"
57+
"vleib %%v1,2,2 \n\t"
58+
"vleib %%v1,3,3 \n\t"
59+
"vleib %%v1,8,4 \n\t"
60+
"vleib %%v1,9,5 \n\t"
61+
"vleib %%v1,10,6 \n\t"
62+
"vleib %%v1,11,7 \n\t"
63+
"vleib %%v1,16,8 \n\t"
64+
"vleib %%v1,17,9 \n\t"
65+
"vleib %%v1,18,10 \n\t"
66+
"vleib %%v1,19,11 \n\t"
67+
"vleib %%v1,24,12 \n\t"
68+
"vleib %%v1,25,13 \n\t"
69+
"vleib %%v1,26,14 \n\t"
70+
"vleib %%v1,27,15 \n\t"
55 71
"srlg %%r0,%1,5 \n\t"
56 72
"xgr %%r1,%%r1 \n\t"
57 73
"0: \n\t"
58 74
"pfd 1, 1024(%%r1,%2) \n\t"
59 75

60-
"vlef %%v16,0(%%r1,%2),0 \n\t"
61-
"vlef %%v17,4(%%r1,%2),0 \n\t"
62-
"vlef %%v16,8(%%r1,%2),1 \n\t"
63-
"vlef %%v17,12(%%r1,%2),1 \n\t"
64-
"vlef %%v16,16(%%r1,%2),2 \n\t"
65-
"vlef %%v17,20(%%r1,%2),2 \n\t"
66-
"vlef %%v16,24(%%r1,%2),3 \n\t"
67-
"vlef %%v17,28(%%r1,%2),3 \n\t"
68-
69-
"vlef %%v18,32(%%r1,%2),0 \n\t"
70-
"vlef %%v19,36(%%r1,%2),0 \n\t"
71-
"vlef %%v18,40(%%r1,%2),1 \n\t"
72-
"vlef %%v19,44(%%r1,%2),1 \n\t"
73-
"vlef %%v18,48(%%r1,%2),2 \n\t"
74-
"vlef %%v19,52(%%r1,%2),2 \n\t"
75-
"vlef %%v18,56(%%r1,%2),3 \n\t"
76-
"vlef %%v19,30(%%r1,%2),3 \n\t"
77-
78-
"vlef %%v20,64(%%r1,%2),0 \n\t"
79-
"vlef %%v21,68(%%r1,%2),0 \n\t"
80-
"vlef %%v20,72(%%r1,%2),1 \n\t"
81-
"vlef %%v21,76(%%r1,%2),1 \n\t"
82-
"vlef %%v20,80(%%r1,%2),2 \n\t"
83-
"vlef %%v21,84(%%r1,%2),2 \n\t"
84-
"vlef %%v20,88(%%r1,%2),3 \n\t"
85-
"vlef %%v21,92(%%r1,%2),3 \n\t"
86-
87-
"vlef %%v22,96(%%r1,%2),0 \n\t"
88-
"vlef %%v23,100(%%r1,%2),0 \n\t"
89-
"vlef %%v22,104(%%r1,%2),1 \n\t"
90-
"vlef %%v23,108(%%r1,%2),1 \n\t"
91-
"vlef %%v22,112(%%r1,%2),2 \n\t"
92-
"vlef %%v23,116(%%r1,%2),2 \n\t"
93-
"vlef %%v22,120(%%r1,%2),3 \n\t"
94-
"vlef %%v23,124(%%r1,%2),3 \n\t"
95-
96-
"vlef %%v24,128(%%r1,%2),0 \n\t"
97-
"vlef %%v25,132(%%r1,%2),0 \n\t"
98-
"vlef %%v24,136(%%r1,%2),1 \n\t"
99-
"vlef %%v25,140(%%r1,%2),1 \n\t"
100-
"vlef %%v24,144(%%r1,%2),2 \n\t"
101-
"vlef %%v25,148(%%r1,%2),2 \n\t"
102-
"vlef %%v24,152(%%r1,%2),3 \n\t"
103-
"vlef %%v25,156(%%r1,%2),3 \n\t"
104-
105-
"vlef %%v26,160(%%r1,%2),0 \n\t"
106-
"vlef %%v27,164(%%r1,%2),0 \n\t"
107-
"vlef %%v26,168(%%r1,%2),1 \n\t"
108-
"vlef %%v27,172(%%r1,%2),1 \n\t"
109-
"vlef %%v26,176(%%r1,%2),2 \n\t"
110-
"vlef %%v27,180(%%r1,%2),2 \n\t"
111-
"vlef %%v26,184(%%r1,%2),3 \n\t"
112-
"vlef %%v27,188(%%r1,%2),3 \n\t"
113-
114-
"vlef %%v28,192(%%r1,%2),0 \n\t"
115-
"vlef %%v29,196(%%r1,%2),0 \n\t"
116-
"vlef %%v28,200(%%r1,%2),1 \n\t"
117-
"vlef %%v29,204(%%r1,%2),1 \n\t"
118-
"vlef %%v28,208(%%r1,%2),2 \n\t"
119-
"vlef %%v29,212(%%r1,%2),2 \n\t"
120-
"vlef %%v28,216(%%r1,%2),3 \n\t"
121-
"vlef %%v29,220(%%r1,%2),3 \n\t"
122-
123-
"vlef %%v30,224(%%r1,%2),0 \n\t"
124-
"vlef %%v31,228(%%r1,%2),0 \n\t"
125-
"vlef %%v30,232(%%r1,%2),1 \n\t"
126-
"vlef %%v31,236(%%r1,%2),1 \n\t"
127-
"vlef %%v30,240(%%r1,%2),2 \n\t"
128-
"vlef %%v31,244(%%r1,%2),2 \n\t"
129-
"vlef %%v30,248(%%r1,%2),3 \n\t"
130-
"vlef %%v31,252(%%r1,%2),3 \n\t"
76+
"vl %%v16,0(%%r1,%2) \n\t"
77+
"vl %%v2,16(%%r1,%2) \n\t"
78+
"vpkg %%v17,%%v16,%%v2 \n\t"
79+
"vperm %%v16,%%v16,%%v2,%%v1 \n\t"
80+
81+
"vl %%v18,32(%%r1,%2) \n\t"
82+
"vl %%v2,48(%%r1,%2) \n\t"
83+
"vpkg %%v19,%%v18,%%v2 \n\t"
84+
"vperm %%v18,%%v18,%%v2,%%v1 \n\t"
85+
86+
"vl %%v20,64(%%r1,%2) \n\t"
87+
"vl %%v2,80(%%r1,%2) \n\t"
88+
"vpkg %%v21,%%v20,%%v2 \n\t"
89+
"vperm %%v20,%%v20,%%v2,%%v1 \n\t"
90+
91+
"vl %%v22,96(%%r1,%2) \n\t"
92+
"vl %%v2,112(%%r1,%2) \n\t"
93+
"vpkg %%v23,%%v22,%%v2 \n\t"
94+
"vperm %%v22,%%v22,%%v2,%%v1 \n\t"
95+
96+
"vl %%v24,128(%%r1,%2) \n\t"
97+
"vl %%v2,144(%%r1,%2) \n\t"
98+
"vpkg %%v25,%%v24,%%v2 \n\t"
99+
"vperm %%v24,%%v24,%%v2,%%v1 \n\t"
100+
101+
"vl %%v26,160(%%r1,%2) \n\t"
102+
"vl %%v2,176(%%r1,%2) \n\t"
103+
"vpkg %%v27,%%v26,%%v2 \n\t"
104+
"vperm %%v26,%%v26,%%v2,%%v1 \n\t"
105+
106+
"vl %%v28,192(%%r1,%2) \n\t"
107+
"vl %%v2,208(%%r1,%2) \n\t"
108+
"vpkg %%v29,%%v28,%%v2 \n\t"
109+
"vperm %%v28,%%v28,%%v2,%%v1 \n\t"
110+
111+
"vl %%v30,224(%%r1,%2) \n\t"
112+
"vl %%v2,240(%%r1,%2) \n\t"
113+
"vpkg %%v31,%%v30,%%v2 \n\t"
114+
"vperm %%v30,%%v30,%%v2,%%v1 \n\t"
131 115

132 116
"vflpsb %%v16,%%v16 \n\t"
133 117
"vflpsb %%v17,%%v17 \n\t"
@@ -178,7 +162,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
178 162
"ler %0,%%f0 "
179 163
:"=f"(amax)
180 164
:"r"(n),"ZR"((const FLOAT (*)[n])x)
181-
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
165+
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
182 166
);
183 167

184 168
return amax;

kernel/zarch/camin.c

Lines changed: 56 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -52,82 +52,66 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
52 52
"vflpsb %%v0,%%v0 \n\t"
53 53
"vflpsb %%v16,%%v16 \n\t"
54 54
"vfasb %%v0,%%v0,%%v16 \n\t"
55+
"vleib %%v1,0,0 \n\t"
56+
"vleib %%v1,1,1 \n\t"
57+
"vleib %%v1,2,2 \n\t"
58+
"vleib %%v1,3,3 \n\t"
59+
"vleib %%v1,8,4 \n\t"
60+
"vleib %%v1,9,5 \n\t"
61+
"vleib %%v1,10,6 \n\t"
62+
"vleib %%v1,11,7 \n\t"
63+
"vleib %%v1,16,8 \n\t"
64+
"vleib %%v1,17,9 \n\t"
65+
"vleib %%v1,18,10 \n\t"
66+
"vleib %%v1,19,11 \n\t"
67+
"vleib %%v1,24,12 \n\t"
68+
"vleib %%v1,25,13 \n\t"
69+
"vleib %%v1,26,14 \n\t"
70+
"vleib %%v1,27,15 \n\t"
55 71
"srlg %%r0,%1,5 \n\t"
56 72
"xgr %%r1,%%r1 \n\t"
57 73
"0: \n\t"
58 74
"pfd 1, 1024(%%r1,%2) \n\t"
59 75

60-
"vlef %%v16,0(%%r1,%2),0 \n\t"
61-
"vlef %%v17,4(%%r1,%2),0 \n\t"
62-
"vlef %%v16,8(%%r1,%2),1 \n\t"
63-
"vlef %%v17,12(%%r1,%2),1 \n\t"
64-
"vlef %%v16,16(%%r1,%2),2 \n\t"
65-
"vlef %%v17,20(%%r1,%2),2 \n\t"
66-
"vlef %%v16,24(%%r1,%2),3 \n\t"
67-
"vlef %%v17,28(%%r1,%2),3 \n\t"
68-
69-
"vlef %%v18,32(%%r1,%2),0 \n\t"
70-
"vlef %%v19,36(%%r1,%2),0 \n\t"
71-
"vlef %%v18,40(%%r1,%2),1 \n\t"
72-
"vlef %%v19,44(%%r1,%2),1 \n\t"
73-
"vlef %%v18,48(%%r1,%2),2 \n\t"
74-
"vlef %%v19,52(%%r1,%2),2 \n\t"
75-
"vlef %%v18,56(%%r1,%2),3 \n\t"
76-
"vlef %%v19,30(%%r1,%2),3 \n\t"
77-
78-
"vlef %%v20,64(%%r1,%2),0 \n\t"
79-
"vlef %%v21,68(%%r1,%2),0 \n\t"
80-
"vlef %%v20,72(%%r1,%2),1 \n\t"
81-
"vlef %%v21,76(%%r1,%2),1 \n\t"
82-
"vlef %%v20,80(%%r1,%2),2 \n\t"
83-
"vlef %%v21,84(%%r1,%2),2 \n\t"
84-
"vlef %%v20,88(%%r1,%2),3 \n\t"
85-
"vlef %%v21,92(%%r1,%2),3 \n\t"
86-
87-
"vlef %%v22,96(%%r1,%2),0 \n\t"
88-
"vlef %%v23,100(%%r1,%2),0 \n\t"
89-
"vlef %%v22,104(%%r1,%2),1 \n\t"
90-
"vlef %%v23,108(%%r1,%2),1 \n\t"
91-
"vlef %%v22,112(%%r1,%2),2 \n\t"
92-
"vlef %%v23,116(%%r1,%2),2 \n\t"
93-
"vlef %%v22,120(%%r1,%2),3 \n\t"
94-
"vlef %%v23,124(%%r1,%2),3 \n\t"
95-
96-
"vlef %%v24,128(%%r1,%2),0 \n\t"
97-
"vlef %%v25,132(%%r1,%2),0 \n\t"
98-
"vlef %%v24,136(%%r1,%2),1 \n\t"
99-
"vlef %%v25,140(%%r1,%2),1 \n\t"
100-
"vlef %%v24,144(%%r1,%2),2 \n\t"
101-
"vlef %%v25,148(%%r1,%2),2 \n\t"
102-
"vlef %%v24,152(%%r1,%2),3 \n\t"
103-
"vlef %%v25,156(%%r1,%2),3 \n\t"
104-
105-
"vlef %%v26,160(%%r1,%2),0 \n\t"
106-
"vlef %%v27,164(%%r1,%2),0 \n\t"
107-
"vlef %%v26,168(%%r1,%2),1 \n\t"
108-
"vlef %%v27,172(%%r1,%2),1 \n\t"
109-
"vlef %%v26,176(%%r1,%2),2 \n\t"
110-
"vlef %%v27,180(%%r1,%2),2 \n\t"
111-
"vlef %%v26,184(%%r1,%2),3 \n\t"
112-
"vlef %%v27,188(%%r1,%2),3 \n\t"
113-
114-
"vlef %%v28,192(%%r1,%2),0 \n\t"
115-
"vlef %%v29,196(%%r1,%2),0 \n\t"
116-
"vlef %%v28,200(%%r1,%2),1 \n\t"
117-
"vlef %%v29,204(%%r1,%2),1 \n\t"
118-
"vlef %%v28,208(%%r1,%2),2 \n\t"
119-
"vlef %%v29,212(%%r1,%2),2 \n\t"
120-
"vlef %%v28,216(%%r1,%2),3 \n\t"
121-
"vlef %%v29,220(%%r1,%2),3 \n\t"
122-
123-
"vlef %%v30,224(%%r1,%2),0 \n\t"
124-
"vlef %%v31,228(%%r1,%2),0 \n\t"
125-
"vlef %%v30,232(%%r1,%2),1 \n\t"
126-
"vlef %%v31,236(%%r1,%2),1 \n\t"
127-
"vlef %%v30,240(%%r1,%2),2 \n\t"
128-
"vlef %%v31,244(%%r1,%2),2 \n\t"
129-
"vlef %%v30,248(%%r1,%2),3 \n\t"
130-
"vlef %%v31,252(%%r1,%2),3 \n\t"
76+
"vl %%v16,0(%%r1,%2) \n\t"
77+
"vl %%v2,16(%%r1,%2) \n\t"
78+
"vpkg %%v17,%%v16,%%v2 \n\t"
79+
"vperm %%v16,%%v16,%%v2,%%v1 \n\t"
80+
81+
"vl %%v18,32(%%r1,%2) \n\t"
82+
"vl %%v2,48(%%r1,%2) \n\t"
83+
"vpkg %%v19,%%v18,%%v2 \n\t"
84+
"vperm %%v18,%%v18,%%v2,%%v1 \n\t"
85+
86+
"vl %%v20,64(%%r1,%2) \n\t"
87+
"vl %%v2,80(%%r1,%2) \n\t"
88+
"vpkg %%v21,%%v20,%%v2 \n\t"
89+
"vperm %%v20,%%v20,%%v2,%%v1 \n\t"
90+
91+
"vl %%v22,96(%%r1,%2) \n\t"
92+
"vl %%v2,112(%%r1,%2) \n\t"
93+
"vpkg %%v23,%%v22,%%v2 \n\t"
94+
"vperm %%v22,%%v22,%%v2,%%v1 \n\t"
95+
96+
"vl %%v24,128(%%r1,%2) \n\t"
97+
"vl %%v2,144(%%r1,%2) \n\t"
98+
"vpkg %%v25,%%v24,%%v2 \n\t"
99+
"vperm %%v24,%%v24,%%v2,%%v1 \n\t"
100+
101+
"vl %%v26,160(%%r1,%2) \n\t"
102+
"vl %%v2,176(%%r1,%2) \n\t"
103+
"vpkg %%v27,%%v26,%%v2 \n\t"
104+
"vperm %%v26,%%v26,%%v2,%%v1 \n\t"
105+
106+
"vl %%v28,192(%%r1,%2) \n\t"
107+
"vl %%v2,208(%%r1,%2) \n\t"
108+
"vpkg %%v29,%%v28,%%v2 \n\t"
109+
"vperm %%v28,%%v28,%%v2,%%v1 \n\t"
110+
111+
"vl %%v30,224(%%r1,%2) \n\t"
112+
"vl %%v2,240(%%r1,%2) \n\t"
113+
"vpkg %%v31,%%v30,%%v2 \n\t"
114+
"vperm %%v30,%%v30,%%v2,%%v1 \n\t"
131 115

132 116
"vflpsb %%v16,%%v16 \n\t"
133 117
"vflpsb %%v17,%%v17 \n\t"
@@ -178,7 +162,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
178 162
"ler %0,%%f0 "
179 163
:"=f"(amin)
180 164
:"r"(n),"ZR"((const FLOAT (*)[n])x)
181-
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
165+
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
182 166
);
183 167

184 168
return amin;

0 commit comments

Comments
 (0)