Skip to content

Commit 87669d1

Browse files
Small fixes; some microkernels (i(dz)amin, i(dz)amax, (dz)dot, (dz)asum) can now be inlined
1 parent def146e commit 87669d1

File tree

9 files changed

+404
-356
lines changed

9 files changed

+404
-356
lines changed

kernel/zarch/dasum.c

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3636
#endif
3737

3838

39-
static FLOAT __attribute__ ((noinline)) dasum_kernel_32(BLASLONG n, FLOAT *x) {
40-
41-
__asm__ (
42-
"pfd 1, 0(%1) \n\t"
43-
"sllg %%r0,%0,3 \n\t"
44-
"agr %%r0,%1 \n\t"
39+
40+
41+
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
42+
FLOAT asum ;
43+
__asm__ (
44+
"pfd 1, 0(%3) \n\t"
45+
"sllg %%r0,%2,3 \n\t"
46+
"agr %%r0,%3 \n\t"
4547
"vzero %%v0 \n\t"
4648
"vzero %%v1 \n\t"
4749
"vzero %%v2 \n\t"
@@ -95,15 +97,18 @@ static FLOAT __attribute__ ((noinline)) dasum_kernel_32(BLASLONG n, FLOAT *x) {
9597
"vfadb %%v0,%%v25,%%v24 \n\t"
9698
"vrepg %%v1,%%v0,1 \n\t"
9799
"adbr %%f0,%%f1 \n\t"
98-
:
99-
: "r"(n), "a"(x)
100-
: "cc", "memory","r0","f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
100+
"ldr %0,%%f0 \n\t"
101+
: "=f"(asum),"+&a"(x)
102+
: "r"(n), "1"(x)
103+
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
101104
);
105+
return asum;
102106

103107
}
104108

105109

106110

111+
107112
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
108113
BLASLONG i = 0;
109114
BLASLONG j = 0;

kernel/zarch/ddot.c

Lines changed: 66 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -30,75 +30,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3030

3131

3232
#if defined(Z13)
33-
static void __attribute__ ((noinline)) ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
33+
static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y)
3434
{
35-
36-
__asm__ volatile(
37-
"pfd 1, 0(%1) \n\t"
38-
"pfd 1, 0(%2) \n\t"
39-
"vzero %%v24 \n\t"
40-
"vzero %%v25 \n\t"
41-
"vzero %%v26 \n\t"
42-
"vzero %%v27 \n\t"
43-
"srlg %%r0,%0,4 \n\t"
44-
"xgr %%r1,%%r1 \n\t"
45-
".align 16 \n\t"
46-
"1: \n\t"
47-
"pfd 1, 256(%%r1,%1) \n\t"
48-
"pfd 1, 256(%%r1,%2) \n\t"
49-
"vl %%v16, 0(%%r1,%1) \n\t"
50-
"vl %%v17, 16(%%r1,%1) \n\t"
51-
"vl %%v18, 32(%%r1,%1) \n\t"
52-
"vl %%v19, 48(%%r1,%1) \n\t"
53-
54-
"vl %%v28, 0(%%r1,%2) \n\t"
55-
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
56-
"vl %%v29, 16(%%r1,%2) \n\t"
57-
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
35+
FLOAT dot;
36+
__asm__ volatile(
37+
"pfd 1, 0(%2) \n\t"
38+
"pfd 1, 0(%3) \n\t"
39+
"vzero %%v24 \n\t"
40+
"vzero %%v25 \n\t"
41+
"vzero %%v26 \n\t"
42+
"vzero %%v27 \n\t"
43+
"srlg %1,%1,4 \n\t"
44+
"xgr %%r1,%%r1 \n\t"
45+
".align 16 \n\t"
46+
"1: \n\t"
47+
"pfd 1, 256(%%r1,%2) \n\t"
48+
"pfd 1, 256(%%r1,%3) \n\t"
49+
"vl %%v16, 0(%%r1,%2) \n\t"
50+
"vl %%v17, 16(%%r1,%2) \n\t"
51+
"vl %%v18, 32(%%r1,%2) \n\t"
52+
"vl %%v19, 48(%%r1,%2) \n\t"
53+
54+
"vl %%v28, 0(%%r1,%3) \n\t"
55+
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
56+
"vl %%v29, 16(%%r1,%3) \n\t"
57+
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
58+
59+
"vl %%v30, 32(%%r1,%3) \n\t"
60+
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
61+
"vl %%v31, 48(%%r1,%3) \n\t"
62+
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
63+
64+
"vl %%v16, 64(%%r1,%2) \n\t"
65+
"vl %%v17, 80(%%r1,%2) \n\t"
66+
"vl %%v18, 96(%%r1,%2) \n\t"
67+
"vl %%v19, 112(%%r1,%2) \n\t"
68+
69+
"vl %%v28, 64(%%r1,%3) \n\t"
70+
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
71+
"vl %%v29, 80(%%r1,%3) \n\t"
72+
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
73+
74+
75+
"vl %%v30, 96(%%r1,%3) \n\t"
76+
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
77+
"vl %%v31, 112(%%r1,%3) \n\t"
78+
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
5879

59-
"vl %%v30, 32(%%r1,%2) \n\t"
60-
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
61-
"vl %%v31, 48(%%r1,%2) \n\t"
62-
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
63-
64-
"vl %%v16, 64(%%r1,%1) \n\t"
65-
"vl %%v17, 80(%%r1,%1) \n\t"
66-
"vl %%v18, 96(%%r1,%1) \n\t"
67-
"vl %%v19, 112(%%r1,%1) \n\t"
68-
69-
"vl %%v28, 64(%%r1,%2) \n\t"
70-
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
71-
"vl %%v29, 80(%%r1,%2) \n\t"
72-
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
73-
74-
75-
"vl %%v30, 96(%%r1,%2) \n\t"
76-
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
77-
"vl %%v31, 112(%%r1,%2) \n\t"
78-
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
79-
80-
81-
"la %%r1,128(%%r1) \n\t"
82-
"brctg %%r0,1b \n\t"
83-
"vfadb %%v24,%%v25,%%v24 \n\t"
84-
"vfadb %%v24,%%v26,%%v24 \n\t"
85-
"vfadb %%v24,%%v27,%%v24 \n\t"
86-
"vrepg %%v1,%%v24,1 \n\t"
87-
"vfadb %%v1,%%v24,%%v1 \n\t"
88-
" std %%f1,0(%3) \n\t"
89-
:
90-
:"r"(n),"a"(x),"a"(y),"a"(d)
91-
:"cc" , "memory" ,"r0","r1","v16", "v17","v18","v19","v20","v21","v22","v23",
92-
"v24","v25","v26","v27","v28","v29","v30","v31"
93-
94-
);
80+
81+
"la %%r1,128(%%r1) \n\t"
82+
"brctg %1,1b \n\t"
83+
"vfadb %%v24,%%v25,%%v24 \n\t"
84+
"vfadb %%v24,%%v26,%%v24 \n\t"
85+
"vfadb %%v24,%%v27,%%v24 \n\t"
86+
"vrepg %%v1,%%v24,1 \n\t"
87+
"vfadb %%v1,%%v24,%%v1 \n\t"
88+
"ldr %0, %%f1 \n\t"
89+
: "=f"(dot) ,"+&r"(n)
90+
: "a"(x),"a"(y)
91+
:"cc" , "r1","v16", "v17","v18","v19","v20","v21","v22","v23",
92+
"v24","v25","v26","v27","v28","v29","v30","v31"
93+
94+
);
95+
return dot;
9596

9697
}
9798

9899

99100
#else
100101

101-
static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
102+
static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y )
102103
{
103104
BLASLONG register i = 0;
104105
FLOAT dot = 0.0;
@@ -117,8 +118,8 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
117118
i+=8 ;
118119

119120
}
120-
*d += dot;
121-
121+
return dot;
122+
122123
}
123124

124125
#endif
@@ -136,9 +137,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
136137
{
137138

138139
BLASLONG n1 = n & -16;
139-
140+
140141
if ( n1 )
141-
ddot_kernel_8(n1, x, y , &dot );
142+
dot = ddot_kernel_8(n1, x, y );
142143

143144
i = n1;
144145
while(i < n)

kernel/zarch/dgemv_n_4.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *
186186

187187
#endif
188188

189-
190189

191-
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
192190

193191
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
194192
{

0 commit comments

Comments
 (0)