@@ -30,75 +30,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
30
31
31
32
32
#if defined(Z13 )
33
- static void __attribute__ (( noinline )) ddot_kernel_8 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * d )
33
+ static FLOAT ddot_kernel_8 (BLASLONG n , FLOAT * x , FLOAT * y )
34
34
{
35
-
36
- __asm__ volatile (
37
- "pfd 1, 0(%1) \n\t"
38
- "pfd 1, 0(%2) \n\t"
39
- "vzero %%v24 \n\t"
40
- "vzero %%v25 \n\t"
41
- "vzero %%v26 \n\t"
42
- "vzero %%v27 \n\t"
43
- "srlg %%r0,%0,4 \n\t"
44
- "xgr %%r1,%%r1 \n\t"
45
- ".align 16 \n\t"
46
- "1: \n\t"
47
- "pfd 1, 256(%%r1,%1) \n\t"
48
- "pfd 1, 256(%%r1,%2) \n\t"
49
- "vl %%v16, 0(%%r1,%1) \n\t"
50
- "vl %%v17, 16(%%r1,%1) \n\t"
51
- "vl %%v18, 32(%%r1,%1) \n\t"
52
- "vl %%v19, 48(%%r1,%1) \n\t"
53
-
54
- "vl %%v28, 0(%%r1,%2) \n\t"
55
- "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
56
- "vl %%v29, 16(%%r1,%2) \n\t"
57
- "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
35
+ FLOAT dot ;
36
+ __asm__ volatile (
37
+ "pfd 1, 0(%2) \n\t"
38
+ "pfd 1, 0(%3) \n\t"
39
+ "vzero %%v24 \n\t"
40
+ "vzero %%v25 \n\t"
41
+ "vzero %%v26 \n\t"
42
+ "vzero %%v27 \n\t"
43
+ "srlg %1,%1,4 \n\t"
44
+ "xgr %%r1,%%r1 \n\t"
45
+ ".align 16 \n\t"
46
+ "1: \n\t"
47
+ "pfd 1, 256(%%r1,%2) \n\t"
48
+ "pfd 1, 256(%%r1,%3) \n\t"
49
+ "vl %%v16, 0(%%r1,%2) \n\t"
50
+ "vl %%v17, 16(%%r1,%2) \n\t"
51
+ "vl %%v18, 32(%%r1,%2) \n\t"
52
+ "vl %%v19, 48(%%r1,%2) \n\t"
53
+
54
+ "vl %%v28, 0(%%r1,%3) \n\t"
55
+ "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
56
+ "vl %%v29, 16(%%r1,%3) \n\t"
57
+ "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
58
+
59
+ "vl %%v30, 32(%%r1,%3) \n\t"
60
+ "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
61
+ "vl %%v31, 48(%%r1,%3) \n\t"
62
+ "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
63
+
64
+ "vl %%v16, 64(%%r1,%2) \n\t"
65
+ "vl %%v17, 80(%%r1,%2) \n\t"
66
+ "vl %%v18, 96(%%r1,%2) \n\t"
67
+ "vl %%v19, 112(%%r1,%2) \n\t"
68
+
69
+ "vl %%v28, 64(%%r1,%3) \n\t"
70
+ "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
71
+ "vl %%v29, 80(%%r1,%3) \n\t"
72
+ "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
73
+
74
+
75
+ "vl %%v30, 96(%%r1,%3) \n\t"
76
+ "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
77
+ "vl %%v31, 112(%%r1,%3) \n\t"
78
+ "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
58
79
59
- "vl %%v30, 32(%%r1,%2) \n\t"
60
- "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
61
- "vl %%v31, 48(%%r1,%2) \n\t"
62
- "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
63
-
64
- "vl %%v16, 64(%%r1,%1) \n\t"
65
- "vl %%v17, 80(%%r1,%1) \n\t"
66
- "vl %%v18, 96(%%r1,%1) \n\t"
67
- "vl %%v19, 112(%%r1,%1) \n\t"
68
-
69
- "vl %%v28, 64(%%r1,%2) \n\t"
70
- "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
71
- "vl %%v29, 80(%%r1,%2) \n\t"
72
- "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
73
-
74
-
75
- "vl %%v30, 96(%%r1,%2) \n\t"
76
- "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
77
- "vl %%v31, 112(%%r1,%2) \n\t"
78
- "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
79
-
80
-
81
- "la %%r1,128(%%r1) \n\t"
82
- "brctg %%r0,1b \n\t"
83
- "vfadb %%v24,%%v25,%%v24 \n\t"
84
- "vfadb %%v24,%%v26,%%v24 \n\t"
85
- "vfadb %%v24,%%v27,%%v24 \n\t"
86
- "vrepg %%v1,%%v24,1 \n\t"
87
- "vfadb %%v1,%%v24,%%v1 \n\t"
88
- " std %%f1,0(%3) \n\t"
89
- :
90
- :"r" (n ),"a" (x ),"a" (y ),"a" (d )
91
- :"cc" , "memory" ,"r0" ,"r1" ,"v16" , "v17" ,"v18" ,"v19" ,"v20" ,"v21" ,"v22" ,"v23" ,
92
- "v24" ,"v25" ,"v26" ,"v27" ,"v28" ,"v29" ,"v30" ,"v31"
93
-
94
- );
80
+
81
+ "la %%r1,128(%%r1) \n\t"
82
+ "brctg %1,1b \n\t"
83
+ "vfadb %%v24,%%v25,%%v24 \n\t"
84
+ "vfadb %%v24,%%v26,%%v24 \n\t"
85
+ "vfadb %%v24,%%v27,%%v24 \n\t"
86
+ "vrepg %%v1,%%v24,1 \n\t"
87
+ "vfadb %%v1,%%v24,%%v1 \n\t"
88
+ "ldr %0, %%f1 \n\t"
89
+ : "=f" (dot ) ,"+&r" (n )
90
+ : "a" (x ),"a" (y )
91
+ :"cc" , "r1" ,"v16" , "v17" ,"v18" ,"v19" ,"v20" ,"v21" ,"v22" ,"v23" ,
92
+ "v24" ,"v25" ,"v26" ,"v27" ,"v28" ,"v29" ,"v30" ,"v31"
93
+
94
+ );
95
+ return dot ;
95
96
96
97
}
97
98
98
99
99
100
#else
100
101
101
- static void ddot_kernel_8 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * d )
102
+ static FLOAT ddot_kernel_8 (BLASLONG n , FLOAT * x , FLOAT * y )
102
103
{
103
104
BLASLONG register i = 0 ;
104
105
FLOAT dot = 0.0 ;
@@ -117,8 +118,8 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
117
118
i += 8 ;
118
119
119
120
}
120
- * d += dot ;
121
-
121
+ return dot ;
122
+
122
123
}
123
124
124
125
#endif
@@ -136,9 +137,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
136
137
{
137
138
138
139
BLASLONG n1 = n & -16 ;
139
-
140
+
140
141
if ( n1 )
141
- ddot_kernel_8 (n1 , x , y , & dot );
142
+ dot = ddot_kernel_8 (n1 , x , y );
142
143
143
144
i = n1 ;
144
145
while (i < n )
0 commit comments