@@ -78,7 +78,17 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
78
78
temp7 += v_x [i ] * va7 [i ];
79
79
}
80
80
81
-
81
+ #if defined(POWER8 )
82
+ y [0 ] += alpha * (temp0 [0 ] + temp0 [1 ]+ temp0 [2 ] + temp0 [3 ]);
83
+ y [1 ] += alpha * (temp1 [0 ] + temp1 [1 ]+ temp1 [2 ] + temp1 [3 ]);
84
+ y [2 ] += alpha * (temp2 [0 ] + temp2 [1 ]+ temp2 [2 ] + temp2 [3 ]);
85
+ y [3 ] += alpha * (temp3 [0 ] + temp3 [1 ]+ temp3 [2 ] + temp3 [3 ]);
86
+
87
+ y [4 ] += alpha * (temp4 [0 ] + temp4 [1 ]+ temp4 [2 ] + temp4 [3 ]);
88
+ y [5 ] += alpha * (temp5 [0 ] + temp5 [1 ]+ temp5 [2 ] + temp5 [3 ]);
89
+ y [6 ] += alpha * (temp6 [0 ] + temp6 [1 ]+ temp6 [2 ] + temp6 [3 ]);
90
+ y [7 ] += alpha * (temp7 [0 ] + temp7 [1 ]+ temp7 [2 ] + temp7 [3 ]);
91
+ #else
82
92
register __vector float t0 , t1 , t2 , t3 ;
83
93
register __vector float a = { alpha , alpha , alpha , alpha };
84
94
__vector float * v_y = (__vector float * ) y ;
@@ -105,7 +115,7 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
105
115
106
116
v_y [0 ] += a * temp0 ;
107
117
v_y [1 ] += a * temp4 ;
108
-
118
+ #endif
109
119
}
110
120
111
121
@@ -132,7 +142,12 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
132
142
temp2 += v_x [i ] * va2 [i ];
133
143
temp3 += v_x [i ] * va3 [i ];
134
144
}
135
-
145
+ #if defined(POWER8 )
146
+ y [0 ] += alpha * (temp0 [0 ] + temp0 [1 ]+ temp0 [2 ] + temp0 [3 ]);
147
+ y [1 ] += alpha * (temp1 [0 ] + temp1 [1 ]+ temp1 [2 ] + temp1 [3 ]);
148
+ y [2 ] += alpha * (temp2 [0 ] + temp2 [1 ]+ temp2 [2 ] + temp2 [3 ]);
149
+ y [3 ] += alpha * (temp3 [0 ] + temp3 [1 ]+ temp3 [2 ] + temp3 [3 ]);
150
+ #else
136
151
register __vector float t0 , t1 , t2 , t3 ;
137
152
register __vector float a = { alpha , alpha , alpha , alpha };
138
153
__vector float * v_y = (__vector float * ) y ;
@@ -148,7 +163,7 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
148
163
temp0 += temp1 + temp2 + temp3 ;
149
164
150
165
v_y [0 ] += a * temp0 ;
151
-
166
+ #endif
152
167
}
153
168
154
169
0 commit comments