@@ -24,12 +24,21 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
 #include "common.h"
-
-
 #ifndef HAVE_ASM_KERNEL
 #include <altivec.h>
+
+#define offset_0 0
+#define offset_1 16
+#define offset_2 32
+#define offset_3 48
+#define offset_4 64
+#define offset_5 80
+#define offset_6 96
+#define offset_7 112
+
+static const unsigned char __attribute__((aligned(16))) swap_mask_arr[] = { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
+
 static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
 {
@@ -43,28 +52,29 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
     register __vector float valpha_i = {alpha_i, alpha_i, alpha_i, alpha_i};
 #endif
 
-    __vector unsigned char swap_mask = { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
-    register __vector float *vy = (__vector float *) y;
-    register __vector float *vx = (__vector float *) x;
+    __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);
+    register __vector float *vptr_y = (__vector float *) y;
+    register __vector float *vptr_x = (__vector float *) x;
     BLASLONG i = 0;
-    for (; i < n/2; i += 8) {
+    for (;i < n/2;i += 8) {
+
+        register __vector float vy_0 = vec_vsx_ld(offset_0, vptr_y);
+        register __vector float vy_1 = vec_vsx_ld(offset_1, vptr_y);
+        register __vector float vy_2 = vec_vsx_ld(offset_2, vptr_y);
+        register __vector float vy_3 = vec_vsx_ld(offset_3, vptr_y);
+        register __vector float vy_4 = vec_vsx_ld(offset_4, vptr_y);
+        register __vector float vy_5 = vec_vsx_ld(offset_5, vptr_y);
+        register __vector float vy_6 = vec_vsx_ld(offset_6, vptr_y);
+        register __vector float vy_7 = vec_vsx_ld(offset_7, vptr_y);
 
-        register __vector float vy_0 = vy[i];
-        register __vector float vy_1 = vy[i + 1];
-        register __vector float vy_2 = vy[i + 2];
-        register __vector float vy_3 = vy[i + 3];
-        register __vector float vy_4 = vy[i + 4];
-        register __vector float vy_5 = vy[i + 5];
-        register __vector float vy_6 = vy[i + 6];
-        register __vector float vy_7 = vy[i + 7];
-        register __vector float vx_0 = vx[i];
-        register __vector float vx_1 = vx[i + 1];
-        register __vector float vx_2 = vx[i + 2];
-        register __vector float vx_3 = vx[i + 3];
-        register __vector float vx_4 = vx[i + 4];
-        register __vector float vx_5 = vx[i + 5];
-        register __vector float vx_6 = vx[i + 6];
-        register __vector float vx_7 = vx[i + 7];
+        register __vector float vx_0 = vec_vsx_ld(offset_0, vptr_x);
+        register __vector float vx_1 = vec_vsx_ld(offset_1, vptr_x);
+        register __vector float vx_2 = vec_vsx_ld(offset_2, vptr_x);
+        register __vector float vx_3 = vec_vsx_ld(offset_3, vptr_x);
+        register __vector float vx_4 = vec_vsx_ld(offset_4, vptr_x);
+        register __vector float vx_5 = vec_vsx_ld(offset_5, vptr_x);
+        register __vector float vx_6 = vec_vsx_ld(offset_6, vptr_x);
+        register __vector float vx_7 = vec_vsx_ld(offset_7, vptr_x);
         vy_0 += vx_0 * valpha_r;
         vy_1 += vx_1 * valpha_r;
         vy_2 += vx_2 * valpha_r;
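Note on the load form above: vec_vsx_ld takes a byte offset plus a base pointer, which is why the new offset_N macros step by 16 (one 4-float vector). Presumably the point of the change is to replace implicit aligned vector-pointer dereferences with explicit base-plus-byte-offset VSX loads that also tolerate unaligned data. A minimal sketch of the equivalence, assuming a VSX-enabled compiler (e.g. gcc -mvsx); check_equiv is a hypothetical helper, not part of the patch:

/* Hedged illustration only: vec_vsx_ld(byte_offset, base) reads the same
   four floats as base[byte_offset / 16], but through a VSX load. */
#include <altivec.h>
#include <stdio.h>

static int check_equiv(const float *data)
{
    __vector float *vp = (__vector float *) data;
    __vector float a = vp[1];               /* old style: vector index 1   */
    __vector float b = vec_vsx_ld(16, vp);  /* new style: byte offset 16   */
    return vec_all_eq(a, b);                /* 1 when both read data[4..7] */
}

int main(void)
{
    static const float __attribute__((aligned(16))) data[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    printf("%d\n", check_equiv(data));      /* prints 1 */
    return 0;
}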
@@ -89,15 +99,17 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
         vy_5 += vx_5 * valpha_i;
         vy_6 += vx_6 * valpha_i;
         vy_7 += vx_7 * valpha_i;
-        vy[i] = vy_0;
-        vy[i + 1] = vy_1;
-        vy[i + 2] = vy_2;
-        vy[i + 3] = vy_3;
-        vy[i + 4] = vy_4;
-        vy[i + 5] = vy_5;
-        vy[i + 6] = vy_6;
-        vy[i + 7] = vy_7;
+        vec_vsx_st(vy_0, offset_0, vptr_y);
+        vec_vsx_st(vy_1, offset_1, vptr_y);
+        vec_vsx_st(vy_2, offset_2, vptr_y);
+        vec_vsx_st(vy_3, offset_3, vptr_y);
+        vec_vsx_st(vy_4, offset_4, vptr_y);
+        vec_vsx_st(vy_5, offset_5, vptr_y);
+        vec_vsx_st(vy_6, offset_6, vptr_y);
+        vec_vsx_st(vy_7, offset_7, vptr_y);
 
+        vptr_x += 8;
+        vptr_y += 8;
     }
 }
 #endif
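For reference, caxpy_kernel_16 computes the standard BLAS CAXPY update y += alpha * x over interleaved (re, im) single-precision pairs; each iteration handles eight 4-float vectors, i.e. 16 complex elements, matching the _16 suffix. The swap_mask loaded above is presumably applied with vec_perm in the part of the function not shown in this diff, so that the alpha_i term multiplies the swapped (im, re) lanes, and the preprocessor branch closed by the visible #endif presumably selects the sign pattern for the conjugated variants. A minimal scalar sketch of the plain (non-conjugated) case; caxpy_scalar_ref is a name chosen here for illustration only:

/* Hedged scalar reference, not part of the patch: per-element CAXPY on
   interleaved complex storage, plain (non-conjugated) variant. */
static void caxpy_scalar_ref(long n, const float *x, float *y,
                             float alpha_r, float alpha_i)
{
    for (long i = 0; i < n; i++) {
        float xr = x[2 * i], xi = x[2 * i + 1];
        y[2 * i]     += alpha_r * xr - alpha_i * xi;  /* real part      */
        y[2 * i + 1] += alpha_r * xi + alpha_i * xr;  /* imaginary part */
    }
}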