@@ -38,22 +38,18 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
38
38
39
39
__asm__ __volatile__
40
40
(
41
- "vmovddup (%2), %%xmm0 \n\t" // alpha
41
+ "vbroadcastsd (%2), %%ymm0 \n\t" // alpha
42
42
43
43
"addq $128, %1 \n\t"
44
44
45
45
"cmpq $0, %0 \n\t"
46
46
"je 4f \n\t"
47
47
48
- "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t"
49
- "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t"
50
- "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t"
51
- "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t"
48
+ "vmulpd -128(%1), %%ymm0, %%ymm4 \n\t"
49
+ "vmulpd -96(%1), %%ymm0, %%ymm5 \n\t"
52
50
53
- "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t"
54
- "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t"
55
- "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t"
56
- "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t"
51
+ "vmulpd -64(%1), %%ymm0, %%ymm6 \n\t"
52
+ "vmulpd -32(%1), %%ymm0, %%ymm7 \n\t"
57
53
58
54
"subq $1 , %0 \n\t"
59
55
"jz 2f \n\t"
@@ -62,26 +58,18 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
62
58
"1: \n\t"
63
59
// "prefetcht0 640(%1) \n\t"
64
60
65
- "vmovups %%xmm4 ,-128(%1) \n\t"
66
- "vmovups %%xmm5 ,-112(%1) \n\t"
67
- "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t"
68
- "vmovups %%xmm6 , -96(%1) \n\t"
69
- "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t"
70
- "vmovups %%xmm7 , -80(%1) \n\t"
71
- "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t"
61
+ "vmovups %%ymm4 ,-128(%1) \n\t"
62
+ "vmovups %%ymm5 , -96(%1) \n\t"
63
+ "vmulpd 0(%1), %%ymm0, %%ymm4 \n\t"
72
64
73
65
// "prefetcht0 704(%1) \n\t"
74
66
75
- "vmovups %%xmm8 , -64(%1) \n\t"
76
- "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t"
77
- "vmovups %%xmm9 , -48(%1) \n\t"
78
- "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t"
79
- "vmovups %%xmm10 , -32(%1) \n\t"
80
- "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t"
81
- "vmovups %%xmm11 , -16(%1) \n\t"
67
+ "vmovups %%ymm6 , -64(%1) \n\t"
68
+ "vmulpd 32(%1), %%ymm0, %%ymm5 \n\t"
69
+ "vmovups %%ymm7 , -32(%1) \n\t"
82
70
83
- "vmulpd 96 (%1), %%xmm0 , %%xmm10 \n\t"
84
- "vmulpd 112 (%1), %%xmm0 , %%xmm11 \n\t"
71
+ "vmulpd 64 (%1), %%ymm0 , %%ymm6 \n\t"
72
+ "vmulpd 96 (%1), %%ymm0 , %%ymm7 \n\t"
85
73
86
74
87
75
"addq $128, %1 \n\t"
@@ -90,15 +78,11 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
90
78
91
79
"2: \n\t"
92
80
93
- "vmovups %%xmm4 ,-128(%1) \n\t"
94
- "vmovups %%xmm5 ,-112(%1) \n\t"
95
- "vmovups %%xmm6 , -96(%1) \n\t"
96
- "vmovups %%xmm7 , -80(%1) \n\t"
81
+ "vmovups %%ymm4 ,-128(%1) \n\t"
82
+ "vmovups %%ymm5 , -96(%1) \n\t"
97
83
98
- "vmovups %%xmm8 , -64(%1) \n\t"
99
- "vmovups %%xmm9 , -48(%1) \n\t"
100
- "vmovups %%xmm10 , -32(%1) \n\t"
101
- "vmovups %%xmm11 , -16(%1) \n\t"
84
+ "vmovups %%ymm6 , -64(%1) \n\t"
85
+ "vmovups %%ymm7 , -32(%1) \n\t"
102
86
103
87
"addq $128, %1 \n\t"
104
88
@@ -107,15 +91,11 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
107
91
"cmpq $8 ,%3 \n\t"
108
92
"jne 5f \n\t"
109
93
110
- "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t"
111
- "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t"
112
- "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t"
113
- "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t"
94
+ "vmulpd -128(%1), %%ymm0, %%ymm4 \n\t"
95
+ "vmulpd -96(%1), %%ymm0, %%ymm5 \n\t"
114
96
115
- "vmovups %%xmm4 ,-128(%1) \n\t"
116
- "vmovups %%xmm5 ,-112(%1) \n\t"
117
- "vmovups %%xmm6 , -96(%1) \n\t"
118
- "vmovups %%xmm7 , -80(%1) \n\t"
97
+ "vmovups %%ymm4 ,-128(%1) \n\t"
98
+ "vmovups %%ymm5 , -96(%1) \n\t"
119
99
120
100
"5: \n\t"
121
101
@@ -149,7 +129,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
149
129
150
130
__asm__ __volatile__
151
131
(
152
- "vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t"
132
+ "vxorpd %%ymm0 , %%ymm0 , %%ymm0 \n\t"
153
133
154
134
"addq $128, %1 \n\t"
155
135
@@ -159,15 +139,11 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
159
139
".p2align 4 \n\t"
160
140
"1: \n\t"
161
141
162
- "vmovups %%xmm0 ,-128(%1) \n\t"
163
- "vmovups %%xmm0 ,-112(%1) \n\t"
164
- "vmovups %%xmm0 , -96(%1) \n\t"
165
- "vmovups %%xmm0 , -80(%1) \n\t"
142
+ "vmovups %%ymm0 , -128(%1) \n\t"
143
+ "vmovups %%ymm0 , -96(%1) \n\t"
166
144
167
- "vmovups %%xmm0 , -64(%1) \n\t"
168
- "vmovups %%xmm0 , -48(%1) \n\t"
169
- "vmovups %%xmm0 , -32(%1) \n\t"
170
- "vmovups %%xmm0 , -16(%1) \n\t"
145
+ "vmovups %%ymm0 , -64(%1) \n\t"
146
+ "vmovups %%ymm0 , -32(%1) \n\t"
171
147
172
148
"addq $128, %1 \n\t"
173
149
"subq $1 , %0 \n\t"
@@ -178,10 +154,8 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
178
154
"cmpq $8 ,%3 \n\t"
179
155
"jne 4f \n\t"
180
156
181
- "vmovups %%xmm0 ,-128(%1) \n\t"
182
- "vmovups %%xmm0 ,-112(%1) \n\t"
183
- "vmovups %%xmm0 , -96(%1) \n\t"
184
- "vmovups %%xmm0 , -80(%1) \n\t"
157
+ "vmovups %%ymm0 ,-128(%1) \n\t"
158
+ "vmovups %%ymm0 , -96(%1) \n\t"
185
159
186
160
"4: \n\t"
187
161
0 commit comments