Skip to content

Commit 042e3c0

Browse files
authored
Merge pull request #3848 from bartoldeman/dscal-haswell-ymm
dscal: use ymm registers in Haswell microkernel
2 parents 0276307 + 5c3169e commit 042e3c0

File tree

1 file changed

+28 -54 lines changed

1 file changed

+28 -54 lines changed

kernel/x86_64/dscal_microk_haswell-2.c

Lines changed: 28 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -38,22 +38,18 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
3838

3939
__asm__ __volatile__
4040
(
41-
"vmovddup (%2), %%xmm0 \n\t" // alpha
41+
"vbroadcastsd (%2), %%ymm0 \n\t" // alpha
4242

4343
"addq $128, %1 \n\t"
4444

4545
"cmpq $0, %0 \n\t"
4646
"je 4f \n\t"
4747

48-
"vmulpd -128(%1), %%xmm0, %%xmm4 \n\t"
49-
"vmulpd -112(%1), %%xmm0, %%xmm5 \n\t"
50-
"vmulpd -96(%1), %%xmm0, %%xmm6 \n\t"
51-
"vmulpd -80(%1), %%xmm0, %%xmm7 \n\t"
48+
"vmulpd -128(%1), %%ymm0, %%ymm4 \n\t"
49+
"vmulpd -96(%1), %%ymm0, %%ymm5 \n\t"
5250

53-
"vmulpd -64(%1), %%xmm0, %%xmm8 \n\t"
54-
"vmulpd -48(%1), %%xmm0, %%xmm9 \n\t"
55-
"vmulpd -32(%1), %%xmm0, %%xmm10 \n\t"
56-
"vmulpd -16(%1), %%xmm0, %%xmm11 \n\t"
51+
"vmulpd -64(%1), %%ymm0, %%ymm6 \n\t"
52+
"vmulpd -32(%1), %%ymm0, %%ymm7 \n\t"
5753

5854
"subq $1 , %0 \n\t"
5955
"jz 2f \n\t"
@@ -62,26 +58,18 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
6258
"1: \n\t"
6359
// "prefetcht0 640(%1) \n\t"
6460

65-
"vmovups %%xmm4 ,-128(%1) \n\t"
66-
"vmovups %%xmm5 ,-112(%1) \n\t"
67-
"vmulpd 0(%1), %%xmm0, %%xmm4 \n\t"
68-
"vmovups %%xmm6 , -96(%1) \n\t"
69-
"vmulpd 16(%1), %%xmm0, %%xmm5 \n\t"
70-
"vmovups %%xmm7 , -80(%1) \n\t"
71-
"vmulpd 32(%1), %%xmm0, %%xmm6 \n\t"
61+
"vmovups %%ymm4 ,-128(%1) \n\t"
62+
"vmovups %%ymm5 , -96(%1) \n\t"
63+
"vmulpd 0(%1), %%ymm0, %%ymm4 \n\t"
7264

7365
// "prefetcht0 704(%1) \n\t"
7466

75-
"vmovups %%xmm8 , -64(%1) \n\t"
76-
"vmulpd 48(%1), %%xmm0, %%xmm7 \n\t"
77-
"vmovups %%xmm9 , -48(%1) \n\t"
78-
"vmulpd 64(%1), %%xmm0, %%xmm8 \n\t"
79-
"vmovups %%xmm10 , -32(%1) \n\t"
80-
"vmulpd 80(%1), %%xmm0, %%xmm9 \n\t"
81-
"vmovups %%xmm11 , -16(%1) \n\t"
67+
"vmovups %%ymm6 , -64(%1) \n\t"
68+
"vmulpd 32(%1), %%ymm0, %%ymm5 \n\t"
69+
"vmovups %%ymm7 , -32(%1) \n\t"
8270

83-
"vmulpd 96(%1), %%xmm0, %%xmm10 \n\t"
84-
"vmulpd 112(%1), %%xmm0, %%xmm11 \n\t"
71+
"vmulpd 64(%1), %%ymm0, %%ymm6 \n\t"
72+
"vmulpd 96(%1), %%ymm0, %%ymm7 \n\t"
8573

8674

8775
"addq $128, %1 \n\t"
@@ -90,15 +78,11 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
9078

9179
"2: \n\t"
9280

93-
"vmovups %%xmm4 ,-128(%1) \n\t"
94-
"vmovups %%xmm5 ,-112(%1) \n\t"
95-
"vmovups %%xmm6 , -96(%1) \n\t"
96-
"vmovups %%xmm7 , -80(%1) \n\t"
81+
"vmovups %%ymm4 ,-128(%1) \n\t"
82+
"vmovups %%ymm5 , -96(%1) \n\t"
9783

98-
"vmovups %%xmm8 , -64(%1) \n\t"
99-
"vmovups %%xmm9 , -48(%1) \n\t"
100-
"vmovups %%xmm10 , -32(%1) \n\t"
101-
"vmovups %%xmm11 , -16(%1) \n\t"
84+
"vmovups %%ymm6 , -64(%1) \n\t"
85+
"vmovups %%ymm7 , -32(%1) \n\t"
10286

10387
"addq $128, %1 \n\t"
10488

@@ -107,15 +91,11 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
10791
"cmpq $8 ,%3 \n\t"
10892
"jne 5f \n\t"
10993

110-
"vmulpd -128(%1), %%xmm0, %%xmm4 \n\t"
111-
"vmulpd -112(%1), %%xmm0, %%xmm5 \n\t"
112-
"vmulpd -96(%1), %%xmm0, %%xmm6 \n\t"
113-
"vmulpd -80(%1), %%xmm0, %%xmm7 \n\t"
94+
"vmulpd -128(%1), %%ymm0, %%ymm4 \n\t"
95+
"vmulpd -96(%1), %%ymm0, %%ymm5 \n\t"
11496

115-
"vmovups %%xmm4 ,-128(%1) \n\t"
116-
"vmovups %%xmm5 ,-112(%1) \n\t"
117-
"vmovups %%xmm6 , -96(%1) \n\t"
118-
"vmovups %%xmm7 , -80(%1) \n\t"
97+
"vmovups %%ymm4 ,-128(%1) \n\t"
98+
"vmovups %%ymm5 , -96(%1) \n\t"
11999

120100
"5: \n\t"
121101

@@ -149,7 +129,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
149129

150130
__asm__ __volatile__
151131
(
152-
"vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t"
132+
"vxorpd %%ymm0, %%ymm0 , %%ymm0 \n\t"
153133

154134
"addq $128, %1 \n\t"
155135

@@ -159,15 +139,11 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
159139
".p2align 4 \n\t"
160140
"1: \n\t"
161141

162-
"vmovups %%xmm0 ,-128(%1) \n\t"
163-
"vmovups %%xmm0 ,-112(%1) \n\t"
164-
"vmovups %%xmm0 , -96(%1) \n\t"
165-
"vmovups %%xmm0 , -80(%1) \n\t"
142+
"vmovups %%ymm0 , -128(%1) \n\t"
143+
"vmovups %%ymm0 , -96(%1) \n\t"
166144

167-
"vmovups %%xmm0 , -64(%1) \n\t"
168-
"vmovups %%xmm0 , -48(%1) \n\t"
169-
"vmovups %%xmm0 , -32(%1) \n\t"
170-
"vmovups %%xmm0 , -16(%1) \n\t"
145+
"vmovups %%ymm0 , -64(%1) \n\t"
146+
"vmovups %%ymm0 , -32(%1) \n\t"
171147

172148
"addq $128, %1 \n\t"
173149
"subq $1 , %0 \n\t"
@@ -178,10 +154,8 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
178154
"cmpq $8 ,%3 \n\t"
179155
"jne 4f \n\t"
180156

181-
"vmovups %%xmm0 ,-128(%1) \n\t"
182-
"vmovups %%xmm0 ,-112(%1) \n\t"
183-
"vmovups %%xmm0 , -96(%1) \n\t"
184-
"vmovups %%xmm0 , -80(%1) \n\t"
157+
"vmovups %%ymm0 ,-128(%1) \n\t"
158+
"vmovups %%ymm0 , -96(%1) \n\t"
185159

186160
"4: \n\t"
187161

0 commit comments

Comments (0)