Skip to content

Commit 0db9c03

Browse files
authored
Merge pull request #2028 from brada4/mv
Move one of clobber fixes to right place
2 parents 343b301 + 6eee1be commit 0db9c03

File tree

2 files changed

+49
-296
lines changed

2 files changed

+49
-296
lines changed

dgemv_n_microk_piledriver-4.c

Lines changed: 0 additions & 247 deletions
This file was deleted.

kernel/x86_64/dgemv_n_microk_piledriver-4.c

Lines changed: 49 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
3838
__asm__ __volatile__
3939
(
4040
"vzeroupper \n\t"
41-
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
42-
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
43-
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
44-
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
45-
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
46-
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
47-
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
48-
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
41+
"vbroadcastsd (%3), %%ymm12 \n\t" // x0
42+
"vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
43+
"vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
44+
"vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
45+
"vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
46+
"vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
47+
"vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
48+
"vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
4949

5050
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
5151

5252
"testq $0x04, %1 \n\t"
5353
"jz 2f \n\t"
5454

55-
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
55+
"vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
5656
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
5757
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
5858

59-
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
60-
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
61-
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
62-
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
59+
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
60+
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
61+
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
62+
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
6363

64-
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
65-
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
66-
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
67-
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
64+
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
65+
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
66+
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
67+
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
6868

6969
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
7070
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
7171
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
7272

7373

74-
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
74+
"vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
7575

76-
"addq $4 , %8 \n\t"
76+
"addq $4 , %2 \n\t"
7777
"addq $4 , %0 \n\t"
7878
"subq $4 , %1 \n\t"
7979

@@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
8888

8989
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
9090
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
91-
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
92-
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
93-
94-
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
95-
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
96-
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
97-
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
98-
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
99-
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
100-
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
101-
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
102-
103-
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
91+
"vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
92+
"vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
93+
94+
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
95+
"vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
96+
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
97+
"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
98+
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
99+
"vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
100+
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
101+
"vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
102+
103+
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
104104
"addq $8 , %0 \n\t"
105-
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
106-
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
107-
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
108-
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
109-
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
110-
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
111-
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
105+
"vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
106+
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
107+
"vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
108+
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
109+
"vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
110+
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
111+
"vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
112112

113113
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
114114
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
115115

116-
"addq $8 , %8 \n\t"
116+
"addq $8 , %2 \n\t"
117117
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
118118
"subq $8 , %1 \n\t"
119-
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
119+
"vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
120120

121121
"jnz 1b \n\t"
122122

@@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
125125

126126
:
127127
"+r" (i), // 0
128-
"+r" (n) // 1
128+
"+r" (n), // 1
129+
"+r" (lda4) // 2
129130
:
130-
"r" (x), // 2
131-
"r" (y), // 3
132-
"r" (ap[0]), // 4
133-
"r" (ap[1]), // 5
134-
"r" (ap[2]), // 6
135-
"r" (ap[3]), // 7
136-
"r" (lda4), // 8
131+
"r" (x), // 3
132+
"r" (y), // 4
133+
"r" (ap[0]), // 5
134+
"r" (ap[1]), // 6
135+
"r" (ap[2]), // 7
136+
"r" (ap[3]), // 8
137137
"r" (alpha) // 9
138138
: "cc",
139139
"%xmm0", "%xmm1",

0 commit comments

Comments (0)