Skip to content

Commit 1c6da2d

Browse files
authored
Merge pull request #2019 from martin-frbg/gcc9fixes
Fix unannounced modification of input operand 8 (lda4) in Haswell GEMVN microkernel
2 parents d3e4725 + 4255a58 commit 1c6da2d

File tree

1 file changed: 62 additions (+62) and 63 deletions (-63).

kernel/x86_64/sgemv_n_microk_haswell-4.c

Lines changed: 62 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
2727

2828

29-
3029
#define HAVE_KERNEL_4x8 1
3130
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
3231

@@ -38,41 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
3837
__asm__ __volatile__
3938
(
4039
"vzeroupper \n\t"
41-
"vbroadcastss (%2), %%ymm12 \n\t" // x0
42-
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
43-
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
44-
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
45-
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
46-
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
47-
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
48-
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
40+
"vbroadcastss (%3), %%ymm12 \n\t" // x0
41+
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
42+
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
43+
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
44+
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
45+
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
46+
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
47+
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7
4948

5049
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
5150

5251
"testq $0x04, %1 \n\t"
5352
"jz 2f \n\t"
5453

55-
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
54+
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
5655
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
5756
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
5857

59-
"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
60-
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
61-
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
62-
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
58+
"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
59+
"vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
60+
"vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
61+
"vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
6362

64-
"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
65-
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
66-
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
67-
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
63+
"vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
64+
"vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
65+
"vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
66+
"vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
6867

6968
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
7069
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
7170
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
7271

73-
"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
72+
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
7473

75-
"addq $4 , %8 \n\t"
74+
"addq $4 , %2 \n\t"
7675
"addq $4 , %0 \n\t"
7776
"subq $4 , %1 \n\t"
7877

@@ -81,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
8180
"testq $0x08, %1 \n\t"
8281
"jz 3f \n\t"
8382

84-
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
83+
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
8584
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
8685
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
8786

88-
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
89-
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
90-
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
91-
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
87+
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
88+
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
89+
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
90+
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
9291

93-
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
94-
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
95-
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
96-
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
92+
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
93+
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
94+
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
95+
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
9796

9897
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
9998
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
10099
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
101100

102101

103-
"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
102+
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
104103

105-
"addq $8 , %8 \n\t"
104+
"addq $8 , %2 \n\t"
106105
"addq $8 , %0 \n\t"
107106
"subq $8 , %1 \n\t"
108107

@@ -117,35 +116,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
117116

118117
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
119118
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
120-
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
121-
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
122-
123-
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
124-
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
125-
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
126-
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
127-
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
128-
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
129-
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
130-
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
131-
132-
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
119+
"vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
120+
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
121+
122+
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
123+
"vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
124+
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
125+
"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
126+
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
127+
"vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
128+
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
129+
"vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
130+
131+
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
133132
"addq $16, %0 \n\t"
134-
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
135-
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
136-
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
137-
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
138-
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
139-
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
140-
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
133+
"vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
134+
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
135+
"vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
136+
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
137+
"vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
138+
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
139+
"vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
141140

142141
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
143142
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
144143

145-
"addq $16, %8 \n\t"
146-
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
144+
"addq $16, %2 \n\t"
145+
"vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
147146
"subq $16, %1 \n\t"
148-
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
147+
"vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
149148

150149
"jnz 1b \n\t"
151150

@@ -154,15 +153,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
154153

155154
:
156155
"+r" (i), // 0
157-
"+r" (n) // 1
156+
"+r" (n), // 1
157+
"+r" (lda4) // 2
158158
:
159-
"r" (x), // 2
160-
"r" (y), // 3
161-
"r" (ap[0]), // 4
162-
"r" (ap[1]), // 5
163-
"r" (ap[2]), // 6
164-
"r" (ap[3]), // 7
165-
"r" (lda4), // 8
159+
"r" (x), // 3
160+
"r" (y), // 4
161+
"r" (ap[0]), // 5
162+
"r" (ap[1]), // 6
163+
"r" (ap[2]), // 7
164+
"r" (ap[3]), // 8
166165
"r" (alpha) // 9
167166
: "cc",
168167
"%xmm0", "%xmm1",
@@ -177,7 +176,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
177176
}
178177

179178

180-
181179
#define HAVE_KERNEL_4x4 1
182180
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
183181

@@ -196,6 +194,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
196194

197195
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
198196

197+
199198
"testq $0x04, %1 \n\t"
200199
"jz 2f \n\t"
201200

0 commit comments

Comments
 (0)