Skip to content

Commit 9d8be15

Browse files
authored
Fix inline assembly constraints
Rework the asm operand indices so that argument lda4 can be marked as both input and output ("+r"), since the assembly modifies it. For #2009
1 parent 69edc5b commit 9d8be15

File tree

1 file changed

+27
-27
lines changed

1 file changed

+27
-27
lines changed

kernel/x86_64/sgemv_n_microk_nehalem-4.c

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
3737

3838
__asm__ __volatile__
3939
(
40-
"movss (%2), %%xmm12 \n\t" // x0
41-
"movss 4(%2), %%xmm13 \n\t" // x1
42-
"movss 8(%2), %%xmm14 \n\t" // x2
43-
"movss 12(%2), %%xmm15 \n\t" // x3
40+
"movss (%3), %%xmm12 \n\t" // x0
41+
"movss 4(%3), %%xmm13 \n\t" // x1
42+
"movss 8(%3), %%xmm14 \n\t" // x2
43+
"movss 12(%3), %%xmm15 \n\t" // x3
4444
"shufps $0, %%xmm12, %%xmm12\n\t"
4545
"shufps $0, %%xmm13, %%xmm13\n\t"
4646
"shufps $0, %%xmm14, %%xmm14\n\t"
4747
"shufps $0, %%xmm15, %%xmm15\n\t"
4848

49-
"movss 16(%2), %%xmm0 \n\t" // x4
50-
"movss 20(%2), %%xmm1 \n\t" // x5
51-
"movss 24(%2), %%xmm2 \n\t" // x6
52-
"movss 28(%2), %%xmm3 \n\t" // x7
49+
"movss 16(%3), %%xmm0 \n\t" // x4
50+
"movss 20(%3), %%xmm1 \n\t" // x5
51+
"movss 24(%3), %%xmm2 \n\t" // x6
52+
"movss 28(%3), %%xmm3 \n\t" // x7
5353
"shufps $0, %%xmm0 , %%xmm0 \n\t"
5454
"shufps $0, %%xmm1 , %%xmm1 \n\t"
5555
"shufps $0, %%xmm2 , %%xmm2 \n\t"
@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
6363
"1: \n\t"
6464
"xorps %%xmm4 , %%xmm4 \n\t"
6565
"xorps %%xmm5 , %%xmm5 \n\t"
66-
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
66+
"movups (%4,%0,4), %%xmm7 \n\t" // 4 * y
6767

6868
".p2align 1 \n\t"
69-
"movups (%4,%0,4), %%xmm8 \n\t"
70-
"movups (%5,%0,4), %%xmm9 \n\t"
71-
"movups (%6,%0,4), %%xmm10 \n\t"
72-
"movups (%7,%0,4), %%xmm11 \n\t"
69+
"movups (%5,%0,4), %%xmm8 \n\t"
70+
"movups (%6,%0,4), %%xmm9 \n\t"
71+
"movups (%7,%0,4), %%xmm10 \n\t"
72+
"movups (%8,%0,4), %%xmm11 \n\t"
7373
".p2align 1 \n\t"
7474
"mulps %%xmm12, %%xmm8 \n\t"
7575
"mulps %%xmm13, %%xmm9 \n\t"
@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
8080
"addps %%xmm10, %%xmm4 \n\t"
8181
"addps %%xmm11, %%xmm5 \n\t"
8282

83-
"movups (%4,%8,4), %%xmm8 \n\t"
84-
"movups (%5,%8,4), %%xmm9 \n\t"
85-
"movups (%6,%8,4), %%xmm10 \n\t"
86-
"movups (%7,%8,4), %%xmm11 \n\t"
83+
"movups (%5,%2,4), %%xmm8 \n\t"
84+
"movups (%6,%2,4), %%xmm9 \n\t"
85+
"movups (%7,%2,4), %%xmm10 \n\t"
86+
"movups (%8,%2,4), %%xmm11 \n\t"
8787
".p2align 1 \n\t"
8888
"mulps %%xmm0 , %%xmm8 \n\t"
8989
"mulps %%xmm1 , %%xmm9 \n\t"
@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
9494
"addps %%xmm10, %%xmm4 \n\t"
9595
"addps %%xmm11, %%xmm5 \n\t"
9696

97-
"addq $4 , %8 \n\t"
97+
"addq $4 , %2 \n\t"
9898
"addps %%xmm5 , %%xmm4 \n\t"
9999
"addq $4 , %0 \n\t"
100100
"mulps %%xmm6 , %%xmm4 \n\t"
101101
"subq $4 , %1 \n\t"
102102
"addps %%xmm4 , %%xmm7 \n\t"
103103

104-
"movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
104+
"movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y
105105

106106
"jnz 1b \n\t"
107107

108108
:
109109
"+r" (i), // 0
110-
"+r" (n) // 1
110+
"+r" (n), // 1
111+
"+r" (lda4) // 2
111112
:
112-
"r" (x), // 2
113-
"r" (y), // 3
114-
"r" (ap[0]), // 4
115-
"r" (ap[1]), // 5
116-
"r" (ap[2]), // 6
117-
"r" (ap[3]), // 7
118-
"r" (lda4), // 8
113+
"r" (x), // 3
114+
"r" (y), // 4
115+
"r" (ap[0]), // 5
116+
"r" (ap[1]), // 6
117+
"r" (ap[2]), // 7
118+
"r" (ap[3]), // 8
119119
"r" (alpha) // 9
120120
: "cc",
121121
"%xmm0", "%xmm1",

0 commit comments

Comments
 (0)