@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

  __asm__ __volatile__
  (
- "movss    (%2), %%xmm12  \n\t"  // x0
- "movss   4(%2), %%xmm13  \n\t"  // x1
- "movss   8(%2), %%xmm14  \n\t"  // x2
- "movss  12(%2), %%xmm15  \n\t"  // x3
+ "movss    (%3), %%xmm12  \n\t"  // x0
+ "movss   4(%3), %%xmm13  \n\t"  // x1
+ "movss   8(%3), %%xmm14  \n\t"  // x2
+ "movss  12(%3), %%xmm15  \n\t"  // x3
  "shufps $0, %%xmm12, %%xmm12 \n\t"
  "shufps $0, %%xmm13, %%xmm13 \n\t"
  "shufps $0, %%xmm14, %%xmm14 \n\t"
  "shufps $0, %%xmm15, %%xmm15 \n\t"

- "movss  16(%2), %%xmm0   \n\t"  // x4
- "movss  20(%2), %%xmm1   \n\t"  // x5
- "movss  24(%2), %%xmm2   \n\t"  // x6
- "movss  28(%2), %%xmm3   \n\t"  // x7
+ "movss  16(%3), %%xmm0   \n\t"  // x4
+ "movss  20(%3), %%xmm1   \n\t"  // x5
+ "movss  24(%3), %%xmm2   \n\t"  // x6
+ "movss  28(%3), %%xmm3   \n\t"  // x7
  "shufps $0, %%xmm0 , %%xmm0  \n\t"
  "shufps $0, %%xmm1 , %%xmm1  \n\t"
  "shufps $0, %%xmm2 , %%xmm2  \n\t"
@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
  "1:                          \n\t"
  "xorps %%xmm4 , %%xmm4       \n\t"
  "xorps %%xmm5 , %%xmm5       \n\t"
- "movups (%3,%0,4), %%xmm7    \n\t"  // 4 * y
+ "movups (%4,%0,4), %%xmm7    \n\t"  // 4 * y

  ".p2align 1                  \n\t"
- "movups (%4,%0,4), %%xmm8    \n\t"
- "movups (%5,%0,4), %%xmm9    \n\t"
- "movups (%6,%0,4), %%xmm10   \n\t"
- "movups (%7,%0,4), %%xmm11   \n\t"
+ "movups (%5,%0,4), %%xmm8    \n\t"
+ "movups (%6,%0,4), %%xmm9    \n\t"
+ "movups (%7,%0,4), %%xmm10   \n\t"
+ "movups (%8,%0,4), %%xmm11   \n\t"
  ".p2align 1                  \n\t"
  "mulps %%xmm12, %%xmm8       \n\t"
  "mulps %%xmm13, %%xmm9       \n\t"
@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
  "addps %%xmm10, %%xmm4       \n\t"
  "addps %%xmm11, %%xmm5       \n\t"

- "movups (%4,%8,4), %%xmm8    \n\t"
- "movups (%5,%8,4), %%xmm9    \n\t"
- "movups (%6,%8,4), %%xmm10   \n\t"
- "movups (%7,%8,4), %%xmm11   \n\t"
+ "movups (%5,%2,4), %%xmm8    \n\t"
+ "movups (%6,%2,4), %%xmm9    \n\t"
+ "movups (%7,%2,4), %%xmm10   \n\t"
+ "movups (%8,%2,4), %%xmm11   \n\t"
  ".p2align 1                  \n\t"
  "mulps %%xmm0 , %%xmm8       \n\t"
  "mulps %%xmm1 , %%xmm9       \n\t"
@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
  "addps %%xmm10, %%xmm4       \n\t"
  "addps %%xmm11, %%xmm5       \n\t"

- "addq $4 , %8                \n\t"
+ "addq $4 , %2                \n\t"
  "addps %%xmm5 , %%xmm4       \n\t"
  "addq $4 , %0                \n\t"
  "mulps %%xmm6 , %%xmm4       \n\t"
  "subq $4 , %1                \n\t"
  "addps %%xmm4 , %%xmm7       \n\t"

- "movups %%xmm7 , -16(%3,%0,4) \n\t"  // 4 * y
+ "movups %%xmm7 , -16(%4,%0,4) \n\t"  // 4 * y

  "jnz 1b                      \n\t"

  :
  "+r" (i),      // 0
- "+r" (n)       // 1
+ "+r" (n),      // 1
+ "+r" (lda4)    // 2
  :
- "r" (x),       // 2
- "r" (y),       // 3
- "r" (ap[0]),   // 4
- "r" (ap[1]),   // 5
- "r" (ap[2]),   // 6
- "r" (ap[3]),   // 7
- "r" (lda4),    // 8
+ "r" (x),       // 3
+ "r" (y),       // 4
+ "r" (ap[0]),   // 5
+ "r" (ap[1]),   // 6
+ "r" (ap[2]),   // 7
+ "r" (ap[3]),   // 8
  "r" (alpha)    // 9
  : "cc",
  "%xmm0" , "%xmm1" ,
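Note: taken together, the hunks above renumber the operands of the inline-assembly block. The loop advances lda4 in place ("addq $4, %8" before the patch), yet lda4 was declared in the read-only input list, which promises the compiler the register is left unchanged; the patch moves it to the read-write output list as operand %2 ("+r") and shifts the remaining inputs down by one. Below is a minimal standalone sketch of that distinction, assuming x86-64 and GCC-style inline asm; the variable name counter is hypothetical and not from the kernel.

#include <stdio.h>

int main(void)
{
    long counter = 3;

    /* The asm body modifies its operand, so the operand must be
       declared as a read-write output ("+r"), as lda4 is after this
       patch. Listing it as a plain "r" input instead would tell the
       compiler the register still holds 3 afterwards, so later uses
       of counter could read a stale value. */
    __asm__ __volatile__
    (
        "addq $4, %0 \n\t"
        : "+r" (counter)   // read-write operand
        :
        : "cc"             // addq sets the flags
    );

    printf("%ld\n", counter);   /* prints 7 */
    return 0;
}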