@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
  __asm__ __volatile__
  (
  "vzeroupper \n\t"
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
+ "vbroadcastss (%3), %%ymm12 \n\t" // x0
+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7

  "vbroadcastss (%9), %%ymm6 \n\t" // alpha

@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

  "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
  "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y

- "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
- "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
- "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
- "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
+ "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
+ "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
+ "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
+ "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
  "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
  "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
  "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
  "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"

- "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
- "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
- "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
- "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
+ "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
+ "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
+ "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
+ "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
  "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
  "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
  "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

  "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
  "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y

- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
- "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
- "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
+ "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
+ "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
  "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
  "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
  "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
  "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"

- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
- "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
- "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
+ "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
+ "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
  "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
  "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
  "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
  "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
  "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"

- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y

- "addq $8, %8 \n\t"
+ "addq $8, %2 \n\t"
  "addq $8, %0 \n\t"
  "subq $8, %1 \n\t"

@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
  "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
  "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"

- "prefetcht0 192(%4,%0,4) \n\t"
- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
  "prefetcht0 192(%5,%0,4) \n\t"
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
+ "prefetcht0 192(%6,%0,4) \n\t"
+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
+ "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
  "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
  "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
  "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
  "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"

- "prefetcht0 192(%6,%0,4) \n\t"
- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
  "prefetcht0 192(%7,%0,4) \n\t"
- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
+ "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
+ "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
+ "prefetcht0 192(%8,%0,4) \n\t"
+ "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
+ "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
  "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
  "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
  "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
  "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"

- "prefetcht0 192(%4,%8,4) \n\t"
- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
- "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
- "prefetcht0 192(%5,%8,4) \n\t"
- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
- "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
+ "prefetcht0 192(%5,%2,4) \n\t"
+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
+ "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
+ "prefetcht0 192(%6,%2,4) \n\t"
+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
+ "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
  "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
  "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
  "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
  "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"

- "prefetcht0 192(%6,%8,4) \n\t"
- "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
- "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
- "prefetcht0 192(%7,%8,4) \n\t"
- "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
- "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
+ "prefetcht0 192(%7,%2,4) \n\t"
+ "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
+ "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
+ "prefetcht0 192(%8,%2,4) \n\t"
+ "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
+ "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
  "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
  "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
  "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
  "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
  "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"

- "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
- "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
+ "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
+ "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y

- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y

- "addq $16, %8 \n\t"
+ "addq $16, %2 \n\t"
  "addq $16, %0 \n\t"
  "subq $16, %1 \n\t"
  "jnz 1b \n\t"
@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

  :
  "+r" (i), // 0
- "+r" (n) // 1
+ "+r" (n), // 1
+ "+r" (lda4) // 2
  :
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (lda4), // 8
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
  "r" (alpha) // 9
  : "cc",
  "%xmm0", "%xmm1",
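Note (not part of the patch): the asm body increments lda4 in place ("addq $4/$8/$16, ..."), so this commit moves lda4 out of the input list (old operand %8) and into the output list as a "+r" operand (new %2), renumbering x, y and ap[0..3] to %3..%8 to match. GCC's extended-asm rules do not allow an input-only operand to be modified, which is what the old constraints did. A minimal, self-contained sketch of the constraint pattern being applied, with made-up variable names, would be:

    #include <stdio.h>

    int main(void)
    {
        long counter = 0;   /* written by the asm, like lda4 and i in the kernel */
        long step    = 4;   /* only read by the asm */

        __asm__ __volatile__
        (
            "addq %1, %0 \n\t"     /* counter += step */
            : "+r" (counter)       /* %0: read-write, because the asm modifies it */
            : "r"  (step)          /* %1: read-only input */
            : "cc"
        );

        printf("%ld\n", counter); /* prints 4 */
        return 0;
    }

With the old constraints, an optimizing compiler is free to assume an input-only operand such as lda4 keeps its value across the asm statement even though the addq instructions change it; declaring it "+r" as above removes that assumption.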