@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
125
125
" .align 16 \n\t"
126
126
"1: \n\t"
127
127
128
- " prefetcht0 384(%2 ,%1,8) \n\t"
129
- " prefetcht0 384(%3 ,%1,8) \n\t"
130
- " vmovddup (%3 ,%1,2), %%xmm0 \n\t" // read b
131
- " vmovups (%2 ,%1,8), %%xmm4 \n\t"
132
- " vmovddup 8(%3 ,%1,2), %%xmm1 \n\t"
133
- " vmovups 16(%2 ,%1,8), %%xmm5 \n\t"
134
- " vmovups 32(%2 ,%1,8), %%xmm6 \n\t"
135
- " vmovups 48(%2 ,%1,8), %%xmm7 \n\t"
128
+ " prefetcht0 384(%6 ,%1,8) \n\t"
129
+ " prefetcht0 384(%7 ,%1,8) \n\t"
130
+ " vmovddup (%7 ,%1,2), %%xmm0 \n\t" // read b
131
+ " vmovups (%6 ,%1,8), %%xmm4 \n\t"
132
+ " vmovddup 8(%7 ,%1,2), %%xmm1 \n\t"
133
+ " vmovups 16(%6 ,%1,8), %%xmm5 \n\t"
134
+ " vmovups 32(%6 ,%1,8), %%xmm6 \n\t"
135
+ " vmovups 48(%6 ,%1,8), %%xmm7 \n\t"
136
136
137
137
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
138
138
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
147
147
148
148
" jz 2f \n\t"
149
149
150
- " prefetcht0 384(%2 ,%1,8) \n\t"
151
- " vmovddup (%3 ,%1,2), %%xmm0 \n\t" // read b
152
- " vmovups (%2 ,%1,8), %%xmm4 \n\t"
153
- " vmovddup 8(%3 ,%1,2), %%xmm1 \n\t"
154
- " vmovups 16(%2 ,%1,8), %%xmm5 \n\t"
155
- " vmovups 32(%2 ,%1,8), %%xmm6 \n\t"
156
- " vmovups 48(%2 ,%1,8), %%xmm7 \n\t"
150
+ " prefetcht0 384(%6 ,%1,8) \n\t"
151
+ " vmovddup (%7 ,%1,2), %%xmm0 \n\t" // read b
152
+ " vmovups (%6 ,%1,8), %%xmm4 \n\t"
153
+ " vmovddup 8(%7 ,%1,2), %%xmm1 \n\t"
154
+ " vmovups 16(%6 ,%1,8), %%xmm5 \n\t"
155
+ " vmovups 32(%6 ,%1,8), %%xmm6 \n\t"
156
+ " vmovups 48(%6 ,%1,8), %%xmm7 \n\t"
157
157
158
158
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
159
159
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
168
168
169
169
" jz 2f \n\t"
170
170
171
- " prefetcht0 384(%2 ,%1,8) \n\t"
172
- " vmovddup (%3 ,%1,2), %%xmm0 \n\t" // read b
173
- " vmovups (%2 ,%1,8), %%xmm4 \n\t"
174
- " vmovddup 8(%3 ,%1,2), %%xmm1 \n\t"
175
- " vmovups 16(%2 ,%1,8), %%xmm5 \n\t"
176
- " vmovups 32(%2 ,%1,8), %%xmm6 \n\t"
177
- " vmovups 48(%2 ,%1,8), %%xmm7 \n\t"
171
+ " prefetcht0 384(%6 ,%1,8) \n\t"
172
+ " vmovddup (%7 ,%1,2), %%xmm0 \n\t" // read b
173
+ " vmovups (%6 ,%1,8), %%xmm4 \n\t"
174
+ " vmovddup 8(%7 ,%1,2), %%xmm1 \n\t"
175
+ " vmovups 16(%6 ,%1,8), %%xmm5 \n\t"
176
+ " vmovups 32(%6 ,%1,8), %%xmm6 \n\t"
177
+ " vmovups 48(%6 ,%1,8), %%xmm7 \n\t"
178
178
179
179
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
180
180
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
189
189
190
190
" jz 2f \n\t"
191
191
192
- " prefetcht0 384(%2 ,%1,8) \n\t"
193
- " vmovddup (%3 ,%1,2), %%xmm0 \n\t" // read b
194
- " vmovddup 8(%3 ,%1,2), %%xmm1 \n\t"
195
- " vmovups (%2 ,%1,8), %%xmm4 \n\t"
196
- " vmovups 16(%2 ,%1,8), %%xmm5 \n\t"
197
- " vmovups 32(%2 ,%1,8), %%xmm6 \n\t"
198
- " vmovups 48(%2 ,%1,8), %%xmm7 \n\t"
192
+ " prefetcht0 384(%6 ,%1,8) \n\t"
193
+ " vmovddup (%7 ,%1,2), %%xmm0 \n\t" // read b
194
+ " vmovddup 8(%7 ,%1,2), %%xmm1 \n\t"
195
+ " vmovups (%6 ,%1,8), %%xmm4 \n\t"
196
+ " vmovups 16(%6 ,%1,8), %%xmm5 \n\t"
197
+ " vmovups 32(%6 ,%1,8), %%xmm6 \n\t"
198
+ " vmovups 48(%6 ,%1,8), %%xmm7 \n\t"
199
199
200
200
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
201
201
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
235
235
236
236
"3: \n\t" // i = 1
237
237
238
- " vmovddup (%7 ), %%xmm1 \n\t" // read b
239
- " vmovddup 8(%7 ), %%xmm0 \n\t" // read bb
238
+ " vmovddup (%3 ), %%xmm1 \n\t" // read b
239
+ " vmovddup 8(%3 ), %%xmm0 \n\t" // read bb
240
240
241
241
" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
242
242
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
243
243
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
244
244
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
245
245
246
- " vmovups %%xmm12 , (%6 ) \n\t" // write a
247
- " vmovups %%xmm13 , 16(%6 ) \n\t" // write a
248
- " vmovups %%xmm14 , 32(%6 ) \n\t" // write a
249
- " vmovups %%xmm15 , 48(%6 ) \n\t" // write a
246
+ " vmovups %%xmm12 , (%2 ) \n\t" // write a
247
+ " vmovups %%xmm13 , 16(%2 ) \n\t" // write a
248
+ " vmovups %%xmm14 , 32(%2 ) \n\t" // write a
249
+ " vmovups %%xmm15 , 48(%2 ) \n\t" // write a
250
250
251
251
" vmovups %%xmm12 , (%5) \n\t" // write c1
252
252
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
259
259
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
260
260
261
261
" \n\t" // i = 0
262
- " subq $16 , %7 \n\t" // b = b - 2
263
- " subq $64 , %6 \n\t" // a = a - 8
262
+ " subq $16 , %3 \n\t" // b = b - 2
263
+ " subq $64 , %2 \n\t" // a = a - 8
264
264
265
- " vmovddup (%7 ), %%xmm0 \n\t" // read bb
265
+ " vmovddup (%3 ), %%xmm0 \n\t" // read bb
266
266
267
267
" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
268
268
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
269
269
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
270
270
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"
271
271
272
- " vmovups %%xmm8 , (%6 ) \n\t" // write a
273
- " vmovups %%xmm9 , 16(%6 ) \n\t"
274
- " vmovups %%xmm10 , 32(%6 ) \n\t"
275
- " vmovups %%xmm11 , 48(%6 ) \n\t"
272
+ " vmovups %%xmm8 , (%2 ) \n\t" // write a
273
+ " vmovups %%xmm9 , 16(%2 ) \n\t"
274
+ " vmovups %%xmm10 , 32(%2 ) \n\t"
275
+ " vmovups %%xmm11 , 48(%2 ) \n\t"
276
276
277
277
" vmovups %%xmm8 , (%4) \n\t" // write c0
278
278
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
282
282
" vzeroupper \n\t"
283
283
284
284
:
285
+ "+r" (n1 ), // 0
286
+ "+a" (i ), // 1
287
+ "+r" (as ), // 2
288
+ "+r" (bs ) // 3
285
289
:
286
- "r" (n1 ), // 0
287
- "a" (i ), // 1
288
- "r" (a ), // 2
289
- "r" (b ), // 3
290
290
"r" (c ), // 4
291
291
"r" (c1 ), // 5
292
- "r" (as ), // 6
293
- "r" (bs ) // 7
292
+ "r" (a ), // 6
293
+ "r" (b ) // 7
294
294
: "cc" ,
295
295
"%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
296
296
"%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,
0 commit comments