@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
119
119
" cmpq $0, %0 \n\t"
120
120
" je 4f \n\t"
121
121
122
- " vmovups (%2 ,%1,4), %%ymm0 \n\t" // read a
123
- " vmovups (%3 ,%1,8), %%ymm1 \n\t" // read b0
124
- " vmovups 32(%3 ,%1,8), %%ymm2 \n\t" // read b1
122
+ " vmovups (%8 ,%1,4), %%ymm0 \n\t" // read a
123
+ " vmovups (%9 ,%1,8), %%ymm1 \n\t" // read b0
124
+ " vmovups 32(%9 ,%1,8), %%ymm2 \n\t" // read b1
125
125
126
126
127
127
" addq $8, %1 \n\t"
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
131
131
" .p2align 4 \n\t"
132
132
"1: \n\t"
133
133
134
- " vmovups (%2 ,%1,4), %%ymm4 \n\t" // read a
134
+ " vmovups (%8 ,%1,4), %%ymm4 \n\t" // read a
135
135
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
136
136
137
137
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
138
138
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
139
139
140
- " vmovups (%3 ,%1,8), %%ymm5 \n\t" // read b0
140
+ " vmovups (%9 ,%1,8), %%ymm5 \n\t" // read b0
141
141
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
142
142
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
143
143
144
144
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
145
- " vmovups 32(%3 ,%1,8), %%ymm6 \n\t" // read b1
145
+ " vmovups 32(%9 ,%1,8), %%ymm6 \n\t" // read b1
146
146
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
147
147
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
148
148
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
155
155
156
156
" jz 22f \n\t"
157
157
158
- " vmovups (%2 ,%1,4), %%ymm0 \n\t" // read a
158
+ " vmovups (%8 ,%1,4), %%ymm0 \n\t" // read a
159
159
160
160
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
161
161
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
162
162
163
163
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
164
- " vmovups (%3 ,%1,8), %%ymm1 \n\t" // read b0
164
+ " vmovups (%9 ,%1,8), %%ymm1 \n\t" // read b0
165
165
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
166
166
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
167
167
168
168
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
169
- " vmovups 32(%3 ,%1,8), %%ymm2 \n\t" // read b1
169
+ " vmovups 32(%9 ,%1,8), %%ymm2 \n\t" // read b1
170
170
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
171
171
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
172
172
@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
268
268
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
269
269
270
270
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
271
- " vmovups (%9 ), %%ymm0 \n\t"
271
+ " vmovups (%3 ), %%ymm0 \n\t"
272
272
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
273
273
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
274
274
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
278
278
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
279
279
280
280
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
281
- " vmovups 32(%9 ), %%ymm4 \n\t"
281
+ " vmovups 32(%3 ), %%ymm4 \n\t"
282
282
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
283
283
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
284
284
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
290
290
291
291
"5: \n\t" // i = 0
292
292
293
- " addq $64, %9 \n\t" // b=b+8
293
+ " addq $64, %3 \n\t" // b=b+8
294
294
295
295
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
296
- " vmovups (%9 ), %%ymm0 \n\t"
297
- " vmovups %%ymm8 , (%8 ) \n\t" // write a
296
+ " vmovups (%3 ), %%ymm0 \n\t"
297
+ " vmovups %%ymm8 , (%2 ) \n\t" // write a
298
298
" vmovups %%ymm8 , (%4) \n\t" // write c
299
299
300
300
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
301
- " vmovups 32(%9 ), %%ymm1 \n\t"
301
+ " vmovups 32(%3 ), %%ymm1 \n\t"
302
302
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
303
303
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
304
304
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
313
313
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
314
314
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
315
315
316
- " addq $64, %9 \n\t" // b=b+8
317
- " addq $32, %8 \n\t" // a=a+8
316
+ " addq $64, %3 \n\t" // b=b+8
317
+ " addq $32, %2 \n\t" // a=a+8
318
318
319
319
320
320
321
321
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
322
- " vmovups (%9 ), %%ymm0 \n\t"
323
- " vmovups 32(%9 ), %%ymm1 \n\t"
324
- " vmovups %%ymm9 , (%8 ) \n\t" // write a
322
+ " vmovups (%3 ), %%ymm0 \n\t"
323
+ " vmovups 32(%3 ), %%ymm1 \n\t"
324
+ " vmovups %%ymm9 , (%2 ) \n\t" // write a
325
325
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
326
326
327
327
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
337
337
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
338
338
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
339
339
340
- " addq $64, %9 \n\t" // b=b+8
341
- " addq $32, %8 \n\t" // a=a+8
340
+ " addq $64, %3 \n\t" // b=b+8
341
+ " addq $32, %2 \n\t" // a=a+8
342
342
343
343
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
344
- " vmovups (%9 ), %%ymm0 \n\t"
345
- " vmovups 32(%9 ), %%ymm1 \n\t"
346
- " vmovups %%ymm10, (%8 ) \n\t" // write a
344
+ " vmovups (%3 ), %%ymm0 \n\t"
345
+ " vmovups 32(%3 ), %%ymm1 \n\t"
346
+ " vmovups %%ymm10, (%2 ) \n\t" // write a
347
347
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
348
348
349
349
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
358
358
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
359
359
360
360
361
- " addq $64, %9 \n\t" // b=b+8
362
- " addq $32, %8 \n\t" // a=a+8
361
+ " addq $64, %3 \n\t" // b=b+8
362
+ " addq $32, %2 \n\t" // a=a+8
363
363
364
364
365
365
366
366
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
367
- " vmovups 32(%9 ), %%ymm1 \n\t"
368
- " vmovups %%ymm11, (%8 ) \n\t" // write a
367
+ " vmovups 32(%3 ), %%ymm1 \n\t"
368
+ " vmovups %%ymm11, (%2 ) \n\t" // write a
369
369
" vmovups %%ymm11, (%5) \n\t" // write c
370
370
371
371
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
378
378
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
379
379
380
380
381
- " addq $64, %9 \n\t" // b=b+8
382
- " addq $32, %8 \n\t" // a=a+8
381
+ " addq $64, %3 \n\t" // b=b+8
382
+ " addq $32, %2 \n\t" // a=a+8
383
383
384
384
385
385
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
386
- " vmovups 32(%9 ), %%ymm1 \n\t"
387
- " vmovups %%ymm12, (%8 ) \n\t" // write a
386
+ " vmovups 32(%3 ), %%ymm1 \n\t"
387
+ " vmovups %%ymm12, (%2 ) \n\t" // write a
388
388
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
389
389
390
390
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
394
394
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
395
395
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
396
396
397
- " addq $64, %9 \n\t" // b=b+8
398
- " addq $32, %8 \n\t" // a=a+8
397
+ " addq $64, %3 \n\t" // b=b+8
398
+ " addq $32, %2 \n\t" // a=a+8
399
399
400
400
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
401
- " vmovups 32(%9 ), %%ymm1 \n\t"
402
- " vmovups %%ymm13, (%8 ) \n\t" // write a
401
+ " vmovups 32(%3 ), %%ymm1 \n\t"
402
+ " vmovups %%ymm13, (%2 ) \n\t" // write a
403
403
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
404
404
405
405
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
408
408
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
409
409
410
410
411
- " addq $64, %9 \n\t" // b=b+8
412
- " addq $32, %8 \n\t" // a=a+8
411
+ " addq $64, %3 \n\t" // b=b+8
412
+ " addq $32, %2 \n\t" // a=a+8
413
413
414
414
415
415
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
416
- " vmovups 32(%9 ), %%ymm1 \n\t"
417
- " vmovups %%ymm14, (%8 ) \n\t" // write a
416
+ " vmovups 32(%3 ), %%ymm1 \n\t"
417
+ " vmovups %%ymm14, (%2 ) \n\t" // write a
418
418
" vmovups %%ymm14, (%6) \n\t" // write c
419
419
420
420
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
421
421
422
422
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
423
423
424
- " addq $32, %8 \n\t" // a=a+8
424
+ " addq $32, %2 \n\t" // a=a+8
425
425
426
426
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
427
- " vmovups %%ymm15, (%8 ) \n\t" // write a
427
+ " vmovups %%ymm15, (%2 ) \n\t" // write a
428
428
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
429
429
430
430
" vzeroupper \n\t"
431
431
432
432
:
433
+ "+r" (n1 ), // 0
434
+ "+a" (i ), // 1
435
+ "+r" (as ), // 2
436
+ "+r" (bs ) // 3
433
437
:
434
- "r" (n1 ), // 0
435
- "a" (i ), // 1
436
- "r" (a ), // 2
437
- "r" (b ), // 3
438
438
"r" (c ), // 4
439
439
"r" (c3 ), // 5
440
440
"r" (c6 ), // 6
441
441
"r" (ldc ), // 7
442
- "r" (as ), // 8
443
- "r" (bs ) // 9
442
+ "r" (a ), // 8
443
+ "r" (b ) // 9
444
444
: "cc" ,
445
445
"%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
446
446
"%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,
0 commit comments