Skip to content

Commit d752799

Browse files
authored
Merge pull request #2021 from martin-frbg/gcc9fixes2
Fix wrong constraints in inline assembly of Haswell DTRSM kernel
2 parents (1c6da2d + c26c0b7), commit d752799

File tree

1 file changed

+49
-49
lines changed

1 file changed

+49
-49
lines changed

kernel/x86_64/dtrsm_kernel_RN_haswell.c

Lines changed: 49 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
119119
" cmpq $0, %0 \n\t"
120120
" je 4f \n\t"
121121

122-
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
123-
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
124-
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
122+
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
123+
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
124+
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
125125

126126

127127
" addq $8, %1 \n\t"
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
131131
" .p2align 4 \n\t"
132132
"1: \n\t"
133133

134-
" vmovups (%2,%1,4), %%ymm4 \n\t" // read a
134+
" vmovups (%8,%1,4), %%ymm4 \n\t" // read a
135135
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
136136

137137
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
138138
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
139139

140-
" vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
140+
" vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
141141
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
142142
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
143143

144144
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
145-
" vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
145+
" vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
146146
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
147147
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
148148
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
155155

156156
" jz 22f \n\t"
157157

158-
" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
158+
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
159159

160160
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
161161
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
162162

163163
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
164-
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
164+
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
165165
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
166166
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
167167

168168
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
169-
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
169+
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
170170
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
171171
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
172172

@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
268268
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
269269

270270
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
271-
" vmovups (%9), %%ymm0 \n\t"
271+
" vmovups (%3), %%ymm0 \n\t"
272272
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
273273
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
274274
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
278278
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
279279

280280
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
281-
" vmovups 32(%9), %%ymm4 \n\t"
281+
" vmovups 32(%3), %%ymm4 \n\t"
282282
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
283283
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
284284
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
290290

291291
"5: \n\t" // i = 0
292292

293-
" addq $64, %9 \n\t" // b=b+8
293+
" addq $64, %3 \n\t" // b=b+8
294294

295295
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
296-
" vmovups (%9), %%ymm0 \n\t"
297-
" vmovups %%ymm8 , (%8) \n\t" // write a
296+
" vmovups (%3), %%ymm0 \n\t"
297+
" vmovups %%ymm8 , (%2) \n\t" // write a
298298
" vmovups %%ymm8 , (%4) \n\t" // write c
299299

300300
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
301-
" vmovups 32(%9), %%ymm1 \n\t"
301+
" vmovups 32(%3), %%ymm1 \n\t"
302302
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
303303
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
304304
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
313313
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
314314
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
315315

316-
" addq $64, %9 \n\t" // b=b+8
317-
" addq $32, %8 \n\t" // a=a+8
316+
" addq $64, %3 \n\t" // b=b+8
317+
" addq $32, %2 \n\t" // a=a+8
318318

319319

320320

321321
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
322-
" vmovups (%9), %%ymm0 \n\t"
323-
" vmovups 32(%9), %%ymm1 \n\t"
324-
" vmovups %%ymm9 , (%8) \n\t" // write a
322+
" vmovups (%3), %%ymm0 \n\t"
323+
" vmovups 32(%3), %%ymm1 \n\t"
324+
" vmovups %%ymm9 , (%2) \n\t" // write a
325325
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
326326

327327
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
337337
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
338338
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
339339

340-
" addq $64, %9 \n\t" // b=b+8
341-
" addq $32, %8 \n\t" // a=a+8
340+
" addq $64, %3 \n\t" // b=b+8
341+
" addq $32, %2 \n\t" // a=a+8
342342

343343
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
344-
" vmovups (%9), %%ymm0 \n\t"
345-
" vmovups 32(%9), %%ymm1 \n\t"
346-
" vmovups %%ymm10, (%8) \n\t" // write a
344+
" vmovups (%3), %%ymm0 \n\t"
345+
" vmovups 32(%3), %%ymm1 \n\t"
346+
" vmovups %%ymm10, (%2) \n\t" // write a
347347
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
348348

349349
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
358358
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
359359

360360

361-
" addq $64, %9 \n\t" // b=b+8
362-
" addq $32, %8 \n\t" // a=a+8
361+
" addq $64, %3 \n\t" // b=b+8
362+
" addq $32, %2 \n\t" // a=a+8
363363

364364

365365

366366
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
367-
" vmovups 32(%9), %%ymm1 \n\t"
368-
" vmovups %%ymm11, (%8) \n\t" // write a
367+
" vmovups 32(%3), %%ymm1 \n\t"
368+
" vmovups %%ymm11, (%2) \n\t" // write a
369369
" vmovups %%ymm11, (%5) \n\t" // write c
370370

371371
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
378378
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
379379

380380

381-
" addq $64, %9 \n\t" // b=b+8
382-
" addq $32, %8 \n\t" // a=a+8
381+
" addq $64, %3 \n\t" // b=b+8
382+
" addq $32, %2 \n\t" // a=a+8
383383

384384

385385
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
386-
" vmovups 32(%9), %%ymm1 \n\t"
387-
" vmovups %%ymm12, (%8) \n\t" // write a
386+
" vmovups 32(%3), %%ymm1 \n\t"
387+
" vmovups %%ymm12, (%2) \n\t" // write a
388388
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
389389

390390
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
394394
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
395395
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
396396

397-
" addq $64, %9 \n\t" // b=b+8
398-
" addq $32, %8 \n\t" // a=a+8
397+
" addq $64, %3 \n\t" // b=b+8
398+
" addq $32, %2 \n\t" // a=a+8
399399

400400
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
401-
" vmovups 32(%9), %%ymm1 \n\t"
402-
" vmovups %%ymm13, (%8) \n\t" // write a
401+
" vmovups 32(%3), %%ymm1 \n\t"
402+
" vmovups %%ymm13, (%2) \n\t" // write a
403403
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
404404

405405
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
408408
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
409409

410410

411-
" addq $64, %9 \n\t" // b=b+8
412-
" addq $32, %8 \n\t" // a=a+8
411+
" addq $64, %3 \n\t" // b=b+8
412+
" addq $32, %2 \n\t" // a=a+8
413413

414414

415415
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
416-
" vmovups 32(%9), %%ymm1 \n\t"
417-
" vmovups %%ymm14, (%8) \n\t" // write a
416+
" vmovups 32(%3), %%ymm1 \n\t"
417+
" vmovups %%ymm14, (%2) \n\t" // write a
418418
" vmovups %%ymm14, (%6) \n\t" // write c
419419

420420
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
421421

422422
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
423423

424-
" addq $32, %8 \n\t" // a=a+8
424+
" addq $32, %2 \n\t" // a=a+8
425425

426426
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
427-
" vmovups %%ymm15, (%8) \n\t" // write a
427+
" vmovups %%ymm15, (%2) \n\t" // write a
428428
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
429429

430430
" vzeroupper \n\t"
431431

432432
:
433+
"+r" (n1), // 0
434+
"+a" (i), // 1
435+
"+r" (as), // 2
436+
"+r" (bs) // 3
433437
:
434-
"r" (n1), // 0
435-
"a" (i), // 1
436-
"r" (a), // 2
437-
"r" (b), // 3
438438
"r" (c), // 4
439439
"r" (c3), // 5
440440
"r" (c6), // 6
441441
"r" (ldc), // 7
442-
"r" (as), // 8
443-
"r" (bs) // 9
442+
"r" (a), // 8
443+
"r" (b) // 9
444444
: "cc",
445445
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
446446
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

0 commit comments

Comments
 (0)