Skip to content

Commit e12cdf5

Browse files
authored
Merge pull request #2024 from martin-frbg/gcc9fixes4
Fix inline assembly constraints in Bulldozer TRSM kernels
2 parents: 1860c94 + f9bb76d; commit: e12cdf5

File tree

5 files changed

+356
-356
lines changed

5 files changed

+356
-356
lines changed

kernel/x86_64/dtrsm_kernel_RT_bulldozer.c

Lines changed: 48 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
125125
" .align 16 \n\t"
126126
"1: \n\t"
127127

128-
" prefetcht0 384(%2,%1,8) \n\t"
129-
" prefetcht0 384(%3,%1,8) \n\t"
130-
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
131-
" vmovups (%2,%1,8), %%xmm4 \n\t"
132-
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
133-
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
134-
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
135-
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
128+
" prefetcht0 384(%6,%1,8) \n\t"
129+
" prefetcht0 384(%7,%1,8) \n\t"
130+
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
131+
" vmovups (%6,%1,8), %%xmm4 \n\t"
132+
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
133+
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
134+
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
135+
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
136136

137137
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
138138
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
147147

148148
" jz 2f \n\t"
149149

150-
" prefetcht0 384(%2,%1,8) \n\t"
151-
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
152-
" vmovups (%2,%1,8), %%xmm4 \n\t"
153-
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
154-
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
155-
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
156-
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
150+
" prefetcht0 384(%6,%1,8) \n\t"
151+
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
152+
" vmovups (%6,%1,8), %%xmm4 \n\t"
153+
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
154+
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
155+
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
156+
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
157157

158158
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
159159
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
168168

169169
" jz 2f \n\t"
170170

171-
" prefetcht0 384(%2,%1,8) \n\t"
172-
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
173-
" vmovups (%2,%1,8), %%xmm4 \n\t"
174-
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
175-
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
176-
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
177-
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
171+
" prefetcht0 384(%6,%1,8) \n\t"
172+
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
173+
" vmovups (%6,%1,8), %%xmm4 \n\t"
174+
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
175+
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
176+
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
177+
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
178178

179179
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
180180
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
189189

190190
" jz 2f \n\t"
191191

192-
" prefetcht0 384(%2,%1,8) \n\t"
193-
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
194-
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
195-
" vmovups (%2,%1,8), %%xmm4 \n\t"
196-
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
197-
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
198-
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
192+
" prefetcht0 384(%6,%1,8) \n\t"
193+
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
194+
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
195+
" vmovups (%6,%1,8), %%xmm4 \n\t"
196+
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
197+
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
198+
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
199199

200200
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
201201
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
235235

236236
"3: \n\t" // i = 1
237237

238-
" vmovddup (%7), %%xmm1 \n\t" // read b
239-
" vmovddup 8(%7), %%xmm0 \n\t" // read bb
238+
" vmovddup (%3), %%xmm1 \n\t" // read b
239+
" vmovddup 8(%3), %%xmm0 \n\t" // read bb
240240

241241
" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
242242
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
243243
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
244244
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
245245

246-
" vmovups %%xmm12 , (%6) \n\t" // write a
247-
" vmovups %%xmm13 , 16(%6) \n\t" // write a
248-
" vmovups %%xmm14 , 32(%6) \n\t" // write a
249-
" vmovups %%xmm15 , 48(%6) \n\t" // write a
246+
" vmovups %%xmm12 , (%2) \n\t" // write a
247+
" vmovups %%xmm13 , 16(%2) \n\t" // write a
248+
" vmovups %%xmm14 , 32(%2) \n\t" // write a
249+
" vmovups %%xmm15 , 48(%2) \n\t" // write a
250250

251251
" vmovups %%xmm12 , (%5) \n\t" // write c1
252252
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
259259
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
260260

261261
" \n\t" // i = 0
262-
" subq $16 , %7 \n\t" // b = b - 2
263-
" subq $64 , %6 \n\t" // a = a - 8
262+
" subq $16 , %3 \n\t" // b = b - 2
263+
" subq $64 , %2 \n\t" // a = a - 8
264264

265-
" vmovddup (%7), %%xmm0 \n\t" // read bb
265+
" vmovddup (%3), %%xmm0 \n\t" // read bb
266266

267267
" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
268268
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
269269
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
270270
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"
271271

272-
" vmovups %%xmm8 , (%6) \n\t" // write a
273-
" vmovups %%xmm9 , 16(%6) \n\t"
274-
" vmovups %%xmm10 , 32(%6) \n\t"
275-
" vmovups %%xmm11 , 48(%6) \n\t"
272+
" vmovups %%xmm8 , (%2) \n\t" // write a
273+
" vmovups %%xmm9 , 16(%2) \n\t"
274+
" vmovups %%xmm10 , 32(%2) \n\t"
275+
" vmovups %%xmm11 , 48(%2) \n\t"
276276

277277
" vmovups %%xmm8 , (%4) \n\t" // write c0
278278
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
282282
" vzeroupper \n\t"
283283

284284
:
285+
"+r" (n1), // 0
286+
"+a" (i), // 1
287+
"+r" (as), // 2
288+
"+r" (bs) // 3
285289
:
286-
"r" (n1), // 0
287-
"a" (i), // 1
288-
"r" (a), // 2
289-
"r" (b), // 3
290290
"r" (c), // 4
291291
"r" (c1), // 5
292-
"r" (as), // 6
293-
"r" (bs) // 7
292+
"r" (a), // 6
293+
"r" (b) // 7
294294
: "cc",
295295
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
296296
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

0 commit comments

Comments
 (0)