Skip to content

Commit f9bb76d

Browse files
authored
Fix inline assembly constraints in Bulldozer TRSM kernels
Rework the operand indices so that i, as and bs can be marked as both input and output (operand n1 is marked the same way for simplicity). For #2009
1 parent 69edc5b commit f9bb76d

File tree

5 files changed

+356
-356
lines changed

5 files changed

+356
-356
lines changed

kernel/x86_64/dtrsm_kernel_RT_bulldozer.c

Lines changed: 48 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
125125
" .align 16 \n\t"
126126
"1: \n\t"
127127

128-
" prefetcht0 384(%2,%1,8) \n\t"
129-
" prefetcht0 384(%3,%1,8) \n\t"
130-
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
131-
" vmovups (%2,%1,8), %%xmm4 \n\t"
132-
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
133-
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
134-
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
135-
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
128+
" prefetcht0 384(%6,%1,8) \n\t"
129+
" prefetcht0 384(%7,%1,8) \n\t"
130+
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
131+
" vmovups (%6,%1,8), %%xmm4 \n\t"
132+
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
133+
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
134+
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
135+
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
136136

137137
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
138138
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
147147

148148
" jz 2f \n\t"
149149

150-
" prefetcht0 384(%2,%1,8) \n\t"
151-
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
152-
" vmovups (%2,%1,8), %%xmm4 \n\t"
153-
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
154-
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
155-
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
156-
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
150+
" prefetcht0 384(%6,%1,8) \n\t"
151+
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
152+
" vmovups (%6,%1,8), %%xmm4 \n\t"
153+
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
154+
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
155+
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
156+
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
157157

158158
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
159159
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
168168

169169
" jz 2f \n\t"
170170

171-
" prefetcht0 384(%2,%1,8) \n\t"
172-
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
173-
" vmovups (%2,%1,8), %%xmm4 \n\t"
174-
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
175-
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
176-
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
177-
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
171+
" prefetcht0 384(%6,%1,8) \n\t"
172+
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
173+
" vmovups (%6,%1,8), %%xmm4 \n\t"
174+
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
175+
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
176+
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
177+
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
178178

179179
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
180180
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
189189

190190
" jz 2f \n\t"
191191

192-
" prefetcht0 384(%2,%1,8) \n\t"
193-
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
194-
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
195-
" vmovups (%2,%1,8), %%xmm4 \n\t"
196-
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
197-
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
198-
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
192+
" prefetcht0 384(%6,%1,8) \n\t"
193+
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
194+
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
195+
" vmovups (%6,%1,8), %%xmm4 \n\t"
196+
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
197+
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
198+
" vmovups 48(%6,%1,8), %%xmm7 \n\t"
199199

200200
" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
201201
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
235235

236236
"3: \n\t" // i = 1
237237

238-
" vmovddup (%7), %%xmm1 \n\t" // read b
239-
" vmovddup 8(%7), %%xmm0 \n\t" // read bb
238+
" vmovddup (%3), %%xmm1 \n\t" // read b
239+
" vmovddup 8(%3), %%xmm0 \n\t" // read bb
240240

241241
" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
242242
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
243243
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
244244
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb
245245

246-
" vmovups %%xmm12 , (%6) \n\t" // write a
247-
" vmovups %%xmm13 , 16(%6) \n\t" // write a
248-
" vmovups %%xmm14 , 32(%6) \n\t" // write a
249-
" vmovups %%xmm15 , 48(%6) \n\t" // write a
246+
" vmovups %%xmm12 , (%2) \n\t" // write a
247+
" vmovups %%xmm13 , 16(%2) \n\t" // write a
248+
" vmovups %%xmm14 , 32(%2) \n\t" // write a
249+
" vmovups %%xmm15 , 48(%2) \n\t" // write a
250250

251251
" vmovups %%xmm12 , (%5) \n\t" // write c1
252252
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
259259
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"
260260

261261
" \n\t" // i = 0
262-
" subq $16 , %7 \n\t" // b = b - 2
263-
" subq $64 , %6 \n\t" // a = a - 8
262+
" subq $16 , %3 \n\t" // b = b - 2
263+
" subq $64 , %2 \n\t" // a = a - 8
264264

265-
" vmovddup (%7), %%xmm0 \n\t" // read bb
265+
" vmovddup (%3), %%xmm0 \n\t" // read bb
266266

267267
" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
268268
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
269269
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
270270
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"
271271

272-
" vmovups %%xmm8 , (%6) \n\t" // write a
273-
" vmovups %%xmm9 , 16(%6) \n\t"
274-
" vmovups %%xmm10 , 32(%6) \n\t"
275-
" vmovups %%xmm11 , 48(%6) \n\t"
272+
" vmovups %%xmm8 , (%2) \n\t" // write a
273+
" vmovups %%xmm9 , 16(%2) \n\t"
274+
" vmovups %%xmm10 , 32(%2) \n\t"
275+
" vmovups %%xmm11 , 48(%2) \n\t"
276276

277277
" vmovups %%xmm8 , (%4) \n\t" // write c0
278278
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
282282
" vzeroupper \n\t"
283283

284284
:
285+
"+r" (n1), // 0
286+
"+a" (i), // 1
287+
"+r" (as), // 2
288+
"+r" (bs) // 3
285289
:
286-
"r" (n1), // 0
287-
"a" (i), // 1
288-
"r" (a), // 2
289-
"r" (b), // 3
290290
"r" (c), // 4
291291
"r" (c1), // 5
292-
"r" (as), // 6
293-
"r" (bs) // 7
292+
"r" (a), // 6
293+
"r" (b) // 7
294294
: "cc",
295295
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
296296
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

0 commit comments

Comments
 (0)