@@ -268,30 +268,30 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x)
268
268
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
269
269
; GFX10-NEXT: v_lshrrev_b16 v6, 8, v1
270
270
; GFX10-NEXT: ds_write_b8 v0, v1
271
- ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
272
- ; GFX10-NEXT: v_lshrrev_b16 v8, 8, v2
273
- ; GFX10-NEXT: v_lshrrev_b16 v7, 8, v5
271
+ ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
272
+ ; GFX10-NEXT: v_lshrrev_b16 v7, 8, v2
273
+ ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v5
274
+ ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2
274
275
; GFX10-NEXT: ds_write_b8 v0, v2 offset:4
275
276
; GFX10-NEXT: ds_write_b8 v0, v6 offset:1
276
- ; GFX10-NEXT: ds_write_b8 v0, v5 offset:2
277
- ; GFX10-NEXT: ds_write_b8 v0, v7 offset:3
278
- ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v1
279
- ; GFX10-NEXT: ds_write_b8 v0, v8 offset:5
280
- ; GFX10-NEXT: ds_write_b8 v0, v1 offset:6
281
- ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v3
277
+ ; GFX10-NEXT: ds_write_b8 v0, v1 offset:3
278
+ ; GFX10-NEXT: ds_write_b8 v0, v7 offset:5
279
+ ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
280
+ ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3
281
+ ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v5
282
282
; GFX10-NEXT: v_lshrrev_b16 v5, 8, v3
283
- ; GFX10-NEXT: ds_write_b8 v0, v2 offset:7
283
+ ; GFX10-NEXT: ds_write_b8 v0, v1 offset:7
284
284
; GFX10-NEXT: ds_write_b8 v0, v3 offset:8
285
- ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4
286
- ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v1
285
+ ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4
286
+ ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v2
287
287
; GFX10-NEXT: ds_write_b8 v0, v5 offset:9
288
288
; GFX10-NEXT: v_lshrrev_b16 v5, 8, v4
289
- ; GFX10-NEXT: ds_write_b8 v0, v1 offset:10
290
- ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v2
291
- ; GFX10-NEXT: ds_write_b8 v0, v3 offset:11
289
+ ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:10
290
+ ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v1
291
+ ; GFX10-NEXT: ds_write_b8 v0, v2 offset:11
292
292
; GFX10-NEXT: ds_write_b8 v0, v4 offset:12
293
293
; GFX10-NEXT: ds_write_b8 v0, v5 offset:13
294
- ; GFX10-NEXT: ds_write_b8 v0, v2 offset:14
294
+ ; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:14
295
295
; GFX10-NEXT: ds_write_b8 v0, v1 offset:15
296
296
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
297
297
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -342,24 +342,24 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
342
342
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v1
343
343
; GFX10-NEXT: v_lshrrev_b16 v5, 8, v1
344
344
; GFX10-NEXT: ds_write_b8 v0, v1
345
+ ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
345
346
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
347
+ ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
348
+ ; GFX10-NEXT: v_lshrrev_b16 v4, 8, v4
346
349
; GFX10-NEXT: v_lshrrev_b16 v6, 8, v2
347
- ; GFX10-NEXT: v_lshrrev_b16 v7, 8, v4
348
350
; GFX10-NEXT: ds_write_b8 v0, v2 offset:4
349
- ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3
351
+ ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v1
350
352
; GFX10-NEXT: ds_write_b8 v0, v5 offset:1
351
- ; GFX10-NEXT: ds_write_b8 v0, v4 offset:2
352
- ; GFX10-NEXT: ds_write_b8 v0, v7 offset:3
353
- ; GFX10-NEXT: v_lshrrev_b16 v4, 8, v1
354
- ; GFX10-NEXT: v_lshrrev_b16 v5, 8, v3
355
- ; GFX10-NEXT: ds_write_b8 v0, v1 offset:6
356
- ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v2
353
+ ; GFX10-NEXT: ds_write_b8 v0, v4 offset:3
354
+ ; GFX10-NEXT: v_lshrrev_b16 v4, 8, v3
355
+ ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
356
+ ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v7
357
357
; GFX10-NEXT: ds_write_b8 v0, v6 offset:5
358
- ; GFX10-NEXT: ds_write_b8 v0, v4 offset:7
358
+ ; GFX10-NEXT: ds_write_b8 v0, v1 offset:7
359
359
; GFX10-NEXT: ds_write_b8 v0, v3 offset:8
360
- ; GFX10-NEXT: ds_write_b8 v0, v5 offset:9
361
- ; GFX10-NEXT: ds_write_b8 v0, v2 offset:10
362
- ; GFX10-NEXT: ds_write_b8 v0, v1 offset:11
360
+ ; GFX10-NEXT: ds_write_b8 v0, v4 offset:9
361
+ ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:10
362
+ ; GFX10-NEXT: ds_write_b8 v0, v2 offset:11
363
363
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
364
364
; GFX10-NEXT: s_setpc_b64 s[30:31]
365
365
store <3 x i32 > %x , <3 x i32 > addrspace (3 )* %out , align 1
0 commit comments