@@ -4295,15 +4295,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4295
4295
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
4296
4296
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
4297
4297
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
4298
- ; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4 , 0x200, v4
4298
+ ; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6 , 0x200, v4
4299
4299
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
4300
4300
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4301
- ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6 , 3, v4
4302
- ; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4
4303
- ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8 , 3, v6
4301
+ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4 , 3, v6
4302
+ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6
4303
+ ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4 , 3, v4
4304
4304
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4305
- ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8 , 0xffff
4306
- ; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v6
4305
+ ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4 , 0xffff
4306
+ ; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7
4307
4307
; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
4308
4308
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
4309
4309
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4317,30 +4317,29 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4317
4317
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
4318
4318
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
4319
4319
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4320
- ; GFX12-TRUE16-NEXT: buffer_load_b32 v4 , v9, s[4:7], null offen
4320
+ ; GFX12-TRUE16-NEXT: buffer_load_b32 v6 , v9, s[4:7], null offen
4321
4321
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
4322
4322
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
4323
4323
; GFX12-TRUE16-NEXT: ; %bb.2:
4324
4324
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
4325
- ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
4326
4325
; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
4327
4326
; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
4328
4327
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
4329
4328
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
4330
4329
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4331
- ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
4330
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6
4332
4331
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
4333
4332
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
4334
4333
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4335
- ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6
4336
- ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4 .l, v4 .l, v7 .l
4334
+ ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
4335
+ ; GFX12-TRUE16-NEXT: v_add_f16_e32 v6 .l, v6 .l, v5 .l
4337
4336
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4338
- ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4 , 0xffff, v4
4339
- ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4
4337
+ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6 , 0xffff, v6
4338
+ ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
4340
4339
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4341
- ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6 , v10, v4
4342
- ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
4343
- ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
4340
+ ; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8 , v10, v6
4341
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
4342
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
4344
4343
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
4345
4344
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
4346
4345
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4355,21 +4354,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4355
4354
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
4356
4355
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
4357
4356
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4358
- ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5 ], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
4357
+ ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7 ], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
4359
4358
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
4360
4359
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
4361
4360
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4362
4361
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
4363
4362
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4364
- ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
4363
+ ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
4365
4364
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
4366
4365
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
4367
4366
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
4368
4367
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
4369
4368
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
4370
4369
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
4371
4370
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
4372
- ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4
4371
+ ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
4373
4372
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4374
4373
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
4375
4374
;
@@ -4527,16 +4526,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4527
4526
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4528
4527
; GFX11-TRUE16: ; %bb.0:
4529
4528
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4530
- ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4 , 0x200, v4
4529
+ ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6 , 0x200, v4
4531
4530
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
4532
4531
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
4533
4532
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4534
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6 , 3, v4
4535
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4
4536
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8 , 3, v6
4533
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4 , 3, v6
4534
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6
4535
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4 , 3, v4
4537
4536
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4538
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8 , 0xffff
4539
- ; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v6
4537
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4 , 0xffff
4538
+ ; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7
4540
4539
; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
4541
4540
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
4542
4541
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4548,30 +4547,29 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4548
4547
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
4549
4548
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4550
4549
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
4551
- ; GFX11-TRUE16-NEXT: buffer_load_b32 v4 , v9, s[4:7], 0 offen
4550
+ ; GFX11-TRUE16-NEXT: buffer_load_b32 v6 , v9, s[4:7], 0 offen
4552
4551
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
4553
4552
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
4554
4553
; GFX11-TRUE16-NEXT: ; %bb.2:
4555
4554
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
4556
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
4557
4555
; GFX11-TRUE16-NEXT: .p2align 6
4558
4556
; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
4559
4557
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
4560
4558
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
4561
4559
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4562
- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
4560
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6
4563
4561
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
4564
4562
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
4565
4563
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4566
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6
4567
- ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4 .l, v4 .l, v7 .l
4564
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
4565
+ ; GFX11-TRUE16-NEXT: v_add_f16_e32 v6 .l, v6 .l, v5 .l
4568
4566
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4569
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4 , 0xffff, v4
4570
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4
4567
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6 , 0xffff, v6
4568
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
4571
4569
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4572
- ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6 , v10, v4
4573
- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
4574
- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
4570
+ ; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8 , v10, v6
4571
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
4572
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
4575
4573
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
4576
4574
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
4577
4575
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4585,13 +4583,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4585
4583
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4586
4584
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
4587
4585
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4588
- ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5 ], v9, s[4:7], 0 offen glc
4586
+ ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7 ], v9, s[4:7], 0 offen glc
4589
4587
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
4590
4588
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
4591
4589
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4592
4590
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
4593
4591
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4594
- ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
4592
+ ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
4595
4593
; GFX11-TRUE16-NEXT: buffer_gl1_inv
4596
4594
; GFX11-TRUE16-NEXT: buffer_gl0_inv
4597
4595
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -4600,7 +4598,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4600
4598
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
4601
4599
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
4602
4600
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
4603
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4
4601
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
4604
4602
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
4605
4603
;
4606
4604
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
0 commit comments