@@ -4295,15 +4295,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4295
4295
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
4296
4296
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
4297
4297
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
4298
- ; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6 , 0x200, v4
4298
+ ; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4 , 0x200, v4
4299
4299
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
4300
4300
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4301
- ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4 , 3, v6
4302
- ; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6
4303
- ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4 , 3, v4
4301
+ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6 , 3, v4
4302
+ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4
4303
+ ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8 , 3, v6
4304
4304
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4305
- ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4 , 0xffff
4306
- ; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7
4305
+ ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8 , 0xffff
4306
+ ; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v6
4307
4307
; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
4308
4308
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
4309
4309
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4317,29 +4317,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4317
4317
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
4318
4318
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
4319
4319
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4320
- ; GFX12-TRUE16-NEXT: buffer_load_b32 v6 , v9, s[4:7], null offen
4320
+ ; GFX12-TRUE16-NEXT: buffer_load_b32 v4 , v9, s[4:7], null offen
4321
4321
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
4322
4322
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
4323
4323
; GFX12-TRUE16-NEXT: ; %bb.2:
4324
4324
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
4325
+ ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
4325
4326
; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
4326
4327
; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
4327
4328
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
4328
4329
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
4329
4330
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4330
- ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6
4331
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
4331
4332
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
4332
4333
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
4333
4334
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4334
- ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
4335
- ; GFX12-TRUE16-NEXT: v_add_f16_e32 v6 .l, v6 .l, v5 .l
4335
+ ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6
4336
+ ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4 .l, v4 .l, v7 .l
4336
4337
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4337
- ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6 , 0xffff, v6
4338
- ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
4338
+ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4 , 0xffff, v4
4339
+ ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4
4339
4340
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4340
- ; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8 , v10, v6
4341
- ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
4342
- ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
4341
+ ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6 , v10, v4
4342
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
4343
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
4343
4344
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
4344
4345
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
4345
4346
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4354,21 +4355,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4354
4355
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
4355
4356
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
4356
4357
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4357
- ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7 ], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
4358
+ ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5 ], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
4358
4359
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
4359
4360
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
4360
4361
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4361
4362
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
4362
4363
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4363
- ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
4364
+ ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
4364
4365
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
4365
4366
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
4366
4367
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
4367
4368
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
4368
4369
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
4369
4370
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
4370
4371
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
4371
- ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
4372
+ ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4
4372
4373
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4373
4374
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
4374
4375
;
@@ -4526,16 +4527,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4526
4527
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
4527
4528
; GFX11-TRUE16: ; %bb.0:
4528
4529
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4529
- ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6 , 0x200, v4
4530
+ ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4 , 0x200, v4
4530
4531
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
4531
4532
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
4532
4533
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4533
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4 , 3, v6
4534
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6
4535
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4 , 3, v4
4534
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6 , 3, v4
4535
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4
4536
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8 , 3, v6
4536
4537
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4537
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4 , 0xffff
4538
- ; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7
4538
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8 , 0xffff
4539
+ ; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v6
4539
4540
; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
4540
4541
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
4541
4542
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4547,29 +4548,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4547
4548
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
4548
4549
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4549
4550
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
4550
- ; GFX11-TRUE16-NEXT: buffer_load_b32 v6 , v9, s[4:7], 0 offen
4551
+ ; GFX11-TRUE16-NEXT: buffer_load_b32 v4 , v9, s[4:7], 0 offen
4551
4552
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
4552
4553
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
4553
4554
; GFX11-TRUE16-NEXT: ; %bb.2:
4554
4555
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
4556
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
4555
4557
; GFX11-TRUE16-NEXT: .p2align 6
4556
4558
; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
4557
4559
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
4558
4560
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
4559
4561
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4560
- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6
4562
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
4561
4563
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
4562
4564
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
4563
4565
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4564
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
4565
- ; GFX11-TRUE16-NEXT: v_add_f16_e32 v6 .l, v6 .l, v5 .l
4566
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6
4567
+ ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4 .l, v4 .l, v7 .l
4566
4568
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4567
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6 , 0xffff, v6
4568
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
4569
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4 , 0xffff, v4
4570
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4
4569
4571
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4570
- ; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8 , v10, v6
4571
- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
4572
- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
4572
+ ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6 , v10, v4
4573
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
4574
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
4573
4575
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
4574
4576
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
4575
4577
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4583,13 +4585,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4583
4585
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4584
4586
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
4585
4587
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4586
- ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7 ], v9, s[4:7], 0 offen glc
4588
+ ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5 ], v9, s[4:7], 0 offen glc
4587
4589
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
4588
4590
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
4589
4591
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
4590
4592
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
4591
4593
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4592
- ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
4594
+ ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
4593
4595
; GFX11-TRUE16-NEXT: buffer_gl1_inv
4594
4596
; GFX11-TRUE16-NEXT: buffer_gl0_inv
4595
4597
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -4598,7 +4600,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
4598
4600
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
4599
4601
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
4600
4602
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
4601
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
4603
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4
4602
4604
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
4603
4605
;
4604
4606
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
0 commit comments