@@ -4299,15 +4299,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
- ; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+ ; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
- ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
- ; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4
- ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6
+ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
+ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6
+ ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
- ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff
- ; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v6
+ ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+ ; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7
; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4326,24 +4326,24 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-TRUE16-NEXT: ; %bb.2:
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
- ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
- ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
- ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v7.l
- ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+ ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
+ ; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
- ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4
- ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4
- ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
- ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+ ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+ ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+ ; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4358,22 +4358,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
- ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
+ ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
- ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
- ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+ ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
- ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4
+ ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4530,16 +4529,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+ ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff
- ; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v6
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+ ; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7
; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4556,24 +4555,24 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-TRUE16-NEXT: ; %bb.2:
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
- ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v7.l
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
+ ; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4
- ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+ ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+ ; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4587,14 +4586,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
- ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], 0 offen glc
+ ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
- ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+ ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -4603,7 +4601,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: