From bb51f44c03c9a10113ac53e1647c9475fc1f175c Mon Sep 17 00:00:00 2001 From: pvanhout Date: Thu, 26 Jun 2025 12:48:44 +0200 Subject: [PATCH 1/2] [AMDGPU][DAG] Remove AssertZext before some intrinsics --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 18 +++++++ llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll | 4 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 54 +++++++++---------- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 6 +-- .../llvm.amdgcn.sched.group.barrier.gfx11.ll | 16 +++--- .../llvm.amdgcn.sched.group.barrier.gfx12.ll | 8 +-- ...vm.amdgcn.sched.group.barrier.iterative.ll | 20 +++---- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 16 +++--- .../CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll | 4 +- ...kitem.id-unsupported-calling-convention.ll | 16 ++---- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 8 +-- 11 files changed, 91 insertions(+), 79 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index d75c7a178b4a8..aa7aca1c83b85 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3985,6 +3985,24 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, } } + // AssertZext in front of these intrinsics is not necessary, the lowering of + // the intrinsics into a register read will insert one if it is needed. + if (N->getOpcode() == ISD::AssertZext && + N0.getOpcode() == ISD::INTRINSIC_WO_CHAIN) { + unsigned IID = N0.getConstantOperandVal(0); + switch (IID) { + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::amdgcn_workgroup_id_x: + case Intrinsic::amdgcn_workgroup_id_y: + case Intrinsic::amdgcn_workgroup_id_z: + return N0; + default: + break; + } + } + return SDValue(); } diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 9cf9d81773037..7a542d2e9c514 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -442,9 +442,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:456 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index b5e579b78a59c..b25d9b245f5f6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -714,10 +714,10 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -732,9 +732,9 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -769,8 +769,8 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX942-LABEL: store_load_vindex_kernel: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_lshl_b32 s0, s0, 7 @@ -809,10 +809,10 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX11-PAL-LABEL: store_load_vindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -827,9 +827,9 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX12-PAL-LABEL: store_load_vindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -1958,10 +1958,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1976,10 +1976,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -2021,8 +2021,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_lshl_b32 s0, s0, 7 @@ -2092,10 +2092,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -2110,10 +2110,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -3254,10 +3254,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3274,10 +3274,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX12-LABEL: store_load_vindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16512 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3319,8 +3319,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_lshl_b32 s0, s0, 7 @@ -3391,10 +3391,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3411,10 +3411,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16512 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 565ad295ebbb3..08c0d15432915 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -15,8 +15,8 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -153,8 +153,8 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -289,8 +289,8 @@ define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias ; GCN-LABEL: test_iglp_opt_asm_sideeffect: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index 6507976872410..46359f7e99059 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0 +; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 @@ -74,9 +74,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 @@ -178,9 +178,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 @@ -260,9 +260,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index 02e80b62fed6e..dcc3e0df0c744 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GCN-NEXT: v_mov_b32_e32 v48, 0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 @@ -60,10 +60,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll index 371b4f070094d..0764cd5d34d75 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN-MINREG: ; %bb.0: ; %entry ; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) @@ -140,8 +140,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN-MAXOCC: ; %bb.0: ; %entry ; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) @@ -274,8 +274,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN-ILP: ; %bb.0: ; %entry ; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) @@ -469,8 +469,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: ; GCN-MINREG: ; %bb.0: ; %entry ; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v0, 2.0 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) @@ -604,8 +604,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: ; GCN-MAXOCC: ; %bb.0: ; %entry ; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v3, 7, v0 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v3, 0x1ff80, v0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) @@ -739,8 +739,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: ; GCN-ILP: ; %bb.0: ; %entry ; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-ILP-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 ; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 73586b1243376..37f335561a52c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -621,8 +621,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 @@ -728,8 +728,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 @@ -871,8 +871,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,8 +1005,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1202,7 +1202,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v4, s0, v3 ; GCN-NEXT: v_rndne_f32_e32 v5, v4 @@ -1212,7 +1212,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_add_f32_e32 v4, v6, v4 ; GCN-NEXT: v_exp_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_add_u32_e32 v1, s6, v0 ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:96 @@ -1387,7 +1387,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s0, v3 ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v5, v4 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index 44415657b6336..858921659ae7a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -208,11 +208,11 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; ; GFX11-LABEL: dpp_test1: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: ds_load_b32 v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_barrier diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll index 37cfb6cc10180..f21c17449dd7d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll @@ -9,16 +9,9 @@ declare i32 @llvm.amdgcn.workitem.id.z() ; FIXME: It's not worth adding AssertZext to the intrinsic calls, and ; we don't fold out assertzext undef ->undef define amdgpu_ps void @undefined_workitems(ptr addrspace(1) %p, ptr addrspace(1) %q, ptr addrspace(1) %r) { -; SDAG-LABEL: undefined_workitems: -; SDAG: ; %bb.0: -; SDAG-NEXT: global_store_dword v[0:1], v0, off -; SDAG-NEXT: global_store_dword v[2:3], v0, off -; SDAG-NEXT: global_store_dword v[4:5], v0, off -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: undefined_workitems: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_endpgm +; CHECK-LABEL: undefined_workitems: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() store i32 %id.x, ptr addrspace(1) %p %id.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -28,4 +21,5 @@ define amdgpu_ps void @undefined_workitems(ptr addrspace(1) %p, ptr addrspace(1) ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 2bda61ab950f7..4e1d2a754fa61 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -146,8 +146,8 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr ; GCN-LABEL: mubuf_clause: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v31 +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff0, v2 ; GCN-NEXT: v_add_u32_e32 v0, v0, v2 ; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:12 ; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8 @@ -205,8 +205,8 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr ; GCN-SCRATCH-LABEL: mubuf_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SCRATCH-NEXT: v_and_b32_e32 v2, 0x3ff, v31 -; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v18, 4, v2 +; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v2, 4, v31 +; GCN-SCRATCH-NEXT: v_and_b32_e32 v18, 0x3ff0, v2 ; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v0, v18 ; GCN-SCRATCH-NEXT: s_clause 0x3 ; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[2:5], v0, off From 444c4702c1bcd076fb978d4904395e1f3f4a71ac Mon Sep 17 00:00:00 2001 From: pvanhout Date: Mon, 30 Jun 2025 11:09:09 +0200 Subject: [PATCH 2/2] change approach --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34 ++++++++++++------- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 18 ---------- ...kitem.id-unsupported-calling-convention.ll | 16 ++++++--- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8554db0a1220c..6ca243990c468 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15136,21 +15136,31 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) { } } - // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller - // than X, and the And doesn't change the lower iX bits, we can move the - // AssertZext in front of the And and drop the AssertSext. if (Opcode == ISD::AssertZext && N0.getOpcode() == ISD::AND && - N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::AssertSext && - isa(N0.getOperand(1))) { - SDValue BigA = N0.getOperand(0); - EVT BigA_AssertVT = cast(BigA.getOperand(1))->getVT(); + N0.hasOneUse() && isa(N0.getOperand(1))) { const APInt &Mask = N0.getConstantOperandAPInt(1); - if (AssertVT.bitsLT(BigA_AssertVT) && - Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) { + + // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller + // than X, and the And doesn't change the lower iX bits, we can move the + // AssertZext in front of the And and drop the AssertSext. + if (N0.getOperand(0).getOpcode() == ISD::AssertSext) { + SDValue BigA = N0.getOperand(0); + EVT BigA_AssertVT = cast(BigA.getOperand(1))->getVT(); + if (AssertVT.bitsLT(BigA_AssertVT) && + Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) { + SDLoc DL(N); + SDValue NewAssert = + DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1); + return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert, + N0.getOperand(1)); + } + } + + // Remove AssertZext entirely if the mask guarantees the assertion cannot + // fail. + if (Mask.isMask() && Mask.countr_one() <= AssertVT.getScalarSizeInBits()) { SDLoc DL(N); - SDValue NewAssert = - DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1); - return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert, + return DAG.getNode(ISD::AND, DL, N0.getValueType(), N0.getOperand(0), N0.getOperand(1)); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index aa7aca1c83b85..d75c7a178b4a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3985,24 +3985,6 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, } } - // AssertZext in front of these intrinsics is not necessary, the lowering of - // the intrinsics into a register read will insert one if it is needed. - if (N->getOpcode() == ISD::AssertZext && - N0.getOpcode() == ISD::INTRINSIC_WO_CHAIN) { - unsigned IID = N0.getConstantOperandVal(0); - switch (IID) { - case Intrinsic::amdgcn_workitem_id_x: - case Intrinsic::amdgcn_workitem_id_y: - case Intrinsic::amdgcn_workitem_id_z: - case Intrinsic::amdgcn_workgroup_id_x: - case Intrinsic::amdgcn_workgroup_id_y: - case Intrinsic::amdgcn_workgroup_id_z: - return N0; - default: - break; - } - } - return SDValue(); } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll index f21c17449dd7d..37cfb6cc10180 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll @@ -9,9 +9,16 @@ declare i32 @llvm.amdgcn.workitem.id.z() ; FIXME: It's not worth adding AssertZext to the intrinsic calls, and ; we don't fold out assertzext undef ->undef define amdgpu_ps void @undefined_workitems(ptr addrspace(1) %p, ptr addrspace(1) %q, ptr addrspace(1) %r) { -; CHECK-LABEL: undefined_workitems: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_endpgm +; SDAG-LABEL: undefined_workitems: +; SDAG: ; %bb.0: +; SDAG-NEXT: global_store_dword v[0:1], v0, off +; SDAG-NEXT: global_store_dword v[2:3], v0, off +; SDAG-NEXT: global_store_dword v[4:5], v0, off +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: undefined_workitems: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() store i32 %id.x, ptr addrspace(1) %p %id.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -21,5 +28,4 @@ define amdgpu_ps void @undefined_workitems(ptr addrspace(1) %p, ptr addrspace(1) ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GISEL: {{.*}} -; SDAG: {{.*}} +; CHECK: {{.*}}