diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8554db0a1220c..6ca243990c468 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15136,21 +15136,31 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) { } } - // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller - // than X, and the And doesn't change the lower iX bits, we can move the - // AssertZext in front of the And and drop the AssertSext. if (Opcode == ISD::AssertZext && N0.getOpcode() == ISD::AND && - N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::AssertSext && - isa(N0.getOperand(1))) { - SDValue BigA = N0.getOperand(0); - EVT BigA_AssertVT = cast(BigA.getOperand(1))->getVT(); + N0.hasOneUse() && isa(N0.getOperand(1))) { const APInt &Mask = N0.getConstantOperandAPInt(1); - if (AssertVT.bitsLT(BigA_AssertVT) && - Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) { + + // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller + // than X, and the And doesn't change the lower iX bits, we can move the + // AssertZext in front of the And and drop the AssertSext. + if (N0.getOperand(0).getOpcode() == ISD::AssertSext) { + SDValue BigA = N0.getOperand(0); + EVT BigA_AssertVT = cast(BigA.getOperand(1))->getVT(); + if (AssertVT.bitsLT(BigA_AssertVT) && + Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) { + SDLoc DL(N); + SDValue NewAssert = + DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1); + return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert, + N0.getOperand(1)); + } + } + + // Remove AssertZext entirely if the mask guarantees the assertion cannot + // fail. + if (Mask.isMask() && Mask.countr_one() <= AssertVT.getScalarSizeInBits()) { SDLoc DL(N); - SDValue NewAssert = - DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1); - return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert, + return DAG.getNode(ISD::AND, DL, N0.getValueType(), N0.getOperand(0), N0.getOperand(1)); } } diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 9cf9d81773037..7a542d2e9c514 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -442,9 +442,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:456 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index b5e579b78a59c..b25d9b245f5f6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -714,10 +714,10 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -732,9 +732,9 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -769,8 +769,8 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX942-LABEL: store_load_vindex_kernel: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_lshl_b32 s0, s0, 7 @@ -809,10 +809,10 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX11-PAL-LABEL: store_load_vindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -827,9 +827,9 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX12-PAL-LABEL: store_load_vindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -1958,10 +1958,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1976,10 +1976,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -2021,8 +2021,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_lshl_b32 s0, s0, 7 @@ -2092,10 +2092,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -2110,10 +2110,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -3254,10 +3254,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3274,10 +3274,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX12-LABEL: store_load_vindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16512 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3319,8 +3319,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_lshl_b32 s0, s0, 7 @@ -3391,10 +3391,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3411,10 +3411,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16512 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 565ad295ebbb3..08c0d15432915 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -15,8 +15,8 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -153,8 +153,8 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -289,8 +289,8 @@ define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias ; GCN-LABEL: test_iglp_opt_asm_sideeffect: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index 6507976872410..46359f7e99059 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0 +; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 @@ -74,9 +74,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 @@ -178,9 +178,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 @@ -260,9 +260,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index 02e80b62fed6e..dcc3e0df0c744 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GCN-NEXT: v_mov_b32_e32 v48, 0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 @@ -60,10 +60,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll index 371b4f070094d..0764cd5d34d75 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN-MINREG: ; %bb.0: ; %entry ; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) @@ -140,8 +140,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN-MAXOCC: ; %bb.0: ; %entry ; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) @@ -274,8 +274,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN-ILP: ; %bb.0: ; %entry ; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) @@ -469,8 +469,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: ; GCN-MINREG: ; %bb.0: ; %entry ; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MINREG-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v0, 2.0 ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) @@ -604,8 +604,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: ; GCN-MAXOCC: ; %bb.0: ; %entry ; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v3, 7, v0 +; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-MAXOCC-NEXT: v_and_b32_e32 v3, 0x1ff80, v0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0 ; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0) @@ -739,8 +739,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: ; GCN-ILP: ; %bb.0: ; %entry ; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-ILP-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-ILP-NEXT: v_and_b32_e32 v2, 0x1ff80, v0 ; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 73586b1243376..37f335561a52c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -621,8 +621,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 @@ -728,8 +728,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 @@ -871,8 +871,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,8 +1005,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1202,7 +1202,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v4, s0, v3 ; GCN-NEXT: v_rndne_f32_e32 v5, v4 @@ -1212,7 +1212,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_add_f32_e32 v4, v6, v4 ; GCN-NEXT: v_exp_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_add_u32_e32 v1, s6, v0 ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:96 @@ -1387,7 +1387,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s0, v3 ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v5, v4 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index 44415657b6336..858921659ae7a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -208,11 +208,11 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; ; GFX11-LABEL: dpp_test1: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: ds_load_b32 v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_barrier diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 2bda61ab950f7..4e1d2a754fa61 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -146,8 +146,8 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr ; GCN-LABEL: mubuf_clause: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v31 +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff0, v2 ; GCN-NEXT: v_add_u32_e32 v0, v0, v2 ; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:12 ; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8 @@ -205,8 +205,8 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr ; GCN-SCRATCH-LABEL: mubuf_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SCRATCH-NEXT: v_and_b32_e32 v2, 0x3ff, v31 -; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v18, 4, v2 +; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v2, 4, v31 +; GCN-SCRATCH-NEXT: v_and_b32_e32 v18, 0x3ff0, v2 ; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v0, v18 ; GCN-SCRATCH-NEXT: s_clause 0x3 ; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[2:5], v0, off