diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9587fad1ecd63..94e1175b06b14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[
   canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
 ]>;
 
+// Early select of uniform BFX into S_BFE instructions.
+// These instructions encode the offset/width in a packed operand that has to
+// be built with bitwise operations. Selecting these instructions early allows
+// the combiner to potentially fold those operations.
+class lower_uniform_bfx<Instruction bfx> : GICombineRule<
+  (defs root:$bfx),
+  (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>;
+
+def lower_uniform_sbfx : lower_uniform_bfx<G_SBFX>;
+def lower_uniform_ubfx : lower_uniform_bfx<G_UBFX>;
+
 let Predicates = [Has16BitInsts, NotHasMed3_16] in {
 // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
 // saves one instruction compared to the promotion.
@@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner<
   zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
   identity_combines, redundant_and, constant_fold_cast_op,
-  cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
+  cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
+  lower_uniform_sbfx, lower_uniform_ubfx]> {
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index ee324a5e93f0f..257acfd7f79b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner {
   void applyCanonicalizeZextShiftAmt(MachineInstr &MI,
                                      MachineInstr &Ext) const;
 
+  bool lowerUniformBFX(MachineInstr &MI) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
   MI.eraseFromParent();
 }
 
+bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
+  assert(MI.getOpcode() == TargetOpcode::G_UBFX ||
+         MI.getOpcode() == TargetOpcode::G_SBFX);
+  const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI);
+  assert(RB && "No RB?");
+  if (RB->getID() != AMDGPU::SGPRRegBankID)
+    return false;
+
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register OffsetReg = MI.getOperand(2).getReg();
+  Register WidthReg = MI.getOperand(3).getReg();
+
+  const LLT S32 = LLT::scalar(32);
+  LLT Ty = MRI.getType(DstReg);
+
+  const unsigned Opc = Ty == S32
+                           ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+                           : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+
+  // Pack the offset and width of a BFE into the format expected by
+  // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
+  // the offset and bits [22:16] the width. The 64-bit variants use bits [6:0]
+  // for the offset.
+  //
+  // If a value takes more than 5/6 bits, the G_U/SBFX is ill-formed, so we do
+  // not clamp the inputs. We assume they are in range, and if they are not,
+  // it is UB anyway.
+
+  // The shift zeros out the low bits, so don't bother clamping the width.
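+  // For example, with offset = 7 and width = 5 (the constants used by the new
+  // MIR tests below), the packed operand is (5 << 16) | 7 = 0x50007.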
+  auto ShiftAmt = B.buildConstant(S32, 16);
+  auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt);
+
+  auto MergedInputs = B.buildOr(S32, OffsetReg, ShiftWidth);
+
+  MRI.setRegBank(ShiftAmt.getReg(0), *RB);
+  MRI.setRegBank(ShiftWidth.getReg(0), *RB);
+  MRI.setRegBank(MergedInputs.getReg(0), *RB);
+
+  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+  if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
+    llvm_unreachable("failed to constrain BFE");
+
+  MI.eraseFromParent();
+  return true;
+}
+
 SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
   return MF.getInfo<SIMachineFunctionInfo>()->getMode();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 6874657a4ffe7..140c2babb013f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1492,88 +1492,73 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
 
   const RegisterBank *DstBank =
-    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
-  if (DstBank == &AMDGPU::VGPRRegBank) {
-    if (Ty == S32)
-      return true;
-
-    // There is no 64-bit vgpr bitfield extract instructions so the operation
-    // is expanded to a sequence of instructions that implement the operation.
-    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
 
-    const LLT S64 = LLT::scalar(64);
-    // Shift the source operand so that extracted bits start at bit 0.
-    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
-                              : B.buildLShr(S64, SrcReg, OffsetReg);
-    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
-
-    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
-    // if the width is a constant.
-    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
-      // Use the 32-bit bitfield extract instruction if the width is a constant.
-      // Depending on the width size, use either the low or high 32-bits.
-      auto Zero = B.buildConstant(S32, 0);
-      auto WidthImm = ConstWidth->Value.getZExtValue();
-      if (WidthImm <= 32) {
-        // Use bitfield extract on the lower 32-bit source, and then sign-extend
-        // or clear the upper 32-bits.
-        auto Extract =
-            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
-                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
-        auto Extend =
-            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
-        B.buildMergeLikeInstr(DstReg, {Extract, Extend});
-      } else {
-        // Use bitfield extract on upper 32-bit source, and combine with lower
-        // 32-bit source.
-        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
-        auto Extract =
-            Signed
-                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
-                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
-        B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
-      }
-      MI.eraseFromParent();
+  if (DstBank != &AMDGPU::VGPRRegBank) {
+    // SGPR: Canonicalize to a G_S/UBFX
+    if (!isa<GIntrinsic>(MI))
       return true;
-    }
 
-    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
-    // operations.
-    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
-    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
     if (Signed)
-      B.buildAShr(S64, SignBit, ExtShift);
+      B.buildSbfx(DstReg, SrcReg, OffsetReg, WidthReg);
     else
-      B.buildLShr(S64, SignBit, ExtShift);
+      B.buildUbfx(DstReg, SrcReg, OffsetReg, WidthReg);
     MI.eraseFromParent();
     return true;
   }
 
-  // The scalar form packs the offset and width in a single operand.
-
-  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
-
-  // Ensure the high bits are clear to insert the offset.
-  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
-  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
-
-  // Zeros out the low bits, so don't bother clamping the input value.
-  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
-
-  // Transformation function, pack the offset and width of a BFE into
-  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
-  // source, bits [5:0] contain the offset and bits [22:16] the width.
-  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+  // VGPR
+  if (Ty == S32)
+    return true;
 
-  // TODO: It might be worth using a pseudo here to avoid scc clobber and
-  // register class constraints.
-  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
-                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+  // There is no 64-bit vgpr bitfield extract instructions so the operation
+  // is expanded to a sequence of instructions that implement the operation.
+  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
 
-  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
-  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
-    llvm_unreachable("failed to constrain BFE");
+  const LLT S64 = LLT::scalar(64);
+  // Shift the source operand so that extracted bits start at bit 0.
+  auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
+                            : B.buildLShr(S64, SrcReg, OffsetReg);
+  auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
+
+  // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
+  // if the width is a constant.
+  if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
+    // Use the 32-bit bitfield extract instruction if the width is a constant.
+    // Depending on the width size, use either the low or high 32-bits.
+    auto Zero = B.buildConstant(S32, 0);
+    auto WidthImm = ConstWidth->Value.getZExtValue();
+    if (WidthImm <= 32) {
+      // Use bitfield extract on the lower 32-bit source, and then sign-extend
+      // or clear the upper 32-bits.
+      auto Extract =
+          Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
+                 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
+      auto Extend =
+          Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
+      B.buildMergeLikeInstr(DstReg, {Extract, Extend});
+    } else {
+      // Use bitfield extract on upper 32-bit source, and combine with lower
+      // 32-bit source.
+      auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
+      auto Extract =
+          Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
+                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
+      B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
+    }
+    MI.eraseFromParent();
+    return true;
+  }
+  // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
+  // operations.
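+  // For instance (illustrative numbers only): with Width = 40, ExtShift is
+  // 24, so the field is shifted up until its top bit sits at bit 63 and then
+  // shifted back down, sign- or zero-filling bits [63:40].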
+  auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
+  auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+  if (Signed)
+    B.buildAShr(S64, SignBit, ExtShift);
+  else
+    B.buildLShr(S64, SignBit, ExtShift);
 
   MI.eraseFromParent();
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index 45bade21385be..0e65e1a23788b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -14,7 +14,6 @@ define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
 define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
 ; GFX6-LABEL: s_bfe_i32_arg_arg_arg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 63
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, s1
@@ -32,7 +31,6 @@ define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i3
 define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
 ; GFX6-LABEL: s_bfe_i64_arg_arg_arg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s2, s2, 63
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_bfe_i64 s[0:1], s[0:1], s2
@@ -46,7 +44,6 @@ define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s3, s3, 63
 ; GFX6-NEXT:    s_or_b32 s3, s3, 0x7b0000
 ; GFX6-NEXT:    s_bfe_i32 s3, s2, s3
 ; GFX6-NEXT:    s_mov_b32 s2, -1
@@ -65,7 +62,7 @@ define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0,
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s3, 59, s3
+; GFX6-NEXT:    s_or_b32 s3, 0x7b, s3
 ; GFX6-NEXT:    s_bfe_i32 s3, s2, s3
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s3
@@ -82,9 +79,8 @@ define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s2, 63
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s3, s4, s3
+; GFX6-NEXT:    s_or_b32 s3, s2, s3
 ; GFX6-NEXT:    s_bfe_i32 s3, 0x7b, s3
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s3
@@ -120,7 +116,6 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s3, s3, 63
 ; GFX6-NEXT:    s_bfe_i32 s3, s2, s3
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
index d327c15ae547f..6d435ace64599 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -14,7 +14,6 @@ define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
 define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
 ; GFX6-LABEL: s_bfe_i32_arg_arg_arg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s1, s1, 63
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, s1
@@ -32,7 +31,6 @@ define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i3
 define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
 ; GFX6-LABEL: s_bfe_i64_arg_arg_arg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s2, s2, 63
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_bfe_u64 s[0:1], s[0:1], s2
@@ -46,9 +44,8 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s3, 63
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s3, s4, s3
+; GFX6-NEXT:    s_lshl_b32 s4, s3, 16
+; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_bfe_u32 s3, s2, s3
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s3
@@ -65,7 +62,6 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s3, s3, 63
 ; GFX6-NEXT:    s_or_b32 s3, s3, 0x7b0000
 ; GFX6-NEXT:    s_bfe_u32 s3, s2, s3
 ; GFX6-NEXT:    s_mov_b32 s2, -1
@@ -84,7 +80,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0,
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s3, 59, s3
+; GFX6-NEXT:    s_or_b32 s3, 0x7b, s3
 ; GFX6-NEXT:    s_bfe_u32 s3, s2, s3
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s3
@@ -101,9 +97,8 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s4, s2, 63
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s3, s4, s3
+; GFX6-NEXT:    s_or_b32 s3, s2, s3
 ; GFX6-NEXT:    s_bfe_u32 s3, 0x7b, s3
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s3
@@ -120,7 +115,6 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s3, s3, 63
 ; GFX6-NEXT:    s_bfe_u32 s3, s2, s3
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-lower-bfx.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-lower-bfx.mir
new file mode 100644
index 0000000000000..3b7cb4158a897
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-lower-bfx.mir
@@ -0,0 +1,107 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: test_s_bfe_i32__constants
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: test_s_bfe_i32__constants
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %reg:sreg_32(s32) = COPY $sgpr0
+    ; CHECK-NEXT: %width:sgpr(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %offset:sgpr(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL %width, [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:sreg_32(s32) = G_OR %offset, [[SHL]]
+    ; CHECK-NEXT: %bfx:sreg_32(s32) = S_BFE_I32 %reg(s32), [[OR]](s32), implicit-def $scc
+    ; CHECK-NEXT: $sgpr0 = COPY %bfx(s32)
+    %reg:sgpr(s32) = COPY $sgpr0
+    %width:sgpr(s32) = G_CONSTANT i32 5
+    %offset:sgpr(s32) = G_CONSTANT i32 7
+    %bfx:sgpr(s32) = G_SBFX %reg, %offset, %width
+    $sgpr0 = COPY %bfx
+...
+---
+name: test_s_bfe_u32__constants
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: test_s_bfe_u32__constants
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %reg:sreg_32(s32) = COPY $sgpr0
+    ; CHECK-NEXT: %width:sgpr(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %offset:sgpr(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL %width, [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:sreg_32(s32) = G_OR %offset, [[SHL]]
+    ; CHECK-NEXT: %bfx:sreg_32(s32) = S_BFE_U32 %reg(s32), [[OR]](s32), implicit-def $scc
+    ; CHECK-NEXT: $sgpr0 = COPY %bfx(s32)
+    %reg:sgpr(s32) = COPY $sgpr0
+    %width:sgpr(s32) = G_CONSTANT i32 5
+    %offset:sgpr(s32) = G_CONSTANT i32 7
+    %bfx:sgpr(s32) = G_UBFX %reg, %offset, %width
+    $sgpr0 = COPY %bfx
+...
+---
+name: test_s_bfe_i64__constants
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: test_s_bfe_i64__constants
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %reg:sreg_64(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: %width:sgpr(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %offset:sgpr(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL %width, [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:sreg_32(s32) = G_OR %offset, [[SHL]]
+    ; CHECK-NEXT: %bfx:sreg_64(s64) = S_BFE_I64 %reg(s64), [[OR]](s32), implicit-def $scc
+    ; CHECK-NEXT: $sgpr0_sgpr1 = COPY %bfx(s64)
+    %reg:sgpr(s64) = COPY $sgpr0_sgpr1
+    %width:sgpr(s32) = G_CONSTANT i32 5
+    %offset:sgpr(s32) = G_CONSTANT i32 7
+    %bfx:sgpr(s64) = G_SBFX %reg, %offset, %width
+    $sgpr0_sgpr1 = COPY %bfx
+...
+---
+name: test_s_bfe_u64__constants
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: test_s_bfe_u64__constants
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %reg:sreg_64(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: %width:sgpr(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %offset:sgpr(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL %width, [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:sreg_32(s32) = G_OR %offset, [[SHL]]
+    ; CHECK-NEXT: %bfx:sreg_64(s64) = S_BFE_U64 %reg(s64), [[OR]](s32), implicit-def $scc
+    ; CHECK-NEXT: $sgpr0_sgpr1 = COPY %bfx(s64)
+    %reg:sgpr(s64) = COPY $sgpr0_sgpr1
+    %width:sgpr(s32) = G_CONSTANT i32 5
+    %offset:sgpr(s32) = G_CONSTANT i32 7
+    %bfx:sgpr(s64) = G_UBFX %reg, %offset, %width
+    $sgpr0_sgpr1 = COPY %bfx
+...
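
As a rough sketch of what the new combine does (mirroring the test_s_bfe_u32__constants check lines above; %c16, %shl and %packed are illustrative names, the real output uses numbered virtual registers), a uniform

    %bfx:sgpr(s32) = G_UBFX %reg, %offset, %width

becomes

    %c16:sgpr(s32) = G_CONSTANT i32 16
    %shl:sgpr(s32) = G_SHL %width, %c16(s32)
    %packed:sreg_32(s32) = G_OR %offset, %shl
    %bfx:sreg_32(s32) = S_BFE_U32 %reg(s32), %packed(s32), implicit-def $scc

Because the G_SHL/G_OR are emitted while the combiner is still running, later combines can fold them, e.g. into a single immediate when the offset and width are constants, which is the motivation stated in the AMDGPUCombine.td comment.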