From b7007602276d7befd35a7836ba04e8d9ee8454d1 Mon Sep 17 00:00:00 2001
From: Yu Li
Date: Tue, 1 Jul 2025 10:20:01 +0000
Subject: [PATCH 1/4] [GlobalISel] Allow expanding of sdiv -> mul by constant
 combine for general case

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |    4 +
 .../include/llvm/Target/GlobalISel/Combine.td |    4 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  134 +-
 .../AArch64/GlobalISel/combine-sdiv.ll        | 1663 +++++++++++++++++
 .../AArch64/GlobalISel/combine-sdiv.mir       |   11 +-
 llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll  |    9 +-
 .../CodeGen/AArch64/arm64-neon-mul-div-cte.ll |  114 +-
 llvm/test/CodeGen/AArch64/select_const.ll     |   11 +-
 .../combine-fold-binop-into-select.mir        |    4 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll     |   16 +-
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll     |  125 +-
 .../RISCV/GlobalISel/div-by-constant.ll       |  511 +++--
 12 files changed, 2289 insertions(+), 317 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 7d7b5364d6b68..d47b5a0ad40bd 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -143,6 +143,10 @@ class CombinerHelper {
   /// Query is legal on the target.
   bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const;
 
+  /// \return true if \p Query is legal on the target, or if the target's
+  /// legalization action for \p Query is WidenScalar.
+  bool isLegalorHasWidenScalar(const LegalityQuery &Query) const;
+
   /// \return true if the combine is running prior to legalization, or if \p Ty
   /// is a legal integer constant type on the target.
   bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 6033d80e717d3..85b86b4ed00d6 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -2054,9 +2054,9 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     div_rem_to_divrem, funnel_shift_combines, bitreverse_shift, commute_shift,
     form_bitfield_extract, constant_fold_binops, constant_fold_fma,
     constant_fold_cast_op, fabs_fneg_fold,
-    intdiv_combines, mulh_combines, redundant_neg_operands,
+    mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
-    intrem_combines, sub_add_reg, select_to_minmax,
+    intrem_combines, intdiv_combines, sub_add_reg, select_to_minmax,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
     simplify_neg_minmax, combine_concat_vector, sext_trunc, zext_trunc,
     prefer_sign_combines, shuffle_combines,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 3b11d0848d300..80f1e3f2644ec 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -162,6 +162,11 @@ bool CombinerHelper::isLegalOrBeforeLegalizer(
   return isPreLegalize() || isLegal(Query);
 }
 
+bool CombinerHelper::isLegalorHasWidenScalar(const LegalityQuery &Query) const {
+  return isLegal(Query) ||
+         LI->getAction(Query).Action == LegalizeActions::WidenScalar;
+}
+
 bool CombinerHelper::isConstantLegalOrBeforeLegalizer(const LLT Ty) const {
   if (!Ty.isVector())
     return isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {Ty}});
@@ -5522,6 +5527,8 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
   Register Dst = MI.getOperand(0).getReg();
   Register RHS = MI.getOperand(2).getReg();
   LLT DstTy = MRI.getType(Dst);
+  auto SizeInBits = DstTy.getScalarSizeInBits();
+  LLT WideTy = DstTy.changeElementSize(SizeInBits * 2);
 
   auto &MF = *MI.getMF();
   AttributeList Attr = MF.getFunction().getAttributes();
@@ -5541,8 +5548,21 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
         MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
   }
 
-  // Don't support the general case for now.
-  return false;
+  auto *RHSDef = MRI.getVRegDef(RHS);
+  if (!isConstantOrConstantVector(*RHSDef, MRI))
+    return false;
+
+  // Don't do this if the types are not going to be legal.
+  if (LI) {
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
+      return false;
+    if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) &&
+        !isLegalorHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}}))
+      return false;
+  }
+
+  return matchUnaryPredicate(
+      MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
 }
 
 void CombinerHelper::applySDivByConst(MachineInstr &MI) const {
@@ -5558,21 +5578,22 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
   Register RHS = SDiv.getReg(2);
   LLT Ty = MRI.getType(Dst);
   LLT ScalarTy = Ty.getScalarType();
+  const unsigned EltBits = ScalarTy.getScalarSizeInBits();
   LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
   LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
   auto &MIB = Builder;
 
   bool UseSRA = false;
-  SmallVector<Register, 16> Shifts, Factors;
+  SmallVector<Register, 16> ExactShifts, ExactFactors;
 
-  auto *RHSDef = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
-  bool IsSplat = getIConstantSplatVal(*RHSDef, MRI).has_value();
+  auto *RHSDefInstr = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
+  bool IsSplat = getIConstantSplatVal(*RHSDefInstr, MRI).has_value();
 
-  auto BuildSDIVPattern = [&](const Constant *C) {
+  auto BuildExactSDIVPattern = [&](const Constant *C) {
     // Don't recompute inverses for each splat element.
-    if (IsSplat && !Factors.empty()) {
-      Shifts.push_back(Shifts[0]);
-      Factors.push_back(Factors[0]);
+    if (IsSplat && !ExactFactors.empty()) {
+      ExactShifts.push_back(ExactShifts[0]);
+      ExactFactors.push_back(ExactFactors[0]);
       return true;
     }
 
@@ -5587,31 +5608,104 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
     // Calculate the multiplicative inverse modulo BW.
     // 2^W requires W + 1 bits, so we have to extend and then truncate.
     APInt Factor = Divisor.multiplicativeInverse();
-    Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
-    Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
+    ExactShifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
+    ExactFactors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
     return true;
   };
 
-  // Collect all magic values from the build vector.
+  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+    // Collect all magic values from the build vector.
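+    // Worked example: for an exact i32 sdiv by 6, the single trailing zero
+    // becomes an exact ashr by 1 and the remaining odd factor 3 is inverted
+    // modulo 2^32, giving (x ashr exact 1) * 0xAAAAAAAB, since
+    // 3 * 0xAAAAAAAB == 1 (mod 2^32). No G_SMULH or widening is needed on
+    // this exact path.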
+    bool Matched = matchUnaryPredicate(MRI, RHS, BuildExactSDIVPattern);
+    (void)Matched;
+    assert(Matched && "Expected unary predicate match to succeed");
+
+    Register Shift, Factor;
+    if (Ty.isVector()) {
+      Shift = MIB.buildBuildVector(ShiftAmtTy, ExactShifts).getReg(0);
+      Factor = MIB.buildBuildVector(Ty, ExactFactors).getReg(0);
+    } else {
+      Shift = ExactShifts[0];
+      Factor = ExactFactors[0];
+    }
+
+    Register Res = LHS;
+
+    if (UseSRA)
+      Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+
+    return MIB.buildMul(Ty, Res, Factor);
+  }
+
+  SmallVector<Register, 16> MagicFactors, Factors, Shifts, ShiftMasks;
+
+  auto BuildSDIVPattern = [&](const Constant *C) {
+    auto *CI = cast<ConstantInt>(C);
+    const APInt &Divisor = CI->getValue();
+
+    SignedDivisionByConstantInfo magics =
+        SignedDivisionByConstantInfo::get(Divisor);
+    int NumeratorFactor = 0;
+    int ShiftMask = -1;
+
+    if (Divisor.isOne() || Divisor.isAllOnes()) {
+      // If d is +1/-1, we just multiply the numerator by +1/-1.
+      NumeratorFactor = Divisor.getSExtValue();
+      magics.Magic = 0;
+      magics.ShiftAmount = 0;
+      ShiftMask = 0;
+    } else if (Divisor.isStrictlyPositive() && magics.Magic.isNegative()) {
+      // If d > 0 and m < 0, add the numerator.
+      NumeratorFactor = 1;
+    } else if (Divisor.isNegative() && magics.Magic.isStrictlyPositive()) {
+      // If d < 0 and m > 0, subtract the numerator.
+      NumeratorFactor = -1;
+    }
+
+    MagicFactors.push_back(MIB.buildConstant(ScalarTy, magics.Magic).getReg(0));
+    Factors.push_back(MIB.buildConstant(ScalarTy, NumeratorFactor).getReg(0));
+    Shifts.push_back(
+        MIB.buildConstant(ScalarShiftAmtTy, magics.ShiftAmount).getReg(0));
+    ShiftMasks.push_back(MIB.buildConstant(ScalarTy, ShiftMask).getReg(0));
+
+    return true;
+  };
+
+  // Collect the shifts/magic values from each element.
   bool Matched = matchUnaryPredicate(MRI, RHS, BuildSDIVPattern);
   (void)Matched;
   assert(Matched && "Expected unary predicate match to succeed");
 
-  Register Shift, Factor;
-  if (Ty.isVector()) {
-    Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+  Register MagicFactor, Factor, Shift, ShiftMask;
+  auto *RHSDef = getOpcodeDef<GBuildVector>(RHS, MRI);
+  if (RHSDef) {
+    MagicFactor = MIB.buildBuildVector(Ty, MagicFactors).getReg(0);
     Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
+    Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+    ShiftMask = MIB.buildBuildVector(Ty, ShiftMasks).getReg(0);
   } else {
-    Shift = Shifts[0];
+    assert(MRI.getType(RHS).isScalar() &&
+           "Non-build_vector operation should have been a scalar");
+    MagicFactor = MagicFactors[0];
     Factor = Factors[0];
+    Shift = Shifts[0];
+    ShiftMask = ShiftMasks[0];
   }
 
-  Register Res = LHS;
+  Register Q = LHS;
+  Q = MIB.buildSMulH(Ty, LHS, MagicFactor).getReg(0);
+
+  // (Optionally) Add/subtract the numerator using Factor.
+  Factor = MIB.buildMul(Ty, LHS, Factor).getReg(0);
+  Q = MIB.buildAdd(Ty, Q, Factor).getReg(0);
 
-  if (UseSRA)
-    Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+  // Shift right algebraic by shift value.
+  Q = MIB.buildAShr(Ty, Q, Shift).getReg(0);
 
-  return MIB.buildMul(Ty, Res, Factor);
+  // Extract the sign bit, mask it and add it to the quotient.
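+  // The ashr above rounds a negative quotient toward negative infinity;
+  // adding the (masked) sign bit of Q rounds it back toward zero, matching
+  // sdiv semantics. ShiftMask is -1 except for divisors of +/-1, where it is
+  // 0 and the correction is a no-op. Worked example for i32 x / 7 (standard
+  // signed magic values): Magic = 0x92492493 (negative, so the numerator is
+  // added) and Shift = 2, giving:
+  //   q = smulh(x, 0x92492493) + x;  q >>= 2;  q += (u32)q >> 31;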
+ auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1); + auto T = MIB.buildLShr(Ty, Q, SignShift); + T = MIB.buildAnd(Ty, T, ShiftMask); + return MIB.buildAdd(Ty, Q, T); } bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll new file mode 100644 index 0000000000000..b7dadf711fce1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll @@ -0,0 +1,1663 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel=1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; These tests are taken from the combine-udiv.ll in X86. +define i32 @combine_sdiv_by_one(i32 %x) { +; CHECK-LABEL: combine_sdiv_by_one: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %1 = sdiv i32 %x, 1 + ret i32 %1 +} + +define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_one: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_one: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define i32 @combine_sdiv_by_negone(i32 %x) { +; CHECK-LABEL: combine_sdiv_by_negone: +; CHECK: // %bb.0: +; CHECK-NEXT: neg w0, w0 +; CHECK-NEXT: ret + %1 = sdiv i32 %x, -1 + ret i32 %1 +} + +define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_negone: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: neg v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_negone: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define i32 @combine_sdiv_by_minsigned(i32 %x) { +; CHECK-SD-LABEL: combine_sdiv_by_minsigned: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #-2147483648 // =0x80000000 +; CHECK-SD-NEXT: cmp w0, w8 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_sdiv_by_minsigned: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: sxtw x8, w0 +; CHECK-GI-NEXT: lsl x9, x8, #31 +; CHECK-GI-NEXT: sub x8, x9, x8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: sub w8, w8, w0 +; CHECK-GI-NEXT: asr w8, w8, #30 +; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, -2147483648 + ret i32 %1 +} + +define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_minsigned: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v1.4s, #128, lsl #24 +; CHECK-SD-NEXT: movi v2.4s, #1 +; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_minsigned: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mvni v1.4s, #128, lsl #24 +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s 
+; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #30 +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31 +; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #30 +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define i32 @combine_sdiv_zero(i32 %x) { +; CHECK-LABEL: combine_sdiv_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %1 = sdiv i32 0, %x + ret i32 %1 +} + +define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) { +; CHECK-LABEL: combine_vec_sdiv_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret + %1 = sdiv <4 x i32> zeroinitializer, %x + ret <4 x i32> %1 +} + +define i32 @combine_sdiv_dupe(i32 %x) { +; CHECK-SD-LABEL: combine_sdiv_dupe: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w0, #1 // =0x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_sdiv_dupe: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sdiv w0, w0, w0 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, %x + ret i32 %1 +} + +define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_dupe: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.4s, #1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_dupe: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: sdiv w8, w8, w8 +; CHECK-GI-NEXT: sdiv w9, w9, w9 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w10 +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: sdiv w8, w11, w11 +; CHECK-GI-NEXT: mov v0.s[2], w10 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, %x + ret <4 x i32> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pos0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ushr v0.4s, v0.4s, #2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pos0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.4s, #1 +; CHECK-GI-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: fneg v1.4s, v1.4s +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31 +; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: ret + %1 = and <4 x i32> %x, + %2 = sdiv <4 x i32> %1, + ret <4 x i32> %2 +} + +define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pos1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-SD-NEXT: adrp x8, .LCPI11_0 +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] +; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pos1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: adrp x8, .LCPI11_2 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI11_2] +; CHECK-GI-NEXT: adrp x8, .LCPI11_1 +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_1] +; CHECK-GI-NEXT: adrp x8, .LCPI11_0 +; 
CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: neg v1.4s, v2.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ushr v1.4s, v0.4s, #31 +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ret + %1 = and <4 x i32> %x, + %2 = sdiv <4 x i32> %1, + ret <4 x i32> %2 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2a: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0 +; CHECK-SD-NEXT: usra v0.4s, v1.4s, #30 +; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2a: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.4s, #1 +; CHECK-GI-NEXT: fneg v1.4s, v1.4s +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31 +; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2a_neg: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0 +; CHECK-SD-NEXT: usra v0.4s, v1.4s, #30 +; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #2 +; CHECK-SD-NEXT: neg v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2a_neg: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mvni v1.4s, #128, lsl #24 +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31 +; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v16i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI14_0 +; CHECK-SD-NEXT: cmlt v1.16b, v0.16b, #0 +; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] +; CHECK-SD-NEXT: adrp x8, .LCPI14_1 +; CHECK-SD-NEXT: movi v4.2d, #0xffffffffffffff00 +; CHECK-SD-NEXT: ushl v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI14_1] +; CHECK-SD-NEXT: add v1.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-SD-NEXT: sshl v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI14_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_2] +; CHECK-GI-NEXT: adrp x8, .LCPI14_1 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI14_1] +; CHECK-GI-NEXT: adrp x8, .LCPI14_0 +; CHECK-GI-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: neg v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] +; CHECK-GI-NEXT: sshl v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ushr v1.16b, v0.16b, #7 +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <16 x i8> %x, + ret <16 x i8> %1 +} + +define <8 x i16> 
@combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v8i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI15_1 +; CHECK-SD-NEXT: cmlt v1.8h, v0.8h, #0 +; CHECK-SD-NEXT: adrp x9, .LCPI15_3 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] +; CHECK-SD-NEXT: adrp x8, .LCPI15_2 +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI15_3] +; CHECK-SD-NEXT: ushl v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI15_2] +; CHECK-SD-NEXT: adrp x8, .LCPI15_0 +; CHECK-SD-NEXT: add v1.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: sshl v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v8i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI15_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_2] +; CHECK-GI-NEXT: adrp x8, .LCPI15_1 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] +; CHECK-GI-NEXT: adrp x8, .LCPI15_0 +; CHECK-GI-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: neg v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] +; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ushr v1.8h, v0.8h, #15 +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v16i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI16_1 +; CHECK-SD-NEXT: cmlt v2.8h, v0.8h, #0 +; CHECK-SD-NEXT: cmlt v3.8h, v1.8h, #0 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI16_1] +; CHECK-SD-NEXT: adrp x8, .LCPI16_2 +; CHECK-SD-NEXT: ushl v2.8h, v2.8h, v4.8h +; CHECK-SD-NEXT: ushl v3.8h, v3.8h, v4.8h +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI16_2] +; CHECK-SD-NEXT: adrp x8, .LCPI16_0 +; CHECK-SD-NEXT: ldr q5, [x8, :lo12:.LCPI16_0] +; CHECK-SD-NEXT: adrp x8, .LCPI16_3 +; CHECK-SD-NEXT: add v2.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: add v3.8h, v1.8h, v3.8h +; CHECK-SD-NEXT: and v0.16b, v0.16b, v5.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v5.16b +; CHECK-SD-NEXT: sshl v2.8h, v2.8h, v4.8h +; CHECK-SD-NEXT: sshl v3.8h, v3.8h, v4.8h +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI16_3] +; CHECK-SD-NEXT: and v2.16b, v2.16b, v4.16b +; CHECK-SD-NEXT: and v3.16b, v3.16b, v4.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI16_1 +; CHECK-GI-NEXT: sshr v3.8h, v0.8h, #15 +; CHECK-GI-NEXT: sshr v4.8h, v1.8h, #15 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] +; CHECK-GI-NEXT: adrp x8, .LCPI16_0 +; CHECK-GI-NEXT: ldr d5, [x8, :lo12:.LCPI16_0] +; CHECK-GI-NEXT: adrp x8, .LCPI16_2 +; CHECK-GI-NEXT: neg v2.8h, v2.8h +; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-GI-NEXT: ushl v3.8h, v3.8h, v2.8h +; CHECK-GI-NEXT: ushl v2.8h, v4.8h, v2.8h +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI16_2] +; CHECK-GI-NEXT: shl v5.8h, v5.8h, #15 +; CHECK-GI-NEXT: neg v4.8h, v4.8h +; CHECK-GI-NEXT: add v3.8h, v0.8h, v3.8h +; CHECK-GI-NEXT: add v2.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: sshl v3.8h, v3.8h, v4.8h +; CHECK-GI-NEXT: sshl v2.8h, v2.8h, v4.8h +; CHECK-GI-NEXT: sshr v4.8h, v5.8h, #15 +; 
CHECK-GI-NEXT: bif v0.16b, v3.16b, v4.16b +; CHECK-GI-NEXT: bif v1.16b, v2.16b, v4.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <16 x i16> %x, + ret <16 x i16> %1 +} + +define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v32i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI17_1 +; CHECK-SD-NEXT: cmlt v4.8h, v0.8h, #0 +; CHECK-SD-NEXT: cmlt v5.8h, v1.8h, #0 +; CHECK-SD-NEXT: cmlt v7.8h, v2.8h, #0 +; CHECK-SD-NEXT: cmlt v16.8h, v3.8h, #0 +; CHECK-SD-NEXT: ldr q6, [x8, :lo12:.LCPI17_1] +; CHECK-SD-NEXT: adrp x8, .LCPI17_2 +; CHECK-SD-NEXT: ushl v4.8h, v4.8h, v6.8h +; CHECK-SD-NEXT: ushl v5.8h, v5.8h, v6.8h +; CHECK-SD-NEXT: ushl v7.8h, v7.8h, v6.8h +; CHECK-SD-NEXT: ushl v6.8h, v16.8h, v6.8h +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI17_2] +; CHECK-SD-NEXT: adrp x8, .LCPI17_0 +; CHECK-SD-NEXT: add v4.8h, v0.8h, v4.8h +; CHECK-SD-NEXT: add v5.8h, v1.8h, v5.8h +; CHECK-SD-NEXT: ldr q17, [x8, :lo12:.LCPI17_0] +; CHECK-SD-NEXT: add v7.8h, v2.8h, v7.8h +; CHECK-SD-NEXT: add v6.8h, v3.8h, v6.8h +; CHECK-SD-NEXT: adrp x8, .LCPI17_3 +; CHECK-SD-NEXT: and v0.16b, v0.16b, v17.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v17.16b +; CHECK-SD-NEXT: and v2.16b, v2.16b, v17.16b +; CHECK-SD-NEXT: sshl v4.8h, v4.8h, v16.8h +; CHECK-SD-NEXT: sshl v5.8h, v5.8h, v16.8h +; CHECK-SD-NEXT: and v3.16b, v3.16b, v17.16b +; CHECK-SD-NEXT: sshl v7.8h, v7.8h, v16.8h +; CHECK-SD-NEXT: sshl v6.8h, v6.8h, v16.8h +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI17_3] +; CHECK-SD-NEXT: and v4.16b, v4.16b, v16.16b +; CHECK-SD-NEXT: and v5.16b, v5.16b, v16.16b +; CHECK-SD-NEXT: and v7.16b, v7.16b, v16.16b +; CHECK-SD-NEXT: and v6.16b, v6.16b, v16.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v4.16b +; CHECK-SD-NEXT: orr v1.16b, v1.16b, v5.16b +; CHECK-SD-NEXT: orr v2.16b, v2.16b, v7.16b +; CHECK-SD-NEXT: orr v3.16b, v3.16b, v6.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v32i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI17_1 +; CHECK-GI-NEXT: sshr v5.8h, v0.8h, #15 +; CHECK-GI-NEXT: sshr v6.8h, v1.8h, #15 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI17_1] +; CHECK-GI-NEXT: adrp x8, .LCPI17_0 +; CHECK-GI-NEXT: sshr v7.8h, v2.8h, #15 +; CHECK-GI-NEXT: sshr v16.8h, v3.8h, #15 +; CHECK-GI-NEXT: ldr d17, [x8, :lo12:.LCPI17_0] +; CHECK-GI-NEXT: adrp x8, .LCPI17_2 +; CHECK-GI-NEXT: neg v4.8h, v4.8h +; CHECK-GI-NEXT: ushll v17.8h, v17.8b, #0 +; CHECK-GI-NEXT: ushl v5.8h, v5.8h, v4.8h +; CHECK-GI-NEXT: ushl v6.8h, v6.8h, v4.8h +; CHECK-GI-NEXT: ushl v7.8h, v7.8h, v4.8h +; CHECK-GI-NEXT: ushl v4.8h, v16.8h, v4.8h +; CHECK-GI-NEXT: ldr q16, [x8, :lo12:.LCPI17_2] +; CHECK-GI-NEXT: shl v17.8h, v17.8h, #15 +; CHECK-GI-NEXT: neg v16.8h, v16.8h +; CHECK-GI-NEXT: add v5.8h, v0.8h, v5.8h +; CHECK-GI-NEXT: add v6.8h, v1.8h, v6.8h +; CHECK-GI-NEXT: add v7.8h, v2.8h, v7.8h +; CHECK-GI-NEXT: add v4.8h, v3.8h, v4.8h +; CHECK-GI-NEXT: sshr v17.8h, v17.8h, #15 +; CHECK-GI-NEXT: sshl v5.8h, v5.8h, v16.8h +; CHECK-GI-NEXT: sshl v6.8h, v6.8h, v16.8h +; CHECK-GI-NEXT: sshl v7.8h, v7.8h, v16.8h +; CHECK-GI-NEXT: sshl v4.8h, v4.8h, v16.8h +; CHECK-GI-NEXT: bif v0.16b, v5.16b, v17.16b +; CHECK-GI-NEXT: bif v1.16b, v6.16b, v17.16b +; CHECK-GI-NEXT: bif v2.16b, v7.16b, v17.16b +; CHECK-GI-NEXT: bif v3.16b, v4.16b, v17.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <32 x i16> %x, + ret <32 x i16> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI18_0 
+; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] +; CHECK-SD-NEXT: adrp x8, .LCPI18_1 +; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] +; CHECK-SD-NEXT: adrp x8, .LCPI18_2 +; CHECK-SD-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI18_2] +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI18_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI18_2] +; CHECK-GI-NEXT: adrp x8, .LCPI18_1 +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] +; CHECK-GI-NEXT: adrp x8, .LCPI18_0 +; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: neg v1.4s, v2.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ushr v1.4s, v0.4s, #31 +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v8i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI19_0 +; CHECK-SD-NEXT: cmlt v2.4s, v0.4s, #0 +; CHECK-SD-NEXT: cmlt v3.4s, v1.4s, #0 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI19_0] +; CHECK-SD-NEXT: adrp x8, .LCPI19_1 +; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v4.4s +; CHECK-SD-NEXT: ushl v3.4s, v3.4s, v4.4s +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI19_1] +; CHECK-SD-NEXT: adrp x8, .LCPI19_2 +; CHECK-SD-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-SD-NEXT: sshl v2.4s, v2.4s, v4.4s +; CHECK-SD-NEXT: sshl v3.4s, v3.4s, v4.4s +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI19_2] +; CHECK-SD-NEXT: bif v0.16b, v2.16b, v4.16b +; CHECK-SD-NEXT: bif v1.16b, v3.16b, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v8i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: sshr v4.4s, v0.4s, #31 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI19_0 +; CHECK-GI-NEXT: sshr v5.4s, v1.4s, #31 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI19_0] +; CHECK-GI-NEXT: adrp x8, .LCPI19_1 +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: ushl v4.4s, v4.4s, v3.4s +; CHECK-GI-NEXT: ushl v3.4s, v5.4s, v3.4s +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI19_1] +; CHECK-GI-NEXT: mov v2.h[2], w9 +; CHECK-GI-NEXT: neg v5.4s, v5.4s +; CHECK-GI-NEXT: add v4.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: mov v2.h[3], w9 +; CHECK-GI-NEXT: sshl v4.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: sshl v3.4s, v3.4s, v5.4s +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: shl v2.4s, v2.4s, #31 +; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v4.16b, v2.16b +; CHECK-GI-NEXT: bif v1.16b, v3.16b, v2.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i32> %x, + ret <8 x i32> %1 +} + +define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v16i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI20_0 +; CHECK-SD-NEXT: cmlt v4.4s, v0.4s, #0 +; CHECK-SD-NEXT: cmlt v5.4s, v1.4s, #0 +; CHECK-SD-NEXT: cmlt v7.4s, v2.4s, #0 +; CHECK-SD-NEXT: cmlt 
v16.4s, v3.4s, #0 +; CHECK-SD-NEXT: ldr q6, [x8, :lo12:.LCPI20_0] +; CHECK-SD-NEXT: adrp x8, .LCPI20_1 +; CHECK-SD-NEXT: ushl v4.4s, v4.4s, v6.4s +; CHECK-SD-NEXT: ushl v5.4s, v5.4s, v6.4s +; CHECK-SD-NEXT: ushl v7.4s, v7.4s, v6.4s +; CHECK-SD-NEXT: ushl v6.4s, v16.4s, v6.4s +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI20_1] +; CHECK-SD-NEXT: adrp x8, .LCPI20_2 +; CHECK-SD-NEXT: add v4.4s, v0.4s, v4.4s +; CHECK-SD-NEXT: add v5.4s, v1.4s, v5.4s +; CHECK-SD-NEXT: add v7.4s, v2.4s, v7.4s +; CHECK-SD-NEXT: add v6.4s, v3.4s, v6.4s +; CHECK-SD-NEXT: sshl v4.4s, v4.4s, v16.4s +; CHECK-SD-NEXT: sshl v5.4s, v5.4s, v16.4s +; CHECK-SD-NEXT: sshl v7.4s, v7.4s, v16.4s +; CHECK-SD-NEXT: sshl v6.4s, v6.4s, v16.4s +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI20_2] +; CHECK-SD-NEXT: bif v0.16b, v4.16b, v16.16b +; CHECK-SD-NEXT: bif v1.16b, v5.16b, v16.16b +; CHECK-SD-NEXT: bif v2.16b, v7.16b, v16.16b +; CHECK-SD-NEXT: bif v3.16b, v6.16b, v16.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: sshr v6.4s, v0.4s, #31 +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI20_0 +; CHECK-GI-NEXT: sshr v7.4s, v1.4s, #31 +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI20_0] +; CHECK-GI-NEXT: sshr v16.4s, v2.4s, #31 +; CHECK-GI-NEXT: sshr v17.4s, v3.4s, #31 +; CHECK-GI-NEXT: adrp x8, .LCPI20_1 +; CHECK-GI-NEXT: mov v4.h[1], w9 +; CHECK-GI-NEXT: neg v5.4s, v5.4s +; CHECK-GI-NEXT: ushl v6.4s, v6.4s, v5.4s +; CHECK-GI-NEXT: ushl v7.4s, v7.4s, v5.4s +; CHECK-GI-NEXT: ushl v16.4s, v16.4s, v5.4s +; CHECK-GI-NEXT: mov v4.h[2], w9 +; CHECK-GI-NEXT: ushl v5.4s, v17.4s, v5.4s +; CHECK-GI-NEXT: ldr q17, [x8, :lo12:.LCPI20_1] +; CHECK-GI-NEXT: neg v17.4s, v17.4s +; CHECK-GI-NEXT: add v6.4s, v0.4s, v6.4s +; CHECK-GI-NEXT: add v7.4s, v1.4s, v7.4s +; CHECK-GI-NEXT: add v16.4s, v2.4s, v16.4s +; CHECK-GI-NEXT: add v5.4s, v3.4s, v5.4s +; CHECK-GI-NEXT: mov v4.h[3], w9 +; CHECK-GI-NEXT: sshl v6.4s, v6.4s, v17.4s +; CHECK-GI-NEXT: sshl v7.4s, v7.4s, v17.4s +; CHECK-GI-NEXT: sshl v16.4s, v16.4s, v17.4s +; CHECK-GI-NEXT: sshl v5.4s, v5.4s, v17.4s +; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: shl v4.4s, v4.4s, #31 +; CHECK-GI-NEXT: sshr v4.4s, v4.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v6.16b, v4.16b +; CHECK-GI-NEXT: bif v1.16b, v7.16b, v4.16b +; CHECK-GI-NEXT: bif v2.16b, v16.16b, v4.16b +; CHECK-GI-NEXT: bif v3.16b, v5.16b, v4.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <16 x i32> %x, + ret <16 x i32> %1 +} + +define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v2i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI21_0 +; CHECK-SD-NEXT: cmlt v1.2d, v0.2d, #0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI21_0] +; CHECK-SD-NEXT: adrp x8, .LCPI21_1 +; CHECK-SD-NEXT: ushl v1.2d, v1.2d, v2.2d +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI21_1] +; CHECK-SD-NEXT: adrp x8, .LCPI21_2 +; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: sshl v1.2d, v1.2d, v2.2d +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI21_2] +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v2i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI21_1 +; CHECK-GI-NEXT: sshr v2.2d, v0.2d, #63 +; CHECK-GI-NEXT: adrp x9, .LCPI21_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI21_1] +; CHECK-GI-NEXT: adrp x8, .LCPI21_2 +; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI21_0] +; CHECK-GI-NEXT: neg v1.2d, v1.2d +; 
CHECK-GI-NEXT: shl v3.2d, v3.2d, #63 +; CHECK-GI-NEXT: ushl v1.2d, v2.2d, v1.2d +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI21_2] +; CHECK-GI-NEXT: neg v2.2d, v2.2d +; CHECK-GI-NEXT: add v1.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: sshl v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: sshr v2.2d, v3.2d, #63 +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <2 x i64> %x, + ret <2 x i64> %1 +} + +define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v4i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI22_0 +; CHECK-SD-NEXT: cmlt v2.2d, v0.2d, #0 +; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI22_0] +; CHECK-SD-NEXT: adrp x8, .LCPI22_3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] +; CHECK-SD-NEXT: adrp x8, .LCPI22_1 +; CHECK-SD-NEXT: ushl v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: cmlt v3.2d, v1.2d, #0 +; CHECK-SD-NEXT: add v2.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: ushl v3.2d, v3.2d, v4.2d +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI22_1] +; CHECK-SD-NEXT: adrp x8, .LCPI22_2 +; CHECK-SD-NEXT: sshl v2.2d, v2.2d, v4.2d +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI22_2] +; CHECK-SD-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-SD-NEXT: adrp x8, .LCPI22_4 +; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI22_4] +; CHECK-SD-NEXT: bif v0.16b, v2.16b, v4.16b +; CHECK-SD-NEXT: sshl v1.2d, v1.2d, v3.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v4i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI22_2 +; CHECK-GI-NEXT: sshr v3.2d, v0.2d, #63 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI22_2] +; CHECK-GI-NEXT: adrp x8, .LCPI22_1 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI22_1] +; CHECK-GI-NEXT: adrp x8, .LCPI22_4 +; CHECK-GI-NEXT: neg v2.2d, v2.2d +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI22_4] +; CHECK-GI-NEXT: adrp x8, .LCPI22_0 +; CHECK-GI-NEXT: neg v4.2d, v4.2d +; CHECK-GI-NEXT: ldr q6, [x8, :lo12:.LCPI22_0] +; CHECK-GI-NEXT: adrp x8, .LCPI22_3 +; CHECK-GI-NEXT: neg v5.2d, v5.2d +; CHECK-GI-NEXT: ushl v2.2d, v3.2d, v2.2d +; CHECK-GI-NEXT: sshr v3.2d, v1.2d, #63 +; CHECK-GI-NEXT: shl v6.2d, v6.2d, #63 +; CHECK-GI-NEXT: add v2.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: ushl v3.2d, v3.2d, v4.2d +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] +; CHECK-GI-NEXT: sshl v2.2d, v2.2d, v5.2d +; CHECK-GI-NEXT: sshr v5.2d, v6.2d, #63 +; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: neg v3.2d, v4.2d +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v5.16b +; CHECK-GI-NEXT: sshl v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i64> %x, + ret <4 x i64> %1 +} + +define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v8i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI23_0 +; CHECK-SD-NEXT: cmlt v4.2d, v0.2d, #0 +; CHECK-SD-NEXT: cmlt v6.2d, v2.2d, #0 +; CHECK-SD-NEXT: ldr q5, [x8, :lo12:.LCPI23_0] +; CHECK-SD-NEXT: adrp x8, .LCPI23_3 +; CHECK-SD-NEXT: cmlt v7.2d, v3.2d, #0 +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI23_3] +; CHECK-SD-NEXT: adrp x8, .LCPI23_1 +; CHECK-SD-NEXT: ushl v4.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: ushl v5.2d, v6.2d, v5.2d +; CHECK-SD-NEXT: cmlt v6.2d, v1.2d, #0 +; CHECK-SD-NEXT: ldr q17, [x8, :lo12:.LCPI23_1] +; CHECK-SD-NEXT: ushl v7.2d, v7.2d, v16.2d +; CHECK-SD-NEXT: adrp x8, .LCPI23_2 +; CHECK-SD-NEXT: add v4.2d, v0.2d, v4.2d +; CHECK-SD-NEXT: add v5.2d, v2.2d, v5.2d +; CHECK-SD-NEXT: ushl v6.2d, v6.2d, v16.2d +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI23_2] +; CHECK-SD-NEXT: adrp x8, .LCPI23_4 +; CHECK-SD-NEXT: add v3.2d, v3.2d, v7.2d +; CHECK-SD-NEXT: 
sshl v4.2d, v4.2d, v17.2d +; CHECK-SD-NEXT: sshl v5.2d, v5.2d, v17.2d +; CHECK-SD-NEXT: add v1.2d, v1.2d, v6.2d +; CHECK-SD-NEXT: ldr q6, [x8, :lo12:.LCPI23_4] +; CHECK-SD-NEXT: bif v0.16b, v4.16b, v16.16b +; CHECK-SD-NEXT: bif v2.16b, v5.16b, v16.16b +; CHECK-SD-NEXT: sshl v1.2d, v1.2d, v6.2d +; CHECK-SD-NEXT: sshl v3.2d, v3.2d, v6.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v8i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: sshr v7.2d, v0.2d, #63 +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI23_1 +; CHECK-GI-NEXT: sshr v16.2d, v1.2d, #63 +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI23_1] +; CHECK-GI-NEXT: sshr v17.2d, v2.2d, #63 +; CHECK-GI-NEXT: sshr v18.2d, v3.2d, #63 +; CHECK-GI-NEXT: adrp x8, .LCPI23_3 +; CHECK-GI-NEXT: mov v4.h[1], w9 +; CHECK-GI-NEXT: neg v5.2d, v5.2d +; CHECK-GI-NEXT: ldr q19, [x8, :lo12:.LCPI23_3] +; CHECK-GI-NEXT: neg v19.2d, v19.2d +; CHECK-GI-NEXT: ushl v7.2d, v7.2d, v5.2d +; CHECK-GI-NEXT: ushl v5.2d, v17.2d, v5.2d +; CHECK-GI-NEXT: mov v4.h[2], w9 +; CHECK-GI-NEXT: add v7.2d, v0.2d, v7.2d +; CHECK-GI-NEXT: add v5.2d, v2.2d, v5.2d +; CHECK-GI-NEXT: mov v4.h[3], w9 +; CHECK-GI-NEXT: adrp x9, .LCPI23_0 +; CHECK-GI-NEXT: ldr q6, [x9, :lo12:.LCPI23_0] +; CHECK-GI-NEXT: adrp x9, .LCPI23_2 +; CHECK-GI-NEXT: sshl v7.2d, v7.2d, v19.2d +; CHECK-GI-NEXT: ldr q20, [x9, :lo12:.LCPI23_2] +; CHECK-GI-NEXT: sshl v5.2d, v5.2d, v19.2d +; CHECK-GI-NEXT: neg v6.2d, v6.2d +; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: neg v20.2d, v20.2d +; CHECK-GI-NEXT: ushl v16.2d, v16.2d, v6.2d +; CHECK-GI-NEXT: ushl v6.2d, v18.2d, v6.2d +; CHECK-GI-NEXT: ushll v17.2d, v4.2s, #0 +; CHECK-GI-NEXT: ushll2 v18.2d, v4.4s, #0 +; CHECK-GI-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-GI-NEXT: add v16.2d, v1.2d, v16.2d +; CHECK-GI-NEXT: add v6.2d, v3.2d, v6.2d +; CHECK-GI-NEXT: shl v17.2d, v17.2d, #63 +; CHECK-GI-NEXT: shl v18.2d, v18.2d, #63 +; CHECK-GI-NEXT: shl v4.2d, v4.2d, #63 +; CHECK-GI-NEXT: sshl v16.2d, v16.2d, v20.2d +; CHECK-GI-NEXT: sshl v6.2d, v6.2d, v20.2d +; CHECK-GI-NEXT: sshr v17.2d, v17.2d, #63 +; CHECK-GI-NEXT: sshr v18.2d, v18.2d, #63 +; CHECK-GI-NEXT: sshr v4.2d, v4.2d, #63 +; CHECK-GI-NEXT: bif v0.16b, v7.16b, v17.16b +; CHECK-GI-NEXT: bif v1.16b, v16.16b, v18.16b +; CHECK-GI-NEXT: bif v2.16b, v5.16b, v4.16b +; CHECK-GI-NEXT: bif v3.16b, v6.16b, v18.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i64> %x, + ret <8 x i64> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI24_0 +; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI24_0] +; CHECK-SD-NEXT: adrp x8, .LCPI24_1 +; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI24_1] +; CHECK-SD-NEXT: adrp x8, .LCPI24_2 +; CHECK-SD-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI24_2] +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: movi v1.2d, #0xffffffff00000000 +; CHECK-SD-NEXT: neg v2.4s, v0.4s +; CHECK-SD-NEXT: bit v0.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI24_3 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI24_3] +; CHECK-GI-NEXT: adrp x8, .LCPI24_2 +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, 
v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI24_2] +; CHECK-GI-NEXT: adrp x8, .LCPI24_1 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI24_1] +; CHECK-GI-NEXT: adrp x8, .LCPI24_0 +; CHECK-GI-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: neg v0.4s, v3.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI24_0] +; CHECK-GI-NEXT: sshl v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: ushr v1.4s, v0.4s, #31 +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_undef1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_undef1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w8, #-4 // =0xfffffffc +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov w12, #-16 // =0xfffffff0 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w8, w10, w8 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: sdiv w8, w11, w12 +; CHECK-GI-NEXT: mov v0.s[2], w10 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_undef2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_undef2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w8, #4 // =0x4 +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov w12, #16 // =0x10 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w8, w10, w8 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: sdiv w8, w11, w12 +; CHECK-GI-NEXT: mov v0.s[2], w10 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_undef3: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_undef3: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w8, #-4 // =0xfffffffc +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov w12, #16 // =0x10 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w8, w10, w8 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: sdiv w8, w11, w12 +; CHECK-GI-NEXT: mov v0.s[2], w10 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +; PR37119 +define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) { +; CHECK-SD-LABEL: non_splat_minus_one_divisor_0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umov w9, v0.b[0] +; CHECK-SD-NEXT: mov w8, wzr +; CHECK-SD-NEXT: umov w10, v0.b[1] +; CHECK-SD-NEXT: sub w9, w8, w9, sxtb +; CHECK-SD-NEXT: sub w10, w8, w10, sxtb +; CHECK-SD-NEXT: fmov s1, w9 +; CHECK-SD-NEXT: smov w9, v0.b[2] +; CHECK-SD-NEXT: mov v1.b[1], w10 +; CHECK-SD-NEXT: umov w10, v0.b[3] +; CHECK-SD-NEXT: mov v1.b[2], w9 +; CHECK-SD-NEXT: sub w9, w8, w10, 
sxtb +; CHECK-SD-NEXT: umov w10, v0.b[4] +; CHECK-SD-NEXT: mov v1.b[3], w9 +; CHECK-SD-NEXT: sub w9, w8, w10, sxtb +; CHECK-SD-NEXT: umov w10, v0.b[5] +; CHECK-SD-NEXT: mov v1.b[4], w9 +; CHECK-SD-NEXT: sub w9, w8, w10, sxtb +; CHECK-SD-NEXT: umov w10, v0.b[7] +; CHECK-SD-NEXT: mov v1.b[5], w9 +; CHECK-SD-NEXT: smov w9, v0.b[6] +; CHECK-SD-NEXT: mov v1.b[6], w9 +; CHECK-SD-NEXT: sub w9, w8, w10, sxtb +; CHECK-SD-NEXT: umov w10, v0.b[8] +; CHECK-SD-NEXT: mov v1.b[7], w9 +; CHECK-SD-NEXT: sub w8, w8, w10, sxtb +; CHECK-SD-NEXT: mov v1.b[8], w8 +; CHECK-SD-NEXT: smov w8, v0.b[9] +; CHECK-SD-NEXT: mov v1.b[9], w8 +; CHECK-SD-NEXT: smov w8, v0.b[10] +; CHECK-SD-NEXT: mov v1.b[10], w8 +; CHECK-SD-NEXT: smov w8, v0.b[11] +; CHECK-SD-NEXT: mov v1.b[11], w8 +; CHECK-SD-NEXT: smov w8, v0.b[12] +; CHECK-SD-NEXT: mov v1.b[12], w8 +; CHECK-SD-NEXT: smov w8, v0.b[13] +; CHECK-SD-NEXT: mov v1.b[13], w8 +; CHECK-SD-NEXT: smov w8, v0.b[14] +; CHECK-SD-NEXT: mov v1.b[14], w8 +; CHECK-SD-NEXT: smov w8, v0.b[15] +; CHECK-SD-NEXT: mov v1.b[15], w8 +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: non_splat_minus_one_divisor_0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-NEXT: adrp x8, .LCPI28_0 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI28_0] +; CHECK-GI-NEXT: mla v1.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: ret + %div = sdiv <16 x i8> %A, + ret <16 x i8> %div +} + +define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { +; CHECK-SD-LABEL: non_splat_minus_one_divisor_1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI29_1 +; CHECK-SD-NEXT: cmlt v1.16b, v0.16b, #0 +; CHECK-SD-NEXT: adrp x9, .LCPI29_3 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI29_1] +; CHECK-SD-NEXT: adrp x8, .LCPI29_2 +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI29_3] +; CHECK-SD-NEXT: adrp x9, .LCPI29_5 +; CHECK-SD-NEXT: ushl v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI29_2] +; CHECK-SD-NEXT: adrp x8, .LCPI29_0 +; CHECK-SD-NEXT: add v1.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: sshl v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI29_0] +; CHECK-SD-NEXT: adrp x8, .LCPI29_4 +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI29_4] +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI29_5] +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: neg v1.16b, v0.16b +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: non_splat_minus_one_divisor_1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI29_3 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI29_3] +; CHECK-GI-NEXT: adrp x8, .LCPI29_2 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI29_2] +; CHECK-GI-NEXT: adrp x8, .LCPI29_1 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI29_1] +; CHECK-GI-NEXT: adrp x8, .LCPI29_0 +; CHECK-GI-NEXT: mla v1.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: neg v0.16b, v3.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI29_0] +; CHECK-GI-NEXT: sshl v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: ushr v1.16b, v0.16b, #7 +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add 
v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret + %div = sdiv <16 x i8> %A, + ret <16 x i8> %div +} + +define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) { +; CHECK-SD-LABEL: non_splat_minus_one_divisor_2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI30_0 +; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI30_0] +; CHECK-SD-NEXT: adrp x8, .LCPI30_1 +; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI30_1] +; CHECK-SD-NEXT: adrp x8, .LCPI30_2 +; CHECK-SD-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI30_2] +; CHECK-SD-NEXT: adrp x8, .LCPI30_3 +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI30_3] +; CHECK-SD-NEXT: neg v1.4s, v0.4s +; CHECK-SD-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: non_splat_minus_one_divisor_2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI30_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI30_2] +; CHECK-GI-NEXT: adrp x8, .LCPI30_1 +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI30_1] +; CHECK-GI-NEXT: adrp x8, .LCPI30_0 +; CHECK-GI-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI30_0] +; CHECK-GI-NEXT: ushr v0.4s, v1.4s, #31 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: ret + %div = sdiv <4 x i32> %A, + ret <4 x i32> %div +} + +define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) { +; CHECK-LABEL: combine_vec_sdiv_nonuniform: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI31_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI31_0] +; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI32_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI32_0] +; CHECK-SD-NEXT: adrp x8, .LCPI32_1 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI32_1] +; CHECK-SD-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI32_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI32_1] +; CHECK-GI-NEXT: adrp x8, .LCPI32_0 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI32_0] +; CHECK-GI-NEXT: neg v1.8h, v1.8h +; CHECK-GI-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform3: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI33_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI33_0] +; CHECK-SD-NEXT: adrp x8, .LCPI33_1 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; 
CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI33_1] +; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform3: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI33_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI33_1] +; CHECK-GI-NEXT: adrp x8, .LCPI33_0 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0] +; CHECK-GI-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: neg v1.8h, v2.8h +; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI34_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI34_0] +; CHECK-SD-NEXT: adrp x8, .LCPI34_1 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI34_1] +; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI34_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI34_1] +; CHECK-GI-NEXT: adrp x8, .LCPI34_0 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] +; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: neg v1.8h, v2.8h +; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform5: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI35_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI35_0] +; CHECK-SD-NEXT: adrp x8, .LCPI35_1 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI35_1] +; CHECK-SD-NEXT: adrp x8, .LCPI35_2 +; CHECK-SD-NEXT: mla v1.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI35_2] +; CHECK-SD-NEXT: sshl v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform5: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI35_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI35_2] +; CHECK-GI-NEXT: adrp x8, .LCPI35_1 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_1] +; CHECK-GI-NEXT: adrp x8, .LCPI35_0 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI35_0] +; CHECK-GI-NEXT: mla v1.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: neg v0.8h, v3.8h +; CHECK-GI-NEXT: sshl v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform6: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI36_0 +; 
CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI36_0] +; CHECK-SD-NEXT: adrp x8, .LCPI36_1 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI36_1] +; CHECK-SD-NEXT: adrp x8, .LCPI36_2 +; CHECK-SD-NEXT: mla v1.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI36_2] +; CHECK-SD-NEXT: adrp x8, .LCPI36_3 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI36_3] +; CHECK-SD-NEXT: sshl v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: ushr v1.8h, v0.8h, #15 +; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform6: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI36_3 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_3] +; CHECK-GI-NEXT: adrp x8, .LCPI36_2 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_2] +; CHECK-GI-NEXT: adrp x8, .LCPI36_1 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI36_1] +; CHECK-GI-NEXT: adrp x8, .LCPI36_0 +; CHECK-GI-NEXT: mla v1.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: neg v0.8h, v3.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0] +; CHECK-GI-NEXT: sshl v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: ushr v1.8h, v0.8h, #15 +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform7: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umov w9, v0.h[0] +; CHECK-SD-NEXT: mov w8, wzr +; CHECK-SD-NEXT: umov w10, v0.h[1] +; CHECK-SD-NEXT: umov w11, v0.h[2] +; CHECK-SD-NEXT: sub w9, w8, w9, sxth +; CHECK-SD-NEXT: sub w10, w8, w10, sxth +; CHECK-SD-NEXT: fmov s1, w9 +; CHECK-SD-NEXT: sub w9, w8, w11, sxth +; CHECK-SD-NEXT: mov v1.h[1], w10 +; CHECK-SD-NEXT: umov w10, v0.h[3] +; CHECK-SD-NEXT: mov v1.h[2], w9 +; CHECK-SD-NEXT: sub w8, w8, w10, sxth +; CHECK-SD-NEXT: mov v1.h[3], w8 +; CHECK-SD-NEXT: smov w8, v0.h[4] +; CHECK-SD-NEXT: mov v1.h[4], w8 +; CHECK-SD-NEXT: smov w8, v0.h[5] +; CHECK-SD-NEXT: mov v1.h[5], w8 +; CHECK-SD-NEXT: smov w8, v0.h[6] +; CHECK-SD-NEXT: mov v1.h[6], w8 +; CHECK-SD-NEXT: smov w8, v0.h[7] +; CHECK-SD-NEXT: mov v1.h[7], w8 +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform7: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-NEXT: adrp x8, .LCPI37_0 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI37_0] +; CHECK-GI-NEXT: mla v1.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <16 x i8> @pr38658(<16 x i8> %x) { +; CHECK-SD-LABEL: pr38658: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI38_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI38_0] +; CHECK-SD-NEXT: adrp x8, .LCPI38_1 +; CHECK-SD-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI38_1] +; CHECK-SD-NEXT: sshl v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 +; CHECK-SD-NEXT: ushr v2.16b, v0.16b, #7 +; 
CHECK-SD-NEXT: mov v1.b[15], v2.b[15] +; CHECK-SD-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pr38658: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI38_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI38_2] +; CHECK-GI-NEXT: adrp x8, .LCPI38_1 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI38_1] +; CHECK-GI-NEXT: adrp x8, .LCPI38_0 +; CHECK-GI-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: neg v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI38_0] +; CHECK-GI-NEXT: sshl v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ushr v1.16b, v0.16b, #7 +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <16 x i8> %x, + ret <16 x i8> %1 +} + +define i1 @bool_sdiv(i1 %x, i1 %y) { +; CHECK-SD-LABEL: bool_sdiv: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w0, w0, #0x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bool_sdiv: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sbfx w8, w0, #0, #1 +; CHECK-GI-NEXT: sbfx w9, w1, #0, #1 +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: and w0, w8, #0x1 +; CHECK-GI-NEXT: ret + %r = sdiv i1 %x, %y + ret i1 %r +} + +define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) { +; CHECK-SD-LABEL: boolvec_sdiv: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: boolvec_sdiv: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[0] +; CHECK-GI-NEXT: umov w10, v1.h[1] +; CHECK-GI-NEXT: umov w11, v1.h[2] +; CHECK-GI-NEXT: umov w12, v1.h[3] +; CHECK-GI-NEXT: sbfx w8, w8, #0, #1 +; CHECK-GI-NEXT: sbfx w9, w9, #0, #1 +; CHECK-GI-NEXT: sbfx w10, w10, #0, #1 +; CHECK-GI-NEXT: sbfx w11, w11, #0, #1 +; CHECK-GI-NEXT: sbfx w12, w12, #0, #1 +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: sbfx w9, w9, #0, #1 +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: sbfx w10, w10, #0, #1 +; CHECK-GI-NEXT: sdiv w10, w10, w11 +; CHECK-GI-NEXT: umov w11, v0.h[3] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: sbfx w11, w11, #0, #1 +; CHECK-GI-NEXT: sdiv w8, w11, w12 +; CHECK-GI-NEXT: mov v0.h[2], w10 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %r = sdiv <4 x i1> %x, %y + ret <4 x i1> %r +} + +define i32 @combine_sdiv_two(i32 %x) { +; CHECK-SD-LABEL: combine_sdiv_two: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, w0, lsr #31 +; CHECK-SD-NEXT: asr w0, w8, #1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_sdiv_two: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: sxtw x8, w0 +; CHECK-GI-NEXT: sub x8, x8, x8, lsl #31 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: add w8, w8, w0 +; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, 2 + ret i32 %1 +} + +define i32 @combine_sdiv_negtwo(i32 %x) { +; CHECK-SD-LABEL: combine_sdiv_negtwo: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, w0, lsr #31 +; CHECK-SD-NEXT: neg w0, w8, asr #1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_sdiv_negtwo: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: sxtw x8, w0 +; CHECK-GI-NEXT: lsl x9, x8, #31 +; 
CHECK-GI-NEXT: sub x8, x9, x8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: sub w8, w8, w0 +; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, -2 + ret i32 %1 +} + +define i8 @combine_i8_sdiv_pow2(i8 %x) { +; CHECK-SD-LABEL: combine_i8_sdiv_pow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxtb w8, w0 +; CHECK-SD-NEXT: ubfx w8, w8, #11, #4 +; CHECK-SD-NEXT: add w8, w0, w8 +; CHECK-SD-NEXT: sbfx w0, w8, #4, #4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i8_sdiv_pow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: sub w8, w8, w8, lsl #7 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: add w8, w0, w8, asr #8 +; CHECK-GI-NEXT: sbfx w8, w8, #3, #5 +; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret + %1 = sdiv i8 %x, 16 + ret i8 %1 +} + +define i8 @combine_i8_sdiv_negpow2(i8 %x) { +; CHECK-SD-LABEL: combine_i8_sdiv_negpow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxtb w8, w0 +; CHECK-SD-NEXT: ubfx w8, w8, #9, #6 +; CHECK-SD-NEXT: add w8, w0, w8 +; CHECK-SD-NEXT: sxtb w8, w8 +; CHECK-SD-NEXT: neg w0, w8, asr #6 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i8_sdiv_negpow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: lsl w9, w8, #7 +; CHECK-GI-NEXT: sub w8, w9, w8 +; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 +; CHECK-GI-NEXT: sub w8, w8, w0 +; CHECK-GI-NEXT: sbfx w8, w8, #5, #3 +; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret + %1 = sdiv i8 %x, -64 + ret i8 %1 +} + +define i16 @combine_i16_sdiv_pow2(i16 %x) { +; CHECK-SD-LABEL: combine_i16_sdiv_pow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxth w8, w0 +; CHECK-SD-NEXT: ubfx w8, w8, #27, #4 +; CHECK-SD-NEXT: add w8, w0, w8 +; CHECK-SD-NEXT: sbfx w0, w8, #4, #12 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i16_sdiv_pow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: sub w8, w8, w8, lsl #15 +; CHECK-GI-NEXT: add w8, w0, w8, asr #16 +; CHECK-GI-NEXT: sbfx w8, w8, #3, #13 +; CHECK-GI-NEXT: ubfx w9, w8, #15, #1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret + %1 = sdiv i16 %x, 16 + ret i16 %1 +} + +define i16 @combine_i16_sdiv_negpow2(i16 %x) { +; CHECK-SD-LABEL: combine_i16_sdiv_negpow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxth w8, w0 +; CHECK-SD-NEXT: lsr w8, w8, #23 +; CHECK-SD-NEXT: add w8, w0, w8, uxtb +; CHECK-SD-NEXT: sxth w8, w8 +; CHECK-SD-NEXT: neg w0, w8, asr #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i16_sdiv_negpow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: lsl w9, w8, #15 +; CHECK-GI-NEXT: sub w8, w9, w8 +; CHECK-GI-NEXT: asr w8, w8, #16 +; CHECK-GI-NEXT: sub w8, w8, w0 +; CHECK-GI-NEXT: sbfx w8, w8, #7, #9 +; CHECK-GI-NEXT: ubfx w9, w8, #15, #1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret + %1 = sdiv i16 %x, -256 + ret i16 %1 +} + +define i32 @combine_i32_sdiv_pow2(i32 %x) { +; CHECK-SD-LABEL: combine_i32_sdiv_pow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, #15 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: csel w8, w8, w0, lt +; CHECK-SD-NEXT: asr w0, w8, #4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i32_sdiv_pow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: sxtw x8, w0 +; CHECK-GI-NEXT: sub x8, x8, x8, lsl #31 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: add w8, w8, w0 +; CHECK-GI-NEXT: asr w8, w8, #3 +; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, 16 + 
ret i32 %1 +} + +define i32 @combine_i32_sdiv_negpow2(i32 %x) { +; CHECK-SD-LABEL: combine_i32_sdiv_negpow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, #255 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: csel w8, w8, w0, lt +; CHECK-SD-NEXT: neg w0, w8, asr #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i32_sdiv_negpow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: sxtw x8, w0 +; CHECK-GI-NEXT: lsl x9, x8, #31 +; CHECK-GI-NEXT: sub x8, x9, x8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: sub w8, w8, w0 +; CHECK-GI-NEXT: asr w8, w8, #7 +; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, -256 + ret i32 %1 +} + +define i64 @combine_i64_sdiv_pow2(i64 %x) { +; CHECK-SD-LABEL: combine_i64_sdiv_pow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add x8, x0, #15 +; CHECK-SD-NEXT: cmp x0, #0 +; CHECK-SD-NEXT: csel x8, x8, x0, lt +; CHECK-SD-NEXT: asr x0, x8, #4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i64_sdiv_pow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov x8, #-9223372036854775807 // =0x8000000000000001 +; CHECK-GI-NEXT: smulh x8, x0, x8 +; CHECK-GI-NEXT: add x8, x8, x0 +; CHECK-GI-NEXT: asr x8, x8, #3 +; CHECK-GI-NEXT: add x0, x8, x8, lsr #63 +; CHECK-GI-NEXT: ret + %1 = sdiv i64 %x, 16 + ret i64 %1 +} + +define i64 @combine_i64_sdiv_negpow2(i64 %x) { +; CHECK-SD-LABEL: combine_i64_sdiv_negpow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add x8, x0, #255 +; CHECK-SD-NEXT: cmp x0, #0 +; CHECK-SD-NEXT: csel x8, x8, x0, lt +; CHECK-SD-NEXT: neg x0, x8, asr #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i64_sdiv_negpow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov x8, #9223372036854775807 // =0x7fffffffffffffff +; CHECK-GI-NEXT: smulh x8, x0, x8 +; CHECK-GI-NEXT: sub x8, x8, x0 +; CHECK-GI-NEXT: asr x8, x8, #7 +; CHECK-GI-NEXT: add x0, x8, x8, lsr #63 +; CHECK-GI-NEXT: ret + %1 = sdiv i64 %x, -256 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir index e99ee84100a39..fc73245e2b79b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir @@ -45,9 +45,14 @@ body: | ; CHECK: liveins: $w0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 104 - ; CHECK-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[COPY]], [[C]] - ; CHECK-NEXT: $w0 = COPY [[SDIV]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[SMULH:%[0-9]+]]:_(s32) = G_SMULH [[COPY]], [[C]] + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SMULH]], [[C1]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[ASHR]], [[C2]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[LSHR]] + ; CHECK-NEXT: $w0 = COPY [[ADD]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s32) = COPY $w0 %1:_(s32) = G_CONSTANT i32 104 diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll index b8eb8269d605c..feb0e10ef62c3 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll @@ -229,12 +229,15 @@ define <4 x i32> @test_bit_sink_operand(<4 x i32> %src, <4 x i32> %dst, <4 x i32 ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sub sp, sp, #32 ; CHECK-GI-NEXT: 
.cfi_def_cfa_offset 32
-; CHECK-GI-NEXT: asr w9, w0, #31
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: sxtw x9, w0
 ; CHECK-GI-NEXT: mov w8, wzr
 ; CHECK-GI-NEXT: add x10, sp, #16
 ; CHECK-GI-NEXT: mov x11, sp
-; CHECK-GI-NEXT: add w9, w0, w9, lsr #31
-; CHECK-GI-NEXT: asr w9, w9, #1
+; CHECK-GI-NEXT: sub x9, x9, x9, lsl #31
+; CHECK-GI-NEXT: asr x9, x9, #32
+; CHECK-GI-NEXT: add w9, w9, w0
+; CHECK-GI-NEXT: add w9, w9, w9, lsr #31
 ; CHECK-GI-NEXT: .LBB11_1: // %do.body
 ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-GI-NEXT: bit v1.16b, v0.16b, v2.16b
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
index bdbebd8726fde..7aa6b77cf3524 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
@@ -15,56 +15,13 @@ define <16 x i8> @div16xi8(<16 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: div16xi8:
 ; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: smov w9, v0.b[0]
-; CHECK-GI-NEXT: mov w8, #25 // =0x19
-; CHECK-GI-NEXT: smov w10, v0.b[1]
-; CHECK-GI-NEXT: smov w11, v0.b[2]
-; CHECK-GI-NEXT: smov w12, v0.b[3]
-; CHECK-GI-NEXT: smov w13, v0.b[4]
-; CHECK-GI-NEXT: smov w14, v0.b[5]
-; CHECK-GI-NEXT: smov w15, v0.b[6]
-; CHECK-GI-NEXT: smov w16, v0.b[7]
-; CHECK-GI-NEXT: smov w17, v0.b[8]
-; CHECK-GI-NEXT: smov w18, v0.b[9]
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v1.b[1], w10
-; CHECK-GI-NEXT: smov w10, v0.b[10]
-; CHECK-GI-NEXT: sdiv w12, w12, w8
-; CHECK-GI-NEXT: mov v1.b[2], w11
-; CHECK-GI-NEXT: smov w11, v0.b[11]
-; CHECK-GI-NEXT: sdiv w13, w13, w8
-; CHECK-GI-NEXT: mov v1.b[3], w12
-; CHECK-GI-NEXT: smov w12, v0.b[12]
-; CHECK-GI-NEXT: sdiv w14, w14, w8
-; CHECK-GI-NEXT: mov v1.b[4], w13
-; CHECK-GI-NEXT: smov w13, v0.b[13]
-; CHECK-GI-NEXT: sdiv w15, w15, w8
-; CHECK-GI-NEXT: mov v1.b[5], w14
-; CHECK-GI-NEXT: sdiv w16, w16, w8
-; CHECK-GI-NEXT: mov v1.b[6], w15
-; CHECK-GI-NEXT: sdiv w17, w17, w8
-; CHECK-GI-NEXT: mov v1.b[7], w16
-; CHECK-GI-NEXT: sdiv w9, w18, w8
-; CHECK-GI-NEXT: mov v1.b[8], w17
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: mov v1.b[9], w9
-; CHECK-GI-NEXT: smov w9, v0.b[14]
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v1.b[10], w10
-; CHECK-GI-NEXT: smov w10, v0.b[15]
-; CHECK-GI-NEXT: sdiv w12, w12, w8
-; CHECK-GI-NEXT: mov v1.b[11], w11
-; CHECK-GI-NEXT: sdiv w13, w13, w8
-; CHECK-GI-NEXT: mov v1.b[12], w12
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: mov v1.b[13], w13
-; CHECK-GI-NEXT: sdiv w8, w10, w8
-; CHECK-GI-NEXT: mov v1.b[14], w9
-; CHECK-GI-NEXT: mov v1.b[15], w8
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: movi v1.16b, #41
+; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: uzp2 v1.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: sshr v0.16b, v1.16b, #2
+; CHECK-GI-NEXT: ushr v0.16b, v0.16b, #7
+; CHECK-GI-NEXT: ssra v0.16b, v1.16b, #2
 ; CHECK-GI-NEXT: ret
 %div = sdiv <16 x i8> %x, <i8 25, i8 25, i8 25, i8 25, i8 25, i8 25, i8 25, i8 25, i8 25, i8 25, i8 25, i8 25, i8 25, i8 25, i8 25, i8 25>
 ret <16 x i8> %div
@@ -85,32 +42,15 @@ define <8 x i16> @div8xi16(<8 x i16> %x) {
 ;
 ; CHECK-GI-LABEL: div8xi16:
 ; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: smov w9, v0.h[0]
-; CHECK-GI-NEXT: mov w8, #6577 // =0x19b1
-; CHECK-GI-NEXT: smov w10, v0.h[1]
-; CHECK-GI-NEXT: smov w11, v0.h[2]
-; CHECK-GI-NEXT: smov w12, v0.h[3]
-; CHECK-GI-NEXT: smov w13, v0.h[4]
-; CHECK-GI-NEXT: smov w14, v0.h[5]
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v1.h[1], w10
-; CHECK-GI-NEXT: smov w10, v0.h[6]
-; CHECK-GI-NEXT: sdiv w12, w12, w8
-; CHECK-GI-NEXT: mov v1.h[2], w11
-; CHECK-GI-NEXT: smov w11, v0.h[7]
-; CHECK-GI-NEXT: sdiv w13, w13, w8
-; CHECK-GI-NEXT: mov v1.h[3], w12
-; CHECK-GI-NEXT: sdiv w9, w14, w8
-; CHECK-GI-NEXT: mov v1.h[4], w13
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: mov v1.h[5], w9
-; CHECK-GI-NEXT: sdiv w8, w11, w8
-; CHECK-GI-NEXT: mov v1.h[6], w10
-; CHECK-GI-NEXT: mov v1.h[7], w8
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: adrp x8, .LCPI1_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT: add v1.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: sshr v0.8h, v1.8h, #12
+; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #15
+; CHECK-GI-NEXT: ssra v0.8h, v1.8h, #12
 ; CHECK-GI-NEXT: ret
 %div = sdiv <8 x i16> %x, <i16 6577, i16 6577, i16 6577, i16 6577, i16 6577, i16 6577, i16 6577, i16 6577>
 ret <8 x i16> %div
@@ -131,20 +71,14 @@ define <4 x i32> @div32xi4(<4 x i32> %x) {
 ;
 ; CHECK-GI-LABEL: div32xi4:
 ; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w8, #39957 // =0x9c15
-; CHECK-GI-NEXT: mov w10, v0.s[1]
-; CHECK-GI-NEXT: movk w8, #145, lsl #16
-; CHECK-GI-NEXT: mov w11, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v0.s[3]
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s0, w9
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v0.s[1], w10
-; CHECK-GI-NEXT: sdiv w8, w12, w8
-; CHECK-GI-NEXT: mov v0.s[2], w11
-; CHECK-GI-NEXT: mov v0.s[3], w8
+; CHECK-GI-NEXT: adrp x8, .LCPI2_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
+; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: uzp2 v1.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #22
+; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31
+; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #22
 ; CHECK-GI-NEXT: ret
 %div = sdiv <4 x i32> %x, <i32 9542677, i32 9542677, i32 9542677, i32 9542677>
 ret <4 x i32> %div
diff --git a/llvm/test/CodeGen/AArch64/select_const.ll b/llvm/test/CodeGen/AArch64/select_const.ll
index 801093daef70d..0a73aed803415 100644
--- a/llvm/test/CodeGen/AArch64/select_const.ll
+++ b/llvm/test/CodeGen/AArch64/select_const.ll
@@ -461,15 +461,10 @@ define i8 @sel_constants_udiv_constant(i1 %cond) {
 ; CHECK-GI-LABEL: sel_constants_udiv_constant:
 ; CHECK-GI: // %bb.0:
 ; CHECK-GI-NEXT: and w8, w0, #0x1
-; CHECK-GI-NEXT: mov w9, #-4 // =0xfffffffc
-; CHECK-GI-NEXT: mov w10, #23 // =0x17
+; CHECK-GI-NEXT: mov w9, #50 // =0x32
+; CHECK-GI-NEXT: mov w10, #4 // =0x4
 ; CHECK-GI-NEXT: tst w8, #0x1
-; CHECK-GI-NEXT: csel w8, w9, w10, ne
-; CHECK-GI-NEXT: mov w9, #205 // =0xcd
-; CHECK-GI-NEXT: and w8, w8, #0xff
-; CHECK-GI-NEXT: mul w8, w8, w9
-; CHECK-GI-NEXT: lsr w8, w8, #8
-; CHECK-GI-NEXT: lsr w0, w8, #2
+; CHECK-GI-NEXT: csel w0, w9, w10, ne
 ; CHECK-GI-NEXT: ret
 %sel = select i1 %cond, i8 -4, i8 23
 %bo = udiv i8 %sel, 5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
index 96a776f6fbb69..cc4581195af45 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
@@ -778,8 +778,8 @@ body: |
 ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 50
 ; CHECK-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 21 - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT %cond(s1), [[C]], [[C1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[SELECT]](s32) + ; CHECK-NEXT: %udiv:_(s32) = G_SELECT %cond(s1), [[C]], [[C1]] + ; CHECK-NEXT: S_ENDPGM 0, implicit %udiv(s32) %reg:_(s32) = COPY $vgpr0 %zero:_(s32) = G_CONSTANT i32 0 %cond:_(s1) = G_ICMP intpred(eq), %reg, %zero diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 45bade21385be..3d147168c5be6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -671,16 +671,16 @@ define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0x80000001 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s3, s3, 0x100001 -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: s_lshr_b32 s4, s4, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_ashr_i32 s3, s3, 1 -; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_bfe_i32 s3, s2, 0x100001 +; GFX6-NEXT: v_mul_hi_i32 v0, s3, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index 2fa5492c8a2b7..c2a460b080a29 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -278,28 +278,46 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) { ; CHECK-LABEL: v_sdiv_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x80000001 +; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 11, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i32 %num, 4096 ret i32 %result } define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) { -; CHECK-LABEL: v_sdiv_v2i32_pow2k_denom: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 20, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 12, v1 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GISEL-LABEL: v_sdiv_v2i32_pow2k_denom: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; GISEL-NEXT: v_lshrrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: v_lshrrev_b32_e32 v3, 20, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; 
GISEL-NEXT: v_ashrrev_i32_e32 v0, 12, v0
+; GISEL-NEXT: v_ashrrev_i32_e32 v1, 12, v1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_sdiv_v2i32_pow2k_denom:
+; CGP: ; %bb.0:
+; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT: v_mov_b32_e32 v2, 0x80000001
+; CGP-NEXT: v_mul_hi_i32 v3, v0, v2
+; CGP-NEXT: v_mul_hi_i32 v2, v1, v2
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CGP-NEXT: v_ashrrev_i32_e32 v0, 11, v0
+; CGP-NEXT: v_ashrrev_i32_e32 v1, 11, v1
+; CGP-NEXT: v_lshrrev_b32_e32 v2, 31, v0
+; CGP-NEXT: v_lshrrev_b32_e32 v3, 31, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CGP-NEXT: s_setpc_b64 s[30:31]
 %result = sdiv <2 x i32> %num, <i32 4096, i32 4096>
 ret <2 x i32> %result
 }
@@ -308,30 +326,12 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_sdiv_i32_oddk_denom:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8
-; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x12d8fb
+; CHECK-NEXT: v_mov_b32_e32 v1, 0xd9528441
+; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, 20, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 31, v0
 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
-; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
-; CHECK-NEXT: v_mul_lo_u32 v3, v2, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v2
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 0xffed2705, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 %result = sdiv i32 %num, 1235195
 ret i32 %result
@@ -387,46 +387,17 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-LABEL: v_sdiv_v2i32_oddk_denom:
 ; CGP: ; %bb.0:
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8
-; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705
-; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; CGP-NEXT: v_mov_b32_e32 v2, 0xd9528441
+; CGP-NEXT: v_mul_hi_i32 v3, v0, v2
+; CGP-NEXT: v_mul_hi_i32 v2, v1, v2
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CGP-NEXT: v_ashrrev_i32_e32 v0, 20, v0
+; CGP-NEXT: v_ashrrev_i32_e32 v1, 20, v1
+; CGP-NEXT: v_lshrrev_b32_e32 v2, 31, v0
+; CGP-NEXT: v_lshrrev_b32_e32 v3, 31, v1
 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_mul_lo_u32 v7, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CGP-NEXT: v_mul_hi_u32 v7, v0, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v1, v3
-; CGP-NEXT: v_mul_lo_u32 v8, v7, v5
-; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v7
-; CGP-NEXT: v_mul_lo_u32 v10, v3, v5
-; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v4
-; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
-; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[6:7]
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 0xffed2705, v1
-; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v7
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3
 ; CGP-NEXT: s_setpc_b64 s[30:31]
 %result = sdiv <2 x i32> %num, <i32 1235195, i32 1235195>
 ret <2 x i32> %result
 }
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
index 389d59298505d..4b999b892ed35 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
@@ -375,14 +375,22 @@ define i16 @udiv16_constant_add(i16 %a) nounwind {
 define i32 @sdiv_constant_no_srai(i32 %a) nounwind {
 ; RV32-LABEL: sdiv_constant_no_srai:
 ; RV32: # %bb.0:
-; RV32-NEXT: li a1, 3
-; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1366
+; RV32-NEXT: mulh a0, a0, a1
+; RV32-NEXT: srli a1, a0, 31
+; RV32-NEXT: add a0, a0, a1
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: sdiv_constant_no_srai:
 ; RV64: # %bb.0:
-; RV64-NEXT: li a1, 3
-; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: lui a1, 349525
+; RV64-NEXT: addi a1, a1, 1366
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srai a0, a0, 32
+; RV64-NEXT: srliw a1, a0, 31
+; RV64-NEXT: addw a0, a0, a1
 ; RV64-NEXT: ret
 %1 = sdiv i32 %a, 3
 ret i32 %1
@@ -392,14 +400,24 @@ define i32 @sdiv_constant_no_srai(i32 %a) nounwind {
 define i32 @sdiv_constant_srai(i32 %a) nounwind {
 ; RV32-LABEL: sdiv_constant_srai:
 ; RV32: # %bb.0:
-; RV32-NEXT: li a1, 5
-; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: lui a1, 419430
+; RV32-NEXT: addi a1, a1, 1639
+; RV32-NEXT: mulh a0, a0, a1
+; RV32-NEXT: srai a0, a0, 1
+; RV32-NEXT: srli a1, a0, 31
+; RV32-NEXT: add a0, a0, a1
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: sdiv_constant_srai:
 ; RV64: # %bb.0:
-; RV64-NEXT: li a1, 5
-; RV64-NEXT: divw a0, a0, a1
+; RV64-NEXT: lui a1, 419430
+; RV64-NEXT: addi a1, a1, 1639
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srai a0, a0, 32
+; RV64-NEXT: sraiw a0, a0, 1
+; RV64-NEXT: srliw a1, a0, 31
+; RV64-NEXT: addw a0, a0, a1
 ; RV64-NEXT: ret
 %1 = sdiv i32 %a, 5
 ret i32 %1
@@ -409,14 +427,26 @@ define i32 @sdiv_constant_srai(i32 %a) nounwind {
 define i32 @sdiv_constant_add_srai(i32 %a) nounwind {
 ; RV32-LABEL: sdiv_constant_add_srai:
 ; RV32: # %bb.0:
-; RV32-NEXT: li a1, 7
-; RV32-NEXT: div a0, a0, a1
+; RV32-NEXT: lui a1, 599186
+; RV32-NEXT: addi a1, a1, 1171
+; RV32-NEXT: mulh a1, a0, a1
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: srai a0, a0, 2
+; RV32-NEXT: srli a1, a0, 31
+; RV32-NEXT: add a0, a0, a1
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: sdiv_constant_add_srai:
 ; RV64: # %bb.0: 
-; RV64-NEXT: li a1, 7 -; RV64-NEXT: divw a0, a0, a1 +; RV64-NEXT: lui a1, 599186 +; RV64-NEXT: addi a1, a1, 1171 +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: mul a1, a2, a1 +; RV64-NEXT: srai a1, a1, 32 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: sraiw a0, a0, 2 +; RV64-NEXT: srliw a1, a0, 31 +; RV64-NEXT: addw a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i32 %a, 7 ret i32 %1 @@ -426,14 +456,26 @@ define i32 @sdiv_constant_add_srai(i32 %a) nounwind { define i32 @sdiv_constant_sub_srai(i32 %a) nounwind { ; RV32-LABEL: sdiv_constant_sub_srai: ; RV32: # %bb.0: -; RV32-NEXT: li a1, -7 -; RV32-NEXT: div a0, a0, a1 +; RV32-NEXT: lui a1, 449390 +; RV32-NEXT: addi a1, a1, -1171 +; RV32-NEXT: mulh a1, a0, a1 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: srai a0, a1, 2 +; RV32-NEXT: srli a1, a0, 31 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: sdiv_constant_sub_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, -7 -; RV64-NEXT: divw a0, a0, a1 +; RV64-NEXT: lui a1, 449390 +; RV64-NEXT: addi a1, a1, -1171 +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: mul a1, a2, a1 +; RV64-NEXT: srai a1, a1, 32 +; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: sraiw a0, a1, 2 +; RV64-NEXT: srliw a1, a0, 31 +; RV64-NEXT: addw a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i32 %a, -7 ret i32 %1 @@ -453,8 +495,11 @@ define i64 @sdiv64_constant_no_srai(i64 %a) nounwind { ; ; RV64-LABEL: sdiv64_constant_no_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 3 -; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: lui a1, %hi(.LCPI12_0) +; RV64-NEXT: ld a1, %lo(.LCPI12_0)(a1) +; RV64-NEXT: mulh a0, a0, a1 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i64 %a, 3 ret i64 %1 @@ -474,8 +519,12 @@ define i64 @sdiv64_constant_srai(i64 %a) nounwind { ; ; RV64-LABEL: sdiv64_constant_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 5 -; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: lui a1, %hi(.LCPI13_0) +; RV64-NEXT: ld a1, %lo(.LCPI13_0)(a1) +; RV64-NEXT: mulh a0, a0, a1 +; RV64-NEXT: srai a0, a0, 1 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i64 %a, 5 ret i64 %1 @@ -495,8 +544,19 @@ define i64 @sdiv64_constant_add_srai(i64 %a) nounwind { ; ; RV64-LABEL: sdiv64_constant_add_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 15 -; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: lui a1, 1017993 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: mulh a1, a0, a1 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: srai a0, a0, 3 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i64 %a, 15 ret i64 %1 @@ -516,8 +576,19 @@ define i64 @sdiv64_constant_sub_srai(i64 %a) nounwind { ; ; RV64-LABEL: sdiv64_constant_sub_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, -3 -; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: lui a1, 21845 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: mulh a1, a0, a1 +; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: srai a0, a1, 1 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i64 %a, -3 ret i64 %1 @@ -526,32 +597,52 @@ define i64 @sdiv64_constant_sub_srai(i64 %a) nounwind { define i8 @sdiv8_constant_no_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_no_srai: ; RV32IM: # %bb.0: -; 
RV32IM-NEXT: li a1, 3 +; RV32IM-NEXT: li a1, 86 ; RV32IM-NEXT: slli a0, a0, 24 ; RV32IM-NEXT: srai a0, a0, 24 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 24 +; RV32IM-NEXT: zext.b a1, a0 +; RV32IM-NEXT: srli a1, a1, 7 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv8_constant_no_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 3 +; RV32IMZB-NEXT: li a1, 86 ; RV32IMZB-NEXT: sext.b a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 8 +; RV32IMZB-NEXT: zext.b a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 7 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv8_constant_no_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 3 +; RV64IM-NEXT: li a1, 86 ; RV64IM-NEXT: slli a0, a0, 56 ; RV64IM-NEXT: srai a0, a0, 56 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 56 +; RV64IM-NEXT: zext.b a1, a0 +; RV64IM-NEXT: srli a1, a1, 7 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv8_constant_no_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 3 +; RV64IMZB-NEXT: li a1, 86 ; RV64IMZB-NEXT: sext.b a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: srai a0, a0, 8 +; RV64IMZB-NEXT: zext.b a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 7 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i8 %a, 3 ret i8 %1 @@ -560,32 +651,60 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind { define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 5 +; RV32IM-NEXT: li a1, 103 ; RV32IM-NEXT: slli a0, a0, 24 ; RV32IM-NEXT: srai a0, a0, 24 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 24 +; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a0, a0, 25 +; RV32IM-NEXT: zext.b a1, a0 +; RV32IM-NEXT: srli a1, a1, 7 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv8_constant_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 5 +; RV32IMZB-NEXT: li a1, 103 ; RV32IMZB-NEXT: sext.b a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 8 +; RV32IMZB-NEXT: sext.b a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 1 +; RV32IMZB-NEXT: zext.b a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 7 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv8_constant_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 5 +; RV64IM-NEXT: li a1, 103 ; RV64IM-NEXT: slli a0, a0, 56 ; RV64IM-NEXT: srai a0, a0, 56 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 56 +; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a0, a0, 57 +; RV64IM-NEXT: zext.b a1, a0 +; RV64IM-NEXT: srli a1, a1, 7 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv8_constant_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 5 +; RV64IMZB-NEXT: li a1, 103 +; RV64IMZB-NEXT: sext.b a0, a0 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: srai a0, a0, 8 ; RV64IMZB-NEXT: sext.b a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 1 +; RV64IMZB-NEXT: zext.b a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 7 +; RV64IMZB-NEXT: 
add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i8 %a, 5 ret i8 %1 @@ -594,32 +713,64 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { define i8 @sdiv8_constant_add_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_add_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 7 +; RV32IM-NEXT: li a1, -109 +; RV32IM-NEXT: slli a2, a0, 24 +; RV32IM-NEXT: srai a2, a2, 24 +; RV32IM-NEXT: mul a1, a2, a1 +; RV32IM-NEXT: slli a1, a1, 16 +; RV32IM-NEXT: srai a1, a1, 24 +; RV32IM-NEXT: add a0, a1, a0 ; RV32IM-NEXT: slli a0, a0, 24 -; RV32IM-NEXT: srai a0, a0, 24 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: srai a0, a0, 26 +; RV32IM-NEXT: zext.b a1, a0 +; RV32IM-NEXT: srli a1, a1, 7 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv8_constant_add_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 7 +; RV32IMZB-NEXT: li a1, -109 +; RV32IMZB-NEXT: sext.b a2, a0 +; RV32IMZB-NEXT: mul a1, a2, a1 +; RV32IMZB-NEXT: sext.h a1, a1 +; RV32IMZB-NEXT: srai a1, a1, 8 +; RV32IMZB-NEXT: add a0, a1, a0 ; RV32IMZB-NEXT: sext.b a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 2 +; RV32IMZB-NEXT: zext.b a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 7 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv8_constant_add_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 7 +; RV64IM-NEXT: li a1, -109 +; RV64IM-NEXT: slli a2, a0, 56 +; RV64IM-NEXT: srai a2, a2, 56 +; RV64IM-NEXT: mul a1, a2, a1 +; RV64IM-NEXT: slli a1, a1, 48 +; RV64IM-NEXT: srai a1, a1, 56 +; RV64IM-NEXT: add a0, a1, a0 ; RV64IM-NEXT: slli a0, a0, 56 -; RV64IM-NEXT: srai a0, a0, 56 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: srai a0, a0, 58 +; RV64IM-NEXT: zext.b a1, a0 +; RV64IM-NEXT: srli a1, a1, 7 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv8_constant_add_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 7 +; RV64IMZB-NEXT: li a1, -109 +; RV64IMZB-NEXT: sext.b a2, a0 +; RV64IMZB-NEXT: mul a1, a2, a1 +; RV64IMZB-NEXT: sext.h a1, a1 +; RV64IMZB-NEXT: srai a1, a1, 8 +; RV64IMZB-NEXT: add a0, a1, a0 ; RV64IMZB-NEXT: sext.b a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 2 +; RV64IMZB-NEXT: zext.b a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 7 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i8 %a, 7 ret i8 %1 @@ -628,32 +779,64 @@ define i8 @sdiv8_constant_add_srai(i8 %a) nounwind { define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_sub_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, -7 -; RV32IM-NEXT: slli a0, a0, 24 -; RV32IM-NEXT: srai a0, a0, 24 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: li a1, 109 +; RV32IM-NEXT: slli a2, a0, 24 +; RV32IM-NEXT: srai a2, a2, 24 +; RV32IM-NEXT: mul a1, a2, a1 +; RV32IM-NEXT: slli a1, a1, 16 +; RV32IM-NEXT: srai a1, a1, 24 +; RV32IM-NEXT: sub a1, a1, a0 +; RV32IM-NEXT: slli a1, a1, 24 +; RV32IM-NEXT: srai a0, a1, 26 +; RV32IM-NEXT: zext.b a1, a0 +; RV32IM-NEXT: srli a1, a1, 7 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv8_constant_sub_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, -7 -; RV32IMZB-NEXT: sext.b a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: li a1, 109 +; RV32IMZB-NEXT: sext.b a2, a0 +; RV32IMZB-NEXT: mul a1, a2, a1 +; RV32IMZB-NEXT: sext.h a1, a1 +; RV32IMZB-NEXT: srai a1, a1, 8 +; RV32IMZB-NEXT: sub a1, a1, a0 +; RV32IMZB-NEXT: sext.b a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 2 +; RV32IMZB-NEXT: zext.b a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 7 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; 
RV64IM-LABEL: sdiv8_constant_sub_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, -7 -; RV64IM-NEXT: slli a0, a0, 56 -; RV64IM-NEXT: srai a0, a0, 56 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: li a1, 109 +; RV64IM-NEXT: slli a2, a0, 56 +; RV64IM-NEXT: srai a2, a2, 56 +; RV64IM-NEXT: mul a1, a2, a1 +; RV64IM-NEXT: slli a1, a1, 48 +; RV64IM-NEXT: srai a1, a1, 56 +; RV64IM-NEXT: subw a1, a1, a0 +; RV64IM-NEXT: slli a1, a1, 56 +; RV64IM-NEXT: srai a0, a1, 58 +; RV64IM-NEXT: zext.b a1, a0 +; RV64IM-NEXT: srli a1, a1, 7 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv8_constant_sub_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, -7 -; RV64IMZB-NEXT: sext.b a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: li a1, 109 +; RV64IMZB-NEXT: sext.b a2, a0 +; RV64IMZB-NEXT: mul a1, a2, a1 +; RV64IMZB-NEXT: sext.h a1, a1 +; RV64IMZB-NEXT: srai a1, a1, 8 +; RV64IMZB-NEXT: sub a1, a1, a0 +; RV64IMZB-NEXT: sext.b a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 2 +; RV64IMZB-NEXT: zext.b a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 7 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i8 %a, -7 ret i8 %1 @@ -662,32 +845,54 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind { define i16 @sdiv16_constant_no_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_no_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 3 +; RV32IM-NEXT: lui a1, 5 ; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: addi a1, a1, 1366 ; RV32IM-NEXT: srai a0, a0, 16 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: srai a0, a0, 16 +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srli a1, a1, 16 +; RV32IM-NEXT: srli a1, a1, 15 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv16_constant_no_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 3 +; RV32IMZB-NEXT: lui a1, 5 +; RV32IMZB-NEXT: addi a1, a1, 1366 ; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 16 +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 15 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv16_constant_no_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 3 +; RV64IM-NEXT: lui a1, 5 ; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: addi a1, a1, 1366 ; RV64IM-NEXT: srai a0, a0, 48 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: sraiw a0, a0, 16 +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: srli a1, a1, 48 +; RV64IM-NEXT: srli a1, a1, 15 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv16_constant_no_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 3 +; RV64IMZB-NEXT: lui a1, 5 +; RV64IMZB-NEXT: addi a1, a1, 1366 ; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: sraiw a0, a0, 16 +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 15 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i16 %a, 3 ret i16 %1 @@ -696,32 +901,62 @@ define i16 @sdiv16_constant_no_srai(i16 %a) nounwind { define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 5 +; RV32IM-NEXT: lui a1, 6 ; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: addi a1, a1, 1639 ; RV32IM-NEXT: srai a0, a0, 16 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: srai a0, a0, 16 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 17 +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: 
srli a1, a1, 16 +; RV32IM-NEXT: srli a1, a1, 15 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv16_constant_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 5 +; RV32IMZB-NEXT: lui a1, 6 +; RV32IMZB-NEXT: addi a1, a1, 1639 ; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 16 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 1 +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 15 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv16_constant_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 5 +; RV64IM-NEXT: lui a1, 6 ; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: addi a1, a1, 1639 ; RV64IM-NEXT: srai a0, a0, 48 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: sraiw a0, a0, 16 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 49 +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: srli a1, a1, 48 +; RV64IM-NEXT: srli a1, a1, 15 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv16_constant_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 5 +; RV64IMZB-NEXT: lui a1, 6 +; RV64IMZB-NEXT: addi a1, a1, 1639 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: sraiw a0, a0, 16 ; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 1 +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 15 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i16 %a, 5 ret i16 %1 @@ -730,32 +965,66 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { define i16 @sdiv16_constant_add_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_add_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 15 +; RV32IM-NEXT: lui a1, 1048569 +; RV32IM-NEXT: slli a2, a0, 16 +; RV32IM-NEXT: addi a1, a1, -1911 +; RV32IM-NEXT: srai a2, a2, 16 +; RV32IM-NEXT: mul a1, a2, a1 +; RV32IM-NEXT: srai a1, a1, 16 +; RV32IM-NEXT: add a0, a1, a0 ; RV32IM-NEXT: slli a0, a0, 16 -; RV32IM-NEXT: srai a0, a0, 16 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: srai a0, a0, 19 +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srli a1, a1, 16 +; RV32IM-NEXT: srli a1, a1, 15 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv16_constant_add_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 15 +; RV32IMZB-NEXT: lui a1, 1048569 +; RV32IMZB-NEXT: addi a1, a1, -1911 +; RV32IMZB-NEXT: sext.h a2, a0 +; RV32IMZB-NEXT: mul a1, a2, a1 +; RV32IMZB-NEXT: srai a1, a1, 16 +; RV32IMZB-NEXT: add a0, a1, a0 ; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 3 +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 15 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv16_constant_add_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 15 +; RV64IM-NEXT: lui a1, 1048569 +; RV64IM-NEXT: slli a2, a0, 48 +; RV64IM-NEXT: addi a1, a1, -1911 +; RV64IM-NEXT: srai a2, a2, 48 +; RV64IM-NEXT: mul a1, a2, a1 +; RV64IM-NEXT: sraiw a1, a1, 16 +; RV64IM-NEXT: add a0, a1, a0 ; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 48 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: srai a0, a0, 51 +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: srli a1, a1, 48 +; RV64IM-NEXT: srli a1, a1, 15 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv16_constant_add_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 15 +; RV64IMZB-NEXT: lui a1, 1048569 +; RV64IMZB-NEXT: addi 
a1, a1, -1911 +; RV64IMZB-NEXT: sext.h a2, a0 +; RV64IMZB-NEXT: mul a1, a2, a1 +; RV64IMZB-NEXT: sraiw a1, a1, 16 +; RV64IMZB-NEXT: add a0, a1, a0 ; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 3 +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 15 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i16 %a, 15 ret i16 %1 @@ -764,32 +1033,66 @@ define i16 @sdiv16_constant_add_srai(i16 %a) nounwind { define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_sub_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, -15 -; RV32IM-NEXT: slli a0, a0, 16 -; RV32IM-NEXT: srai a0, a0, 16 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: lui a1, 7 +; RV32IM-NEXT: slli a2, a0, 16 +; RV32IM-NEXT: addi a1, a1, 1911 +; RV32IM-NEXT: srai a2, a2, 16 +; RV32IM-NEXT: mul a1, a2, a1 +; RV32IM-NEXT: srai a1, a1, 16 +; RV32IM-NEXT: sub a1, a1, a0 +; RV32IM-NEXT: slli a1, a1, 16 +; RV32IM-NEXT: srai a0, a1, 19 +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srli a1, a1, 16 +; RV32IM-NEXT: srli a1, a1, 15 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv16_constant_sub_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, -15 -; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: lui a1, 7 +; RV32IMZB-NEXT: addi a1, a1, 1911 +; RV32IMZB-NEXT: sext.h a2, a0 +; RV32IMZB-NEXT: mul a1, a2, a1 +; RV32IMZB-NEXT: srai a1, a1, 16 +; RV32IMZB-NEXT: sub a1, a1, a0 +; RV32IMZB-NEXT: sext.h a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 3 +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 15 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv16_constant_sub_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, -15 -; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 48 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: lui a1, 7 +; RV64IM-NEXT: slli a2, a0, 48 +; RV64IM-NEXT: addi a1, a1, 1911 +; RV64IM-NEXT: srai a2, a2, 48 +; RV64IM-NEXT: mul a1, a2, a1 +; RV64IM-NEXT: sraiw a1, a1, 16 +; RV64IM-NEXT: subw a1, a1, a0 +; RV64IM-NEXT: slli a1, a1, 48 +; RV64IM-NEXT: srai a0, a1, 51 +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: srli a1, a1, 48 +; RV64IM-NEXT: srli a1, a1, 15 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv16_constant_sub_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, -15 -; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: lui a1, 7 +; RV64IMZB-NEXT: addi a1, a1, 1911 +; RV64IMZB-NEXT: sext.h a2, a0 +; RV64IMZB-NEXT: mul a1, a2, a1 +; RV64IMZB-NEXT: sraiw a1, a1, 16 +; RV64IMZB-NEXT: sub a1, a1, a0 +; RV64IMZB-NEXT: sext.h a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 3 +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 15 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i16 %a, -15 ret i16 %1 From ee8df1ac69b482e4ade557c33134b761ed3865ce Mon Sep 17 00:00:00 2001 From: Yu Li Date: Tue, 1 Jul 2025 10:46:24 +0000 Subject: [PATCH 2/4] removed test cases with undef --- .../AArch64/GlobalISel/combine-sdiv.ll | 78 ------------------- 1 file changed, 78 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll index b7dadf711fce1..4d3a9d2b9a5cf 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll @@ -874,84 +874,6 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) { ret <4 x i32> %1 } -define <4 x 
i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) { -; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_undef1: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_undef1: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #-4 // =0xfffffffc -; CHECK-GI-NEXT: mov w11, v0.s[3] -; CHECK-GI-NEXT: mov w12, #-16 // =0xfffffff0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s0, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: sdiv w8, w11, w12 -; CHECK-GI-NEXT: mov v0.s[2], w10 -; CHECK-GI-NEXT: mov v0.s[3], w8 -; CHECK-GI-NEXT: ret - %1 = sdiv <4 x i32> %x, - ret <4 x i32> %1 -} - -define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) { -; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_undef2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_undef2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #4 // =0x4 -; CHECK-GI-NEXT: mov w11, v0.s[3] -; CHECK-GI-NEXT: mov w12, #16 // =0x10 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s0, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: sdiv w8, w11, w12 -; CHECK-GI-NEXT: mov v0.s[2], w10 -; CHECK-GI-NEXT: mov v0.s[3], w8 -; CHECK-GI-NEXT: ret - %1 = sdiv <4 x i32> %x, - ret <4 x i32> %1 -} - -define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) { -; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_undef3: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_undef3: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #-4 // =0xfffffffc -; CHECK-GI-NEXT: mov w11, v0.s[3] -; CHECK-GI-NEXT: mov w12, #16 // =0x10 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: fmov s0, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: sdiv w8, w11, w12 -; CHECK-GI-NEXT: mov v0.s[2], w10 -; CHECK-GI-NEXT: mov v0.s[3], w8 -; CHECK-GI-NEXT: ret - %1 = sdiv <4 x i32> %x, - ret <4 x i32> %1 -} - ; PR37119 define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) { ; CHECK-SD-LABEL: non_splat_minus_one_divisor_0: From 81c6c90907399ad30f7b21b38d2851dfabaa177e Mon Sep 17 00:00:00 2001 From: Yu Li Date: Tue, 1 Jul 2025 15:16:38 +0000 Subject: [PATCH 3/4] update sdiv unit tests --- .../AArch64/GlobalISel/combine-sdiv.ll | 204 +++++++++--------- 1 file changed, 102 insertions(+), 102 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll index 4d3a9d2b9a5cf..6f5ad333fcc01 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll @@ -924,11 +924,11 @@ define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) { ; CHECK-GI-LABEL: non_splat_minus_one_divisor_0: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 -; CHECK-GI-NEXT: adrp x8, .LCPI28_0 +; CHECK-GI-NEXT: adrp x8, .LCPI25_0 ; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b ; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: ldr q2, [x8, 
:lo12:.LCPI28_0] +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI25_0] ; CHECK-GI-NEXT: mla v1.16b, v0.16b, v2.16b ; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret @@ -939,24 +939,24 @@ define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) { define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; CHECK-SD-LABEL: non_splat_minus_one_divisor_1: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: adrp x8, .LCPI29_1 +; CHECK-SD-NEXT: adrp x8, .LCPI26_1 ; CHECK-SD-NEXT: cmlt v1.16b, v0.16b, #0 -; CHECK-SD-NEXT: adrp x9, .LCPI29_3 -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI29_1] -; CHECK-SD-NEXT: adrp x8, .LCPI29_2 -; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI29_3] -; CHECK-SD-NEXT: adrp x9, .LCPI29_5 +; CHECK-SD-NEXT: adrp x9, .LCPI26_3 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI26_1] +; CHECK-SD-NEXT: adrp x8, .LCPI26_2 +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI26_3] +; CHECK-SD-NEXT: adrp x9, .LCPI26_5 ; CHECK-SD-NEXT: ushl v1.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI29_2] -; CHECK-SD-NEXT: adrp x8, .LCPI29_0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI26_2] +; CHECK-SD-NEXT: adrp x8, .LCPI26_0 ; CHECK-SD-NEXT: add v1.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: sshl v1.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI29_0] -; CHECK-SD-NEXT: adrp x8, .LCPI29_4 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI26_0] +; CHECK-SD-NEXT: adrp x8, .LCPI26_4 ; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI29_4] +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI26_4] ; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI29_5] +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI26_5] ; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: neg v1.16b, v0.16b ; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b @@ -966,19 +966,19 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; ; CHECK-GI-LABEL: non_splat_minus_one_divisor_1: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI29_3 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI29_3] -; CHECK-GI-NEXT: adrp x8, .LCPI29_2 +; CHECK-GI-NEXT: adrp x8, .LCPI26_3 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI26_3] +; CHECK-GI-NEXT: adrp x8, .LCPI26_2 ; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b ; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI29_2] -; CHECK-GI-NEXT: adrp x8, .LCPI29_1 -; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI29_1] -; CHECK-GI-NEXT: adrp x8, .LCPI29_0 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI26_2] +; CHECK-GI-NEXT: adrp x8, .LCPI26_1 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI26_1] +; CHECK-GI-NEXT: adrp x8, .LCPI26_0 ; CHECK-GI-NEXT: mla v1.16b, v0.16b, v2.16b ; CHECK-GI-NEXT: neg v0.16b, v3.16b -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI29_0] +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI26_0] ; CHECK-GI-NEXT: sshl v0.16b, v1.16b, v0.16b ; CHECK-GI-NEXT: ushr v1.16b, v0.16b, #7 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b @@ -991,35 +991,35 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) { ; CHECK-SD-LABEL: non_splat_minus_one_divisor_2: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: adrp x8, .LCPI30_0 +; CHECK-SD-NEXT: adrp x8, .LCPI27_0 ; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0 -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI30_0] -; CHECK-SD-NEXT: adrp x8, .LCPI30_1 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI27_0] +; CHECK-SD-NEXT: adrp x8, .LCPI27_1 ; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-SD-NEXT: ldr q2, [x8, 
:lo12:.LCPI30_1] -; CHECK-SD-NEXT: adrp x8, .LCPI30_2 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI27_1] +; CHECK-SD-NEXT: adrp x8, .LCPI27_2 ; CHECK-SD-NEXT: add v1.4s, v0.4s, v1.4s ; CHECK-SD-NEXT: sshl v1.4s, v1.4s, v2.4s -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI30_2] -; CHECK-SD-NEXT: adrp x8, .LCPI30_3 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI27_2] +; CHECK-SD-NEXT: adrp x8, .LCPI27_3 ; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI30_3] +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI27_3] ; CHECK-SD-NEXT: neg v1.4s, v0.4s ; CHECK-SD-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: non_splat_minus_one_divisor_2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI30_2 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI30_2] -; CHECK-GI-NEXT: adrp x8, .LCPI30_1 +; CHECK-GI-NEXT: adrp x8, .LCPI27_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI27_2] +; CHECK-GI-NEXT: adrp x8, .LCPI27_1 ; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s ; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s ; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI30_1] -; CHECK-GI-NEXT: adrp x8, .LCPI30_0 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI27_1] +; CHECK-GI-NEXT: adrp x8, .LCPI27_0 ; CHECK-GI-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI30_0] +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI27_0] ; CHECK-GI-NEXT: ushr v0.4s, v1.4s, #31 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s @@ -1031,8 +1031,8 @@ define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) { define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) { ; CHECK-LABEL: combine_vec_sdiv_nonuniform: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI31_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI31_0] +; CHECK-NEXT: adrp x8, .LCPI28_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI28_0] ; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h @@ -1045,12 +1045,12 @@ define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) { define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) { ; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform2: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: adrp x8, .LCPI32_0 -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI32_0] -; CHECK-SD-NEXT: adrp x8, .LCPI32_1 +; CHECK-SD-NEXT: adrp x8, .LCPI29_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI29_0] +; CHECK-SD-NEXT: adrp x8, .LCPI29_1 ; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-SD-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI32_1] +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI29_1] ; CHECK-SD-NEXT: uzp2 v0.8h, v0.8h, v2.8h ; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h ; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 @@ -1058,12 +1058,12 @@ define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI32_1 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI32_1] -; CHECK-GI-NEXT: adrp x8, .LCPI32_0 +; CHECK-GI-NEXT: adrp x8, .LCPI29_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI29_1] +; CHECK-GI-NEXT: adrp x8, .LCPI29_0 ; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI32_0] +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI29_0] ; CHECK-GI-NEXT: neg v1.8h, v1.8h ; CHECK-GI-NEXT: uzp2 v0.8h, v0.8h, v2.8h ; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h @@ -1076,27 +1076,27 @@ define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> 
%x) { define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) { ; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform3: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: adrp x8, .LCPI33_0 -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI33_0] -; CHECK-SD-NEXT: adrp x8, .LCPI33_1 +; CHECK-SD-NEXT: adrp x8, .LCPI30_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI30_0] +; CHECK-SD-NEXT: adrp x8, .LCPI30_1 ; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h ; CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI33_1] +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI30_1] ; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h ; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform3: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI33_1 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI33_1] -; CHECK-GI-NEXT: adrp x8, .LCPI33_0 +; CHECK-GI-NEXT: adrp x8, .LCPI30_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI30_1] +; CHECK-GI-NEXT: adrp x8, .LCPI30_0 ; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0] +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI30_0] ; CHECK-GI-NEXT: add v0.8h, v1.8h, v0.8h ; CHECK-GI-NEXT: neg v1.8h, v2.8h ; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h @@ -1109,27 +1109,27 @@ define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) { define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) { ; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform4: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: adrp x8, .LCPI34_0 -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI34_0] -; CHECK-SD-NEXT: adrp x8, .LCPI34_1 +; CHECK-SD-NEXT: adrp x8, .LCPI31_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI31_0] +; CHECK-SD-NEXT: adrp x8, .LCPI31_1 ; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h ; CHECK-SD-NEXT: sub v0.8h, v1.8h, v0.8h -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI34_1] +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI31_1] ; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h ; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform4: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI34_1 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI34_1] -; CHECK-GI-NEXT: adrp x8, .LCPI34_0 +; CHECK-GI-NEXT: adrp x8, .LCPI31_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI31_1] +; CHECK-GI-NEXT: adrp x8, .LCPI31_0 ; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI31_0] ; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h ; CHECK-GI-NEXT: neg v1.8h, v2.8h ; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h @@ -1142,31 +1142,31 @@ define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) { define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { ; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform5: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: adrp x8, .LCPI35_0 -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI35_0] -; CHECK-SD-NEXT: adrp x8, .LCPI35_1 +; CHECK-SD-NEXT: adrp x8, .LCPI32_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI32_0] +; CHECK-SD-NEXT: adrp x8, .LCPI32_1 ; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI35_1] -; 
CHECK-SD-NEXT: adrp x8, .LCPI35_2 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI32_1] +; CHECK-SD-NEXT: adrp x8, .LCPI32_2 ; CHECK-SD-NEXT: mla v1.8h, v0.8h, v2.8h -; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI35_2] +; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI32_2] ; CHECK-SD-NEXT: sshl v0.8h, v1.8h, v0.8h ; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform5: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI35_2 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI35_2] -; CHECK-GI-NEXT: adrp x8, .LCPI35_1 +; CHECK-GI-NEXT: adrp x8, .LCPI32_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI32_2] +; CHECK-GI-NEXT: adrp x8, .LCPI32_1 ; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_1] -; CHECK-GI-NEXT: adrp x8, .LCPI35_0 -; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI35_0] +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI32_1] +; CHECK-GI-NEXT: adrp x8, .LCPI32_0 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI32_0] ; CHECK-GI-NEXT: mla v1.8h, v0.8h, v2.8h ; CHECK-GI-NEXT: neg v0.8h, v3.8h ; CHECK-GI-NEXT: sshl v0.8h, v1.8h, v0.8h @@ -1179,18 +1179,18 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform6: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: adrp x8, .LCPI36_0 -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI36_0] -; CHECK-SD-NEXT: adrp x8, .LCPI36_1 +; CHECK-SD-NEXT: adrp x8, .LCPI33_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI33_0] +; CHECK-SD-NEXT: adrp x8, .LCPI33_1 ; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI36_1] -; CHECK-SD-NEXT: adrp x8, .LCPI36_2 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI33_1] +; CHECK-SD-NEXT: adrp x8, .LCPI33_2 ; CHECK-SD-NEXT: mla v1.8h, v0.8h, v2.8h -; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI36_2] -; CHECK-SD-NEXT: adrp x8, .LCPI36_3 -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI36_3] +; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI33_2] +; CHECK-SD-NEXT: adrp x8, .LCPI33_3 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI33_3] ; CHECK-SD-NEXT: sshl v0.8h, v1.8h, v0.8h ; CHECK-SD-NEXT: ushr v1.8h, v0.8h, #15 ; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b @@ -1199,19 +1199,19 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform6: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI36_3 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_3] -; CHECK-GI-NEXT: adrp x8, .LCPI36_2 +; CHECK-GI-NEXT: adrp x8, .LCPI33_3 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI33_3] +; CHECK-GI-NEXT: adrp x8, .LCPI33_2 ; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_2] -; CHECK-GI-NEXT: adrp x8, .LCPI36_1 -; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI36_1] -; CHECK-GI-NEXT: adrp x8, .LCPI36_0 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_2] +; CHECK-GI-NEXT: adrp x8, .LCPI33_1 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI33_1] +; CHECK-GI-NEXT: adrp x8, .LCPI33_0 ; CHECK-GI-NEXT: mla v1.8h, v0.8h, v2.8h ; CHECK-GI-NEXT: neg v0.8h, v3.8h -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0] +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0] ; CHECK-GI-NEXT: sshl v0.8h, v1.8h, v0.8h ; CHECK-GI-NEXT: ushr v1.8h, v0.8h, #15 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b @@ 
-1251,11 +1251,11 @@ define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) { ; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform7: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 -; CHECK-GI-NEXT: adrp x8, .LCPI37_0 +; CHECK-GI-NEXT: adrp x8, .LCPI34_0 ; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI37_0] +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] ; CHECK-GI-NEXT: mla v1.8h, v0.8h, v2.8h ; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret @@ -1266,14 +1266,14 @@ define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) { define <16 x i8> @pr38658(<16 x i8> %x) { ; CHECK-SD-LABEL: pr38658: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: adrp x8, .LCPI38_0 -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI38_0] -; CHECK-SD-NEXT: adrp x8, .LCPI38_1 +; CHECK-SD-NEXT: adrp x8, .LCPI35_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI35_0] +; CHECK-SD-NEXT: adrp x8, .LCPI35_1 ; CHECK-SD-NEXT: smull2 v2.8h, v0.16b, v1.16b ; CHECK-SD-NEXT: smull v1.8h, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp2 v1.16b, v1.16b, v2.16b ; CHECK-SD-NEXT: add v0.16b, v1.16b, v0.16b -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI38_1] +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI35_1] ; CHECK-SD-NEXT: sshl v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 ; CHECK-SD-NEXT: ushr v2.16b, v0.16b, #7 @@ -1283,17 +1283,17 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; ; CHECK-GI-LABEL: pr38658: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI38_2 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI38_2] -; CHECK-GI-NEXT: adrp x8, .LCPI38_1 +; CHECK-GI-NEXT: adrp x8, .LCPI35_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI35_2] +; CHECK-GI-NEXT: adrp x8, .LCPI35_1 ; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b ; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI38_1] -; CHECK-GI-NEXT: adrp x8, .LCPI38_0 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_1] +; CHECK-GI-NEXT: adrp x8, .LCPI35_0 ; CHECK-GI-NEXT: add v0.16b, v1.16b, v0.16b ; CHECK-GI-NEXT: neg v1.16b, v2.16b -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI38_0] +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_0] ; CHECK-GI-NEXT: sshl v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: ushr v1.16b, v0.16b, #7 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b From ea26b166dea389329bada995098aeb108ce35a6f Mon Sep 17 00:00:00 2001 From: Yu Li Date: Mon, 7 Jul 2025 10:03:39 +0000 Subject: [PATCH 4/4] changes based on comments --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 2 +- .../include/llvm/Target/GlobalISel/Combine.td | 8 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 4 +- llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll | 9 +- .../AArch64/{GlobalISel => }/combine-sdiv.ll | 453 ++++++++---------- .../AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 16 +- .../CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll | 48 +- 7 files changed, 235 insertions(+), 305 deletions(-) rename llvm/test/CodeGen/AArch64/{GlobalISel => }/combine-sdiv.ll (82%) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index d47b5a0ad40bd..31f1197b9723b 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -145,7 +145,7 @@ class CombinerHelper { /// \return true if \p Query is legal on the target, or if \p Query will /// perform WidenScalar action on the target. 
-  bool isLegalorHasWidenScalar(const LegalityQuery &Query) const;
+  bool isLegalOrHasWidenScalar(const LegalityQuery &Query) const;
 
   /// \return true if the combine is running prior to legalization, or if \p Ty
   /// is a legal integer constant type on the target.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 85b86b4ed00d6..66051d756c808 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1131,13 +1131,13 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
 
 def udiv_by_const : GICombineRule<
   (defs root:$root),
-  (match (wip_match_opcode G_UDIV):$root,
+  (match (G_UDIV $dst, $x, $y):$root,
   [{ return Helper.matchUDivorURemByConst(*${root}); }]),
   (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
 
 def sdiv_by_const : GICombineRule<
   (defs root:$root),
-  (match (wip_match_opcode G_SDIV):$root,
+  (match (G_SDIV $dst, $x, $y):$root,
   [{ return Helper.matchSDivByConst(*${root}); }]),
   (apply [{ Helper.applySDivByConst(*${root}); }])>;
 
@@ -1153,8 +1153,8 @@ def udiv_by_pow2 : GICombineRule<
   [{ return Helper.matchDivByPow2(*${root}, /*IsSigned=*/false); }]),
   (apply [{ Helper.applyUDivByPow2(*${root}); }])>;
 
-def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const,
-                                      sdiv_by_pow2, udiv_by_pow2]>;
+def intdiv_combines : GICombineGroup<[udiv_by_pow2, sdiv_by_pow2,
+                                      udiv_by_const, sdiv_by_const]>;
 
 def urem_by_const : GICombineRule<
   (defs root:$root),
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 80f1e3f2644ec..13091e047b431 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -162,7 +162,7 @@ bool CombinerHelper::isLegalOrBeforeLegalizer(
   return isPreLegalize() || isLegal(Query);
 }
 
-bool CombinerHelper::isLegalorHasWidenScalar(const LegalityQuery &Query) const {
+bool CombinerHelper::isLegalOrHasWidenScalar(const LegalityQuery &Query) const {
   return isLegal(Query) ||
          LI->getAction(Query).Action == LegalizeActions::WidenScalar;
 }
@@ -5557,7 +5557,7 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
     if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
       return false;
     if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) &&
-        !isLegalorHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}}))
+        !isLegalOrHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}}))
       return false;
   }
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index feb0e10ef62c3..b8eb8269d605c 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -229,15 +229,12 @@ define <4 x i32> @test_bit_sink_operand(<4 x i32> %src, <4 x i32> %dst, <4 x i32
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sub sp, sp, #32
 ; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    sxtw x9, w0
+; CHECK-GI-NEXT:    asr w9, w0, #31
 ; CHECK-GI-NEXT:    mov w8, wzr
 ; CHECK-GI-NEXT:    add x10, sp, #16
 ; CHECK-GI-NEXT:    mov x11, sp
-; CHECK-GI-NEXT:    sub x9, x9, x9, lsl #31
-; CHECK-GI-NEXT:    asr x9, x9, #32
-; CHECK-GI-NEXT:    add w9, w9, w0
-; CHECK-GI-NEXT:    add w9, w9, w9, lsr #31
+; CHECK-GI-NEXT:    add w9, w0, w9, lsr #31
+; CHECK-GI-NEXT:    asr w9, w9, #1
 ; CHECK-GI-NEXT:    .LBB11_1: // %do.body
 ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-GI-NEXT:    bit v1.16b, v0.16b,
v2.16b diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll similarity index 82% rename from llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll rename to llvm/test/CodeGen/AArch64/combine-sdiv.ll index 6f5ad333fcc01..b6a2146e430e7 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.ll +++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll @@ -12,18 +12,9 @@ define i32 @combine_sdiv_by_one(i32 %x) { } define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) { -; CHECK-SD-LABEL: combine_vec_sdiv_by_one: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: combine_vec_sdiv_by_one: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 -; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s -; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: combine_vec_sdiv_by_one: +; CHECK: // %bb.0: +; CHECK-NEXT: ret %1 = sdiv <4 x i32> %x, ret <4 x i32> %1 } @@ -38,19 +29,10 @@ define i32 @combine_sdiv_by_negone(i32 %x) { } define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) { -; CHECK-SD-LABEL: combine_vec_sdiv_by_negone: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: neg v0.4s, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: combine_vec_sdiv_by_negone: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 -; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s -; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: combine_vec_sdiv_by_negone: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v0.4s, v0.4s +; CHECK-NEXT: ret %1 = sdiv <4 x i32> %x, ret <4 x i32> %1 } @@ -65,14 +47,9 @@ define i32 @combine_sdiv_by_minsigned(i32 %x) { ; ; CHECK-GI-LABEL: combine_sdiv_by_minsigned: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-GI-NEXT: sxtw x8, w0 -; CHECK-GI-NEXT: lsl x9, x8, #31 -; CHECK-GI-NEXT: sub x8, x9, x8 -; CHECK-GI-NEXT: asr x8, x8, #32 -; CHECK-GI-NEXT: sub w8, w8, w0 -; CHECK-GI-NEXT: asr w8, w8, #30 -; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: asr w8, w0, #31 +; CHECK-GI-NEXT: add w8, w0, w8, lsr #1 +; CHECK-GI-NEXT: neg w0, w8, asr #31 ; CHECK-GI-NEXT: ret %1 = sdiv i32 %x, -2147483648 ret i32 %1 @@ -89,14 +66,10 @@ define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_by_minsigned: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mvni v1.4s, #128, lsl #24 -; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s -; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: sub v1.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #30 -; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31 -; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #30 +; CHECK-GI-NEXT: sshr v1.4s, v0.4s, #31 +; CHECK-GI-NEXT: usra v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31 +; CHECK-GI-NEXT: neg v0.4s, v0.4s ; CHECK-GI-NEXT: ret %1 = sdiv <4 x i32> %x, ret <4 x i32> %1 @@ -169,17 +142,9 @@ define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_by_pos0: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.4s, #1 -; CHECK-GI-NEXT: movi v2.2d, #0x0000ff000000ff -; CHECK-GI-NEXT: fneg v1.4s, v1.4s -; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s -; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; 
CHECK-GI-NEXT: add v1.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #1 -; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31 -; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #2 ; CHECK-GI-NEXT: ret %1 = and <4 x i32> %x, %2 = sdiv <4 x i32> %1, @@ -198,23 +163,21 @@ define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_by_pos1: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2d, #0x0000ff000000ff -; CHECK-GI-NEXT: adrp x8, .LCPI11_2 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI11_2] -; CHECK-GI-NEXT: adrp x8, .LCPI11_1 -; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s -; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_1] +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: adrp x8, .LCPI11_0 -; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: neg v1.4s, v2.4s -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: ushr v1.4s, v0.4s, #31 -; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI11_0] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: neg v2.4s, v3.4s +; CHECK-GI-NEXT: sshl v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mov v1.s[2], w9 +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v1.16b ; CHECK-GI-NEXT: ret %1 = and <4 x i32> %x, %2 = sdiv <4 x i32> %1, @@ -231,15 +194,9 @@ define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2a: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.4s, #1 -; CHECK-GI-NEXT: fneg v1.4s, v1.4s -; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s -; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: add v1.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #1 -; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31 -; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: sshr v1.4s, v0.4s, #31 +; CHECK-GI-NEXT: usra v0.4s, v1.4s, #30 +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #2 ; CHECK-GI-NEXT: ret %1 = sdiv <4 x i32> %x, ret <4 x i32> %1 @@ -256,14 +213,10 @@ define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2a_neg: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mvni v1.4s, #128, lsl #24 -; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s -; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: sub v1.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #1 -; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31 -; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: sshr v1.4s, v0.4s, #31 +; CHECK-GI-NEXT: usra v0.4s, v1.4s, #30 +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #2 +; CHECK-GI-NEXT: neg v0.4s, v0.4s ; CHECK-GI-NEXT: ret %1 = sdiv <4 x i32> %x, ret <4 x i32> %1 @@ -289,21 +242,21 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI14_2 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_2] ; CHECK-GI-NEXT: adrp x8, .LCPI14_1 
-; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b -; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b -; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI14_1] -; CHECK-GI-NEXT: adrp x8, .LCPI14_0 -; CHECK-GI-NEXT: add v0.16b, v1.16b, v0.16b -; CHECK-GI-NEXT: neg v1.16b, v2.16b -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] -; CHECK-GI-NEXT: sshl v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ushr v1.16b, v0.16b, #7 -; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: sshr v2.16b, v0.16b, #7 +; CHECK-GI-NEXT: adrp x9, .LCPI14_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_1] +; CHECK-GI-NEXT: adrp x8, .LCPI14_2 +; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI14_0] +; CHECK-GI-NEXT: neg v1.16b, v1.16b +; CHECK-GI-NEXT: shl v3.16b, v3.16b, #7 +; CHECK-GI-NEXT: ushl v1.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI14_2] +; CHECK-GI-NEXT: neg v2.16b, v2.16b +; CHECK-GI-NEXT: add v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: sshl v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: sshr v2.16b, v3.16b, #7 +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: ret %1 = sdiv <16 x i8> %x, ret <16 x i8> %1 @@ -331,21 +284,22 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v8i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI15_2 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_2] ; CHECK-GI-NEXT: adrp x8, .LCPI15_1 -; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h -; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] +; CHECK-GI-NEXT: sshr v2.8h, v0.8h, #15 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_1] ; CHECK-GI-NEXT: adrp x8, .LCPI15_0 -; CHECK-GI-NEXT: add v0.8h, v1.8h, v0.8h -; CHECK-GI-NEXT: neg v1.8h, v2.8h -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] -; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h -; CHECK-GI-NEXT: ushr v1.8h, v0.8h, #15 -; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ldr d3, [x8, :lo12:.LCPI15_0] +; CHECK-GI-NEXT: adrp x8, .LCPI15_2 +; CHECK-GI-NEXT: neg v1.8h, v1.8h +; CHECK-GI-NEXT: ushl v1.8h, v2.8h, v1.8h +; CHECK-GI-NEXT: ushll v2.8h, v3.8b, #0 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI15_2] +; CHECK-GI-NEXT: neg v3.8h, v3.8h +; CHECK-GI-NEXT: add v1.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shl v2.8h, v2.8h, #15 +; CHECK-GI-NEXT: sshl v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: sshr v2.8h, v2.8h, #15 +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: ret %1 = sdiv <8 x i16> %x, ret <8 x i16> %1 @@ -503,21 +457,25 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v4i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI18_2 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI18_2] -; CHECK-GI-NEXT: adrp x8, .LCPI18_1 -; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s -; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: sshr v3.4s, v0.4s, #31 +; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: adrp x8, .LCPI18_0 -; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: neg v1.4s, v2.4s ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: ushr v1.4s, v0.4s, #31 -; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: 
add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: adrp x8, .LCPI18_1 +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: neg v2.4s, v2.4s +; CHECK-GI-NEXT: ushl v2.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI18_1] +; CHECK-GI-NEXT: mov v1.s[2], w9 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: sshl v2.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v1.16b ; CHECK-GI-NEXT: ret %1 = sdiv <4 x i32> %x, ret <4 x i32> %1 @@ -852,23 +810,32 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI24_3 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI24_3] -; CHECK-GI-NEXT: adrp x8, .LCPI24_2 -; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s -; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI24_2] -; CHECK-GI-NEXT: adrp x8, .LCPI24_1 -; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI24_1] -; CHECK-GI-NEXT: adrp x8, .LCPI24_0 -; CHECK-GI-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: neg v0.4s, v3.4s -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI24_0] -; CHECK-GI-NEXT: sshl v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: ushr v1.4s, v0.4s, #31 -; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: adrp x10, .LCPI24_0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: ldr q2, [x10, :lo12:.LCPI24_0] +; CHECK-GI-NEXT: sshr v3.4s, v0.4s, #31 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: adrp x10, .LCPI24_1 +; CHECK-GI-NEXT: neg v2.4s, v2.4s +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov v4.s[1], w8 +; CHECK-GI-NEXT: ushl v2.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: ldr q3, [x10, :lo12:.LCPI24_1] +; CHECK-GI-NEXT: mov v1.s[2], w9 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mov v4.d[1], v4.d[0] +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: sshl v2.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: shl v1.4s, v4.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: neg v2.4s, v0.4s +; CHECK-GI-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-GI-NEXT: ret %1 = sdiv <4 x i32> %x, ret <4 x i32> %1 @@ -923,14 +890,12 @@ define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) { ; ; CHECK-GI-LABEL: non_splat_minus_one_divisor_0: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-NEXT: adrp x8, .LCPI25_0 -; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b -; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b -; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI25_0] -; CHECK-GI-NEXT: mla v1.16b, v0.16b, v2.16b -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: neg v2.16b, v0.16b +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] +; CHECK-GI-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-GI-NEXT: sshr v1.16b, v1.16b, #7 +; CHECK-GI-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-GI-NEXT: ret %div = sdiv <16 x i8> %A, ret <16 x i8> %div @@ -966,23 +931,27 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; ; CHECK-GI-LABEL: non_splat_minus_one_divisor_1: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI26_3 -; 
CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI26_3] ; CHECK-GI-NEXT: adrp x8, .LCPI26_2 -; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b -; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b -; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI26_2] -; CHECK-GI-NEXT: adrp x8, .LCPI26_1 -; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI26_1] +; CHECK-GI-NEXT: sshr v2.16b, v0.16b, #7 +; CHECK-GI-NEXT: adrp x9, .LCPI26_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI26_2] +; CHECK-GI-NEXT: adrp x8, .LCPI26_3 +; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI26_1] +; CHECK-GI-NEXT: neg v1.16b, v1.16b +; CHECK-GI-NEXT: shl v3.16b, v3.16b, #7 +; CHECK-GI-NEXT: ushl v1.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI26_3] ; CHECK-GI-NEXT: adrp x8, .LCPI26_0 -; CHECK-GI-NEXT: mla v1.16b, v0.16b, v2.16b -; CHECK-GI-NEXT: neg v0.16b, v3.16b -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI26_0] -; CHECK-GI-NEXT: sshl v0.16b, v1.16b, v0.16b -; CHECK-GI-NEXT: ushr v1.16b, v0.16b, #7 -; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: neg v2.16b, v2.16b +; CHECK-GI-NEXT: add v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: sshl v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: sshr v2.16b, v3.16b, #7 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI26_0] +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: shl v1.16b, v3.16b, #7 +; CHECK-GI-NEXT: sshr v1.16b, v1.16b, #7 +; CHECK-GI-NEXT: neg v2.16b, v0.16b +; CHECK-GI-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-GI-NEXT: ret %div = sdiv <16 x i8> %A, ret <16 x i8> %div @@ -1010,19 +979,33 @@ define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) { ; ; CHECK-GI-LABEL: non_splat_minus_one_divisor_2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI27_2 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI27_2] -; CHECK-GI-NEXT: adrp x8, .LCPI27_1 -; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s -; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI27_1] -; CHECK-GI-NEXT: adrp x8, .LCPI27_0 -; CHECK-GI-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI27_0] -; CHECK-GI-NEXT: ushr v0.4s, v1.4s, #31 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: adrp x9, .LCPI27_0 +; CHECK-GI-NEXT: mov w10, #0 // =0x0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: ldr q2, [x9, :lo12:.LCPI27_0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: sshr v3.4s, v0.4s, #31 +; CHECK-GI-NEXT: adrp x9, .LCPI27_1 +; CHECK-GI-NEXT: neg v2.4s, v2.4s +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v4.s[1], w10 +; CHECK-GI-NEXT: ushl v2.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI27_1] +; CHECK-GI-NEXT: mov v1.s[2], w10 +; CHECK-GI-NEXT: mov v4.s[2], w10 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mov v1.s[3], w10 +; CHECK-GI-NEXT: sshl v2.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v4.s[3], w8 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: shl v1.4s, v4.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: neg v2.4s, v0.4s +; CHECK-GI-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-GI-NEXT: ret %div = sdiv <4 x i32> %A, ret <4 x i32> %div @@ -1250,14 +1233,13 @@ define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) { ; ; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform7: ; CHECK-GI: // %bb.0: 
-; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-NEXT: adrp x8, .LCPI34_0 -; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h -; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0] -; CHECK-GI-NEXT: mla v1.8h, v0.8h, v2.8h -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: neg v2.8h, v0.8h +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI34_0] +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: shl v1.8h, v1.8h, #15 +; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #15 +; CHECK-GI-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-GI-NEXT: ret %1 = sdiv <8 x i16> %x, ret <8 x i16> %1 @@ -1368,12 +1350,9 @@ define i32 @combine_sdiv_two(i32 %x) { ; ; CHECK-GI-LABEL: combine_sdiv_two: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-GI-NEXT: sxtw x8, w0 -; CHECK-GI-NEXT: sub x8, x8, x8, lsl #31 -; CHECK-GI-NEXT: asr x8, x8, #32 -; CHECK-GI-NEXT: add w8, w8, w0 -; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: asr w8, w0, #31 +; CHECK-GI-NEXT: add w8, w0, w8, lsr #31 +; CHECK-GI-NEXT: asr w0, w8, #1 ; CHECK-GI-NEXT: ret %1 = sdiv i32 %x, 2 ret i32 %1 @@ -1388,13 +1367,9 @@ define i32 @combine_sdiv_negtwo(i32 %x) { ; ; CHECK-GI-LABEL: combine_sdiv_negtwo: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-GI-NEXT: sxtw x8, w0 -; CHECK-GI-NEXT: lsl x9, x8, #31 -; CHECK-GI-NEXT: sub x8, x9, x8 -; CHECK-GI-NEXT: asr x8, x8, #32 -; CHECK-GI-NEXT: sub w8, w8, w0 -; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: asr w8, w0, #31 +; CHECK-GI-NEXT: add w8, w0, w8, lsr #31 +; CHECK-GI-NEXT: neg w0, w8, asr #1 ; CHECK-GI-NEXT: ret %1 = sdiv i32 %x, -2 ret i32 %1 @@ -1411,13 +1386,10 @@ define i8 @combine_i8_sdiv_pow2(i8 %x) { ; ; CHECK-GI-LABEL: combine_i8_sdiv_pow2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sxtb w8, w0 -; CHECK-GI-NEXT: sub w8, w8, w8, lsl #7 -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: add w8, w0, w8, asr #8 -; CHECK-GI-NEXT: sbfx w8, w8, #3, #5 -; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 -; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: sbfx w8, w0, #7, #1 +; CHECK-GI-NEXT: ubfx w8, w8, #4, #4 +; CHECK-GI-NEXT: add w8, w0, w8 +; CHECK-GI-NEXT: sbfx w0, w8, #4, #4 ; CHECK-GI-NEXT: ret %1 = sdiv i8 %x, 16 ret i8 %1 @@ -1435,14 +1407,11 @@ define i8 @combine_i8_sdiv_negpow2(i8 %x) { ; ; CHECK-GI-LABEL: combine_i8_sdiv_negpow2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sxtb w8, w0 -; CHECK-GI-NEXT: lsl w9, w8, #7 -; CHECK-GI-NEXT: sub w8, w9, w8 -; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 -; CHECK-GI-NEXT: sub w8, w8, w0 -; CHECK-GI-NEXT: sbfx w8, w8, #5, #3 -; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 -; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: sbfx w8, w0, #7, #1 +; CHECK-GI-NEXT: ubfx w8, w8, #2, #6 +; CHECK-GI-NEXT: add w8, w0, w8 +; CHECK-GI-NEXT: sxtb w8, w8 +; CHECK-GI-NEXT: neg w0, w8, asr #6 ; CHECK-GI-NEXT: ret %1 = sdiv i8 %x, -64 ret i8 %1 @@ -1459,12 +1428,10 @@ define i16 @combine_i16_sdiv_pow2(i16 %x) { ; ; CHECK-GI-LABEL: combine_i16_sdiv_pow2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sxth w8, w0 -; CHECK-GI-NEXT: sub w8, w8, w8, lsl #15 -; CHECK-GI-NEXT: add w8, w0, w8, asr #16 -; CHECK-GI-NEXT: sbfx w8, w8, #3, #13 -; CHECK-GI-NEXT: ubfx w9, w8, #15, #1 -; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: sbfx w8, w0, #15, #1 +; CHECK-GI-NEXT: ubfx w8, w8, #12, #4 +; CHECK-GI-NEXT: add w8, w0, w8 +; CHECK-GI-NEXT: sbfx w0, w8, #4, #12 ; CHECK-GI-NEXT: ret %1 = sdiv i16 %x, 16 ret i16 %1 @@ -1482,14 +1449,11 @@ define i16 
@combine_i16_sdiv_negpow2(i16 %x) { ; ; CHECK-GI-LABEL: combine_i16_sdiv_negpow2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sxth w8, w0 -; CHECK-GI-NEXT: lsl w9, w8, #15 -; CHECK-GI-NEXT: sub w8, w9, w8 -; CHECK-GI-NEXT: asr w8, w8, #16 -; CHECK-GI-NEXT: sub w8, w8, w0 -; CHECK-GI-NEXT: sbfx w8, w8, #7, #9 -; CHECK-GI-NEXT: ubfx w9, w8, #15, #1 -; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: sbfx w8, w0, #15, #1 +; CHECK-GI-NEXT: ubfx w8, w8, #8, #8 +; CHECK-GI-NEXT: add w8, w0, w8 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: neg w0, w8, asr #8 ; CHECK-GI-NEXT: ret %1 = sdiv i16 %x, -256 ret i16 %1 @@ -1506,13 +1470,9 @@ define i32 @combine_i32_sdiv_pow2(i32 %x) { ; ; CHECK-GI-LABEL: combine_i32_sdiv_pow2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-GI-NEXT: sxtw x8, w0 -; CHECK-GI-NEXT: sub x8, x8, x8, lsl #31 -; CHECK-GI-NEXT: asr x8, x8, #32 -; CHECK-GI-NEXT: add w8, w8, w0 -; CHECK-GI-NEXT: asr w8, w8, #3 -; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: asr w8, w0, #31 +; CHECK-GI-NEXT: add w8, w0, w8, lsr #28 +; CHECK-GI-NEXT: asr w0, w8, #4 ; CHECK-GI-NEXT: ret %1 = sdiv i32 %x, 16 ret i32 %1 @@ -1529,14 +1489,9 @@ define i32 @combine_i32_sdiv_negpow2(i32 %x) { ; ; CHECK-GI-LABEL: combine_i32_sdiv_negpow2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-GI-NEXT: sxtw x8, w0 -; CHECK-GI-NEXT: lsl x9, x8, #31 -; CHECK-GI-NEXT: sub x8, x9, x8 -; CHECK-GI-NEXT: asr x8, x8, #32 -; CHECK-GI-NEXT: sub w8, w8, w0 -; CHECK-GI-NEXT: asr w8, w8, #7 -; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 +; CHECK-GI-NEXT: asr w8, w0, #31 +; CHECK-GI-NEXT: add w8, w0, w8, lsr #24 +; CHECK-GI-NEXT: neg w0, w8, asr #8 ; CHECK-GI-NEXT: ret %1 = sdiv i32 %x, -256 ret i32 %1 @@ -1553,11 +1508,9 @@ define i64 @combine_i64_sdiv_pow2(i64 %x) { ; ; CHECK-GI-LABEL: combine_i64_sdiv_pow2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov x8, #-9223372036854775807 // =0x8000000000000001 -; CHECK-GI-NEXT: smulh x8, x0, x8 -; CHECK-GI-NEXT: add x8, x8, x0 -; CHECK-GI-NEXT: asr x8, x8, #3 -; CHECK-GI-NEXT: add x0, x8, x8, lsr #63 +; CHECK-GI-NEXT: asr x8, x0, #63 +; CHECK-GI-NEXT: add x8, x0, x8, lsr #60 +; CHECK-GI-NEXT: asr x0, x8, #4 ; CHECK-GI-NEXT: ret %1 = sdiv i64 %x, 16 ret i64 %1 @@ -1574,11 +1527,9 @@ define i64 @combine_i64_sdiv_negpow2(i64 %x) { ; ; CHECK-GI-LABEL: combine_i64_sdiv_negpow2: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov x8, #9223372036854775807 // =0x7fffffffffffffff -; CHECK-GI-NEXT: smulh x8, x0, x8 -; CHECK-GI-NEXT: sub x8, x8, x0 -; CHECK-GI-NEXT: asr x8, x8, #7 -; CHECK-GI-NEXT: add x0, x8, x8, lsr #63 +; CHECK-GI-NEXT: asr x8, x0, #63 +; CHECK-GI-NEXT: add x8, x0, x8, lsr #56 +; CHECK-GI-NEXT: neg x0, x8, asr #8 ; CHECK-GI-NEXT: ret %1 = sdiv i64 %x, -256 ret i64 %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 3d147168c5be6..45bade21385be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -671,16 +671,16 @@ define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX6-NEXT: v_mov_b32_e32 v0, 0x80000001 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s3, s2, 0x100001 -; GFX6-NEXT: v_mul_hi_i32 v0, s3, v0 +; GFX6-NEXT: 
s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s3, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x100001 +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 +; GFX6-NEXT: s_lshr_b32 s4, s4, 31 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_ashr_i32 s3, s3, 1 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index c2a460b080a29..8981b11a90286 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -278,46 +278,28 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) { ; CHECK-LABEL: v_sdiv_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, 0x80000001 -; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CHECK-NEXT: v_ashrrev_i32_e32 v0, 11, v0 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 31, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i32 %num, 4096 ret i32 %result } define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) { -; GISEL-LABEL: v_sdiv_v2i32_pow2k_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 20, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 20, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 12, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v1, 12, v1 -; GISEL-NEXT: s_setpc_b64 s[30:31] -; -; CGP-LABEL: v_sdiv_v2i32_pow2k_denom: -; CGP: ; %bb.0: -; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v2, 0x80000001 -; CGP-NEXT: v_mul_hi_i32 v3, v0, v2 -; CGP-NEXT: v_mul_hi_i32 v2, v1, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; CGP-NEXT: v_ashrrev_i32_e32 v0, 11, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v1, 11, v1 -; CGP-NEXT: v_lshrrev_b32_e32 v2, 31, v0 -; CGP-NEXT: v_lshrrev_b32_e32 v3, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: s_setpc_b64 s[30:31] +; CHECK-LABEL: v_sdiv_v2i32_pow2k_denom: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 20, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 12, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, ret <2 x i32> %result }
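
A note on the regenerated CHECK lines above: both output shapes are the standard signed-division-by-constant rewrites. Power-of-two divisors lower to a sign fixup plus an arithmetic shift, and general divisors lower to a high multiply by a "magic" constant plus shifts, the same scheme used by TargetLowering::BuildSDIV and by the buildSDivUsingMul path this patch generalizes. Below is a minimal, self-contained C++ model of what the emitted sequences compute. It is a sketch for illustration only: the helper names (sdiv_pow2, magicS, sdiv_magic) are not LLVM APIs, the magic computation follows Hacker's Delight figure 10-1, and it assumes the host performs arithmetic right shift on signed values (true on all supported hosts).

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Round-toward-zero sdiv by 2^k (1 <= k <= 31): bias negative inputs by
// 2^k - 1, then arithmetic-shift. This is the asr/add-lsr/asr sequence in
// the CHECK lines, e.g. for x/16:
//   asr w8, w0, #31 ; add w8, w0, w8, lsr #28 ; asr w0, w8, #4
int32_t sdiv_pow2(int32_t n, unsigned k) {
  int32_t sign = n >> 31;                      // all ones iff n < 0 (G_ASHR)
  n += (int32_t)((uint32_t)sign >> (32 - k));  // += 2^k - 1 iff n < 0
  return n >> k;                               // G_ASHR
}

// Signed magic-number computation (Hacker's Delight, figure 10-1).
// Requires d != 0 and |d| >= 2.
struct Magic { int32_t M; unsigned s; };
Magic magicS(int32_t d) {
  const uint32_t two31 = 0x80000000u;
  uint32_t ad = d < 0 ? 0u - (uint32_t)d : (uint32_t)d;
  uint32_t t = two31 + ((uint32_t)d >> 31);
  uint32_t anc = t - 1 - t % ad;               // |nc|, largest safe dividend
  unsigned p = 31;
  uint32_t q1 = two31 / anc, r1 = two31 - q1 * anc;
  uint32_t q2 = two31 / ad, r2 = two31 - q2 * ad;
  uint32_t delta;
  do {                                         // smallest p with 2^p > |nc|*delta
    ++p;
    q1 = 2 * q1; r1 = 2 * r1;
    if (r1 >= anc) { ++q1; r1 -= anc; }
    q2 = 2 * q2; r2 = 2 * r2;
    if (r2 >= ad) { ++q2; r2 -= ad; }
    delta = ad - r2;
  } while (q1 < delta || (q1 == delta && r1 == 0));
  Magic mag;
  mag.M = (int32_t)(q2 + 1);
  if (d < 0) mag.M = -mag.M;
  mag.s = p - 32;
  return mag;
}

// n / d via a high multiply plus fixups, the shape buildSDivUsingMul emits
// for the general (non-exact, non-power-of-two) case.
int32_t sdiv_magic(int32_t n, int32_t d) {
  Magic mag = magicS(d);
  int32_t q = (int32_t)(((int64_t)mag.M * n) >> 32); // G_SMULH
  if (d > 0 && mag.M < 0) q += n;              // magic overflowed: add dividend
  if (d < 0 && mag.M > 0) q -= n;
  q >>= mag.s;                                 // G_ASHR
  q += (int32_t)((uint32_t)q >> 31);           // +1 if q < 0 (round toward zero)
  return q;
}

int main() {
  for (int32_t n = -1000; n <= 1000; ++n) {
    assert(sdiv_pow2(n, 4) == n / 16);
    for (int32_t d : {3, 7, -5, 12})
      assert(sdiv_magic(n, d) == n / d);
  }
  return 0;
}

The reordering of intdiv_combines in Combine.td is what moves tests such as v_sdiv_i32_pow2k_denom from the old v_mul_hi_i32 form to the cheaper shift form: udiv_by_pow2 and sdiv_by_pow2 now get a chance to fire before the general by-constant rules claim the G_SDIV.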