diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 7d7b5364d6b68..31f1197b9723b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -143,6 +143,10 @@ class CombinerHelper {
   /// Query is legal on the target.
   bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const;
 
+  /// \return true if \p Query is legal on the target, or if \p Query will
+  /// perform WidenScalar action on the target.
+  bool isLegalOrHasWidenScalar(const LegalityQuery &Query) const;
+
   /// \return true if the combine is running prior to legalization, or if \p Ty
   /// is a legal integer constant type on the target.
   bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 6033d80e717d3..66051d756c808 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1131,13 +1131,13 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
 
 def udiv_by_const : GICombineRule<
   (defs root:$root),
-  (match (wip_match_opcode G_UDIV):$root,
+  (match (G_UDIV $dst, $x, $y):$root,
    [{ return Helper.matchUDivorURemByConst(*${root}); }]),
   (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
 
 def sdiv_by_const : GICombineRule<
   (defs root:$root),
-  (match (wip_match_opcode G_SDIV):$root,
+  (match (G_SDIV $dst, $x, $y):$root,
    [{ return Helper.matchSDivByConst(*${root}); }]),
   (apply [{ Helper.applySDivByConst(*${root}); }])>;
 
@@ -1153,8 +1153,8 @@ def udiv_by_pow2 : GICombineRule<
    [{ return Helper.matchDivByPow2(*${root}, /*IsSigned=*/false); }]),
   (apply [{ Helper.applyUDivByPow2(*${root}); }])>;
 
-def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const,
-                                      sdiv_by_pow2, udiv_by_pow2]>;
+def intdiv_combines : GICombineGroup<[udiv_by_pow2, sdiv_by_pow2,
+                                      udiv_by_const, sdiv_by_const,]>;
 
 def urem_by_const : GICombineRule<
   (defs root:$root),
@@ -2054,9 +2054,9 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     div_rem_to_divrem, funnel_shift_combines, bitreverse_shift, commute_shift,
     form_bitfield_extract, constant_fold_binops, constant_fold_fma,
     constant_fold_cast_op, fabs_fneg_fold,
-    intdiv_combines, mulh_combines, redundant_neg_operands,
+    mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
-    intrem_combines, sub_add_reg, select_to_minmax,
+    intrem_combines, intdiv_combines, sub_add_reg, select_to_minmax,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
     simplify_neg_minmax, combine_concat_vector, sext_trunc, zext_trunc,
     prefer_sign_combines, shuffle_combines,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 3b11d0848d300..13091e047b431 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -162,6 +162,11 @@ bool CombinerHelper::isLegalOrBeforeLegalizer(
   return isPreLegalize() || isLegal(Query);
 }
 
+bool CombinerHelper::isLegalOrHasWidenScalar(const LegalityQuery &Query) const {
+  return isLegal(Query) ||
+         LI->getAction(Query).Action == LegalizeActions::WidenScalar;
+}
+
 bool CombinerHelper::isConstantLegalOrBeforeLegalizer(const LLT Ty) const {
   if (!Ty.isVector())
     return isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {Ty}});
@@ -5522,6 +5527,8 @@ bool
CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
   Register Dst = MI.getOperand(0).getReg();
   Register RHS = MI.getOperand(2).getReg();
   LLT DstTy = MRI.getType(Dst);
+  auto SizeInBits = DstTy.getScalarSizeInBits();
+  LLT WideTy = DstTy.changeElementSize(SizeInBits * 2);
 
   auto &MF = *MI.getMF();
   AttributeList Attr = MF.getFunction().getAttributes();
@@ -5541,8 +5548,21 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
         MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
   }
 
-  // Don't support the general case for now.
-  return false;
+  auto *RHSDef = MRI.getVRegDef(RHS);
+  if (!isConstantOrConstantVector(*RHSDef, MRI))
+    return false;
+
+  // Don't do this if the types are not going to be legal.
+  if (LI) {
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
+      return false;
+    if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) &&
+        !isLegalOrHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}}))
+      return false;
+  }
+
+  return matchUnaryPredicate(
+      MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
 }
 
 void CombinerHelper::applySDivByConst(MachineInstr &MI) const {
@@ -5558,21 +5578,22 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
   Register RHS = SDiv.getReg(2);
   LLT Ty = MRI.getType(Dst);
   LLT ScalarTy = Ty.getScalarType();
+  const unsigned EltBits = ScalarTy.getScalarSizeInBits();
   LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
   LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
   auto &MIB = Builder;
 
   bool UseSRA = false;
-  SmallVector<Register, 16> Shifts, Factors;
+  SmallVector<Register, 16> ExactShifts, ExactFactors;
 
-  auto *RHSDef = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
-  bool IsSplat = getIConstantSplatVal(*RHSDef, MRI).has_value();
+  auto *RHSDefInstr = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
+  bool IsSplat = getIConstantSplatVal(*RHSDefInstr, MRI).has_value();
 
-  auto BuildSDIVPattern = [&](const Constant *C) {
+  auto BuildExactSDIVPattern = [&](const Constant *C) {
     // Don't recompute inverses for each splat element.
-    if (IsSplat && !Factors.empty()) {
-      Shifts.push_back(Shifts[0]);
-      Factors.push_back(Factors[0]);
+    if (IsSplat && !ExactFactors.empty()) {
+      ExactShifts.push_back(ExactShifts[0]);
+      ExactFactors.push_back(ExactFactors[0]);
       return true;
     }
 
@@ -5587,31 +5608,104 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
     // Calculate the multiplicative inverse modulo BW.
     // 2^W requires W + 1 bits, so we have to extend and then truncate.
     APInt Factor = Divisor.multiplicativeInverse();
-    Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
-    Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
+    ExactShifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
+    ExactFactors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
     return true;
   };
 
-  // Collect all magic values from the build vector.
+  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+    // Collect all magic values from the build vector.
+    bool Matched = matchUnaryPredicate(MRI, RHS, BuildExactSDIVPattern);
+    (void)Matched;
+    assert(Matched && "Expected unary predicate match to succeed");
+
+    Register Shift, Factor;
+    if (Ty.isVector()) {
+      Shift = MIB.buildBuildVector(ShiftAmtTy, ExactShifts).getReg(0);
+      Factor = MIB.buildBuildVector(Ty, ExactFactors).getReg(0);
+    } else {
+      Shift = ExactShifts[0];
+      Factor = ExactFactors[0];
+    }
+
+    Register Res = LHS;
+
+    if (UseSRA)
+      Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+
+    return MIB.buildMul(Ty, Res, Factor);
+  }
+
+  SmallVector<Register, 16> MagicFactors, Factors, Shifts, ShiftMasks;
+
+  auto BuildSDIVPattern = [&](const Constant *C) {
+    auto *CI = cast<ConstantInt>(C);
+    const APInt &Divisor = CI->getValue();
+
+    SignedDivisionByConstantInfo magics =
+        SignedDivisionByConstantInfo::get(Divisor);
+    int NumeratorFactor = 0;
+    int ShiftMask = -1;
+
+    if (Divisor.isOne() || Divisor.isAllOnes()) {
+      // If d is +1/-1, we just multiply the numerator by +1/-1.
+      NumeratorFactor = Divisor.getSExtValue();
+      magics.Magic = 0;
+      magics.ShiftAmount = 0;
+      ShiftMask = 0;
+    } else if (Divisor.isStrictlyPositive() && magics.Magic.isNegative()) {
+      // If d > 0 and m < 0, add the numerator.
+      NumeratorFactor = 1;
+    } else if (Divisor.isNegative() && magics.Magic.isStrictlyPositive()) {
+      // If d < 0 and m > 0, subtract the numerator.
+      NumeratorFactor = -1;
+    }
+
+    MagicFactors.push_back(MIB.buildConstant(ScalarTy, magics.Magic).getReg(0));
+    Factors.push_back(MIB.buildConstant(ScalarTy, NumeratorFactor).getReg(0));
+    Shifts.push_back(
+        MIB.buildConstant(ScalarShiftAmtTy, magics.ShiftAmount).getReg(0));
+    ShiftMasks.push_back(MIB.buildConstant(ScalarTy, ShiftMask).getReg(0));
+
+    return true;
+  };
+
+  // Collect the shifts/magic values from each element.
   bool Matched = matchUnaryPredicate(MRI, RHS, BuildSDIVPattern);
   (void)Matched;
   assert(Matched && "Expected unary predicate match to succeed");
 
-  Register Shift, Factor;
-  if (Ty.isVector()) {
-    Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+  Register MagicFactor, Factor, Shift, ShiftMask;
+  auto *RHSDef = getOpcodeDef<GBuildVector>(RHS, MRI);
+  if (RHSDef) {
+    MagicFactor = MIB.buildBuildVector(Ty, MagicFactors).getReg(0);
     Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
+    Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+    ShiftMask = MIB.buildBuildVector(Ty, ShiftMasks).getReg(0);
   } else {
-    Shift = Shifts[0];
+    assert(MRI.getType(RHS).isScalar() &&
+           "Non-build_vector operation should have been a scalar");
+    MagicFactor = MagicFactors[0];
     Factor = Factors[0];
+    Shift = Shifts[0];
+    ShiftMask = ShiftMasks[0];
   }
 
-  Register Res = LHS;
+  Register Q = LHS;
+  Q = MIB.buildSMulH(Ty, LHS, MagicFactor).getReg(0);
+
+  // (Optionally) Add/subtract the numerator using Factor.
+  Factor = MIB.buildMul(Ty, LHS, Factor).getReg(0);
+  Q = MIB.buildAdd(Ty, Q, Factor).getReg(0);
 
-  if (UseSRA)
-    Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+  // Shift right algebraic by shift value.
+  Q = MIB.buildAShr(Ty, Q, Shift).getReg(0);
 
-  return MIB.buildMul(Ty, Res, Factor);
+  // Extract the sign bit, mask it and add it to the quotient.
+ auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1); + auto T = MIB.buildLShr(Ty, Q, SignShift); + T = MIB.buildAnd(Ty, T, ShiftMask); + return MIB.buildAdd(Ty, Q, T); } bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir index e99ee84100a39..fc73245e2b79b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir @@ -45,9 +45,14 @@ body: | ; CHECK: liveins: $w0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 104 - ; CHECK-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[COPY]], [[C]] - ; CHECK-NEXT: $w0 = COPY [[SDIV]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[SMULH:%[0-9]+]]:_(s32) = G_SMULH [[COPY]], [[C]] + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SMULH]], [[C1]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[ASHR]], [[C2]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[LSHR]] + ; CHECK-NEXT: $w0 = COPY [[ADD]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s32) = COPY $w0 %1:_(s32) = G_CONSTANT i32 104 diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll index bdbebd8726fde..7aa6b77cf3524 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll @@ -15,56 +15,13 @@ define <16 x i8> @div16xi8(<16 x i8> %x) { ; ; CHECK-GI-LABEL: div16xi8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: smov w9, v0.b[0] -; CHECK-GI-NEXT: mov w8, #25 // =0x19 -; CHECK-GI-NEXT: smov w10, v0.b[1] -; CHECK-GI-NEXT: smov w11, v0.b[2] -; CHECK-GI-NEXT: smov w12, v0.b[3] -; CHECK-GI-NEXT: smov w13, v0.b[4] -; CHECK-GI-NEXT: smov w14, v0.b[5] -; CHECK-GI-NEXT: smov w15, v0.b[6] -; CHECK-GI-NEXT: smov w16, v0.b[7] -; CHECK-GI-NEXT: smov w17, v0.b[8] -; CHECK-GI-NEXT: smov w18, v0.b[9] -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.b[1], w10 -; CHECK-GI-NEXT: smov w10, v0.b[10] -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v1.b[2], w11 -; CHECK-GI-NEXT: smov w11, v0.b[11] -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: mov v1.b[3], w12 -; CHECK-GI-NEXT: smov w12, v0.b[12] -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v1.b[4], w13 -; CHECK-GI-NEXT: smov w13, v0.b[13] -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v1.b[5], w14 -; CHECK-GI-NEXT: sdiv w16, w16, w8 -; CHECK-GI-NEXT: mov v1.b[6], w15 -; CHECK-GI-NEXT: sdiv w17, w17, w8 -; CHECK-GI-NEXT: mov v1.b[7], w16 -; CHECK-GI-NEXT: sdiv w9, w18, w8 -; CHECK-GI-NEXT: mov v1.b[8], w17 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: mov v1.b[9], w9 -; CHECK-GI-NEXT: smov w9, v0.b[14] -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.b[10], w10 -; CHECK-GI-NEXT: smov w10, v0.b[15] -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v1.b[11], w11 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: mov v1.b[12], w12 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: mov v1.b[13], w13 -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: mov v1.b[14], w9 -; CHECK-GI-NEXT: mov v1.b[15], w8 -; 
CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: movi v1.16b, #41 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: sshr v0.16b, v1.16b, #2 +; CHECK-GI-NEXT: ushr v0.16b, v0.16b, #7 +; CHECK-GI-NEXT: ssra v0.16b, v1.16b, #2 ; CHECK-GI-NEXT: ret %div = sdiv <16 x i8> %x, ret <16 x i8> %div @@ -85,32 +42,15 @@ define <8 x i16> @div8xi16(<8 x i16> %x) { ; ; CHECK-GI-LABEL: div8xi16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: smov w9, v0.h[0] -; CHECK-GI-NEXT: mov w8, #6577 // =0x19b1 -; CHECK-GI-NEXT: smov w10, v0.h[1] -; CHECK-GI-NEXT: smov w11, v0.h[2] -; CHECK-GI-NEXT: smov w12, v0.h[3] -; CHECK-GI-NEXT: smov w13, v0.h[4] -; CHECK-GI-NEXT: smov w14, v0.h[5] -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.h[1], w10 -; CHECK-GI-NEXT: smov w10, v0.h[6] -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v1.h[2], w11 -; CHECK-GI-NEXT: smov w11, v0.h[7] -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: mov v1.h[3], w12 -; CHECK-GI-NEXT: sdiv w9, w14, w8 -; CHECK-GI-NEXT: mov v1.h[4], w13 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: mov v1.h[5], w9 -; CHECK-GI-NEXT: sdiv w8, w11, w8 -; CHECK-GI-NEXT: mov v1.h[6], w10 -; CHECK-GI-NEXT: mov v1.h[7], w8 -; CHECK-GI-NEXT: mov v0.16b, v1.16b +; CHECK-GI-NEXT: adrp x8, .LCPI1_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: add v1.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: sshr v0.8h, v1.8h, #12 +; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #15 +; CHECK-GI-NEXT: ssra v0.8h, v1.8h, #12 ; CHECK-GI-NEXT: ret %div = sdiv <8 x i16> %x, ret <8 x i16> %div @@ -131,20 +71,14 @@ define <4 x i32> @div32xi4(<4 x i32> %x) { ; ; CHECK-GI-LABEL: div32xi4: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #39957 // =0x9c15 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: movk w8, #145, lsl #16 -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s0, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v0.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v0.s[2], w11 -; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: adrp x8, .LCPI2_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #22 +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #31 +; CHECK-GI-NEXT: ssra v0.4s, v1.4s, #22 ; CHECK-GI-NEXT: ret %div = sdiv <4 x i32> %x, ret <4 x i32> %div diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll new file mode 100644 index 0000000000000..b6a2146e430e7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll @@ -0,0 +1,1536 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel=1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; These tests are taken from the combine-udiv.ll in X86. 
+define i32 @combine_sdiv_by_one(i32 %x) { +; CHECK-LABEL: combine_sdiv_by_one: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %1 = sdiv i32 %x, 1 + ret i32 %1 +} + +define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) { +; CHECK-LABEL: combine_vec_sdiv_by_one: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define i32 @combine_sdiv_by_negone(i32 %x) { +; CHECK-LABEL: combine_sdiv_by_negone: +; CHECK: // %bb.0: +; CHECK-NEXT: neg w0, w0 +; CHECK-NEXT: ret + %1 = sdiv i32 %x, -1 + ret i32 %1 +} + +define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) { +; CHECK-LABEL: combine_vec_sdiv_by_negone: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v0.4s, v0.4s +; CHECK-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define i32 @combine_sdiv_by_minsigned(i32 %x) { +; CHECK-SD-LABEL: combine_sdiv_by_minsigned: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #-2147483648 // =0x80000000 +; CHECK-SD-NEXT: cmp w0, w8 +; CHECK-SD-NEXT: cset w0, eq +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_sdiv_by_minsigned: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: asr w8, w0, #31 +; CHECK-GI-NEXT: add w8, w0, w8, lsr #1 +; CHECK-GI-NEXT: neg w0, w8, asr #31 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, -2147483648 + ret i32 %1 +} + +define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_minsigned: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v1.4s, #128, lsl #24 +; CHECK-SD-NEXT: movi v2.4s, #1 +; CHECK-SD-NEXT: cmeq v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_minsigned: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sshr v1.4s, v0.4s, #31 +; CHECK-GI-NEXT: usra v0.4s, v1.4s, #1 +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31 +; CHECK-GI-NEXT: neg v0.4s, v0.4s +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define i32 @combine_sdiv_zero(i32 %x) { +; CHECK-LABEL: combine_sdiv_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %1 = sdiv i32 0, %x + ret i32 %1 +} + +define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) { +; CHECK-LABEL: combine_vec_sdiv_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ret + %1 = sdiv <4 x i32> zeroinitializer, %x + ret <4 x i32> %1 +} + +define i32 @combine_sdiv_dupe(i32 %x) { +; CHECK-SD-LABEL: combine_sdiv_dupe: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w0, #1 // =0x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_sdiv_dupe: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sdiv w0, w0, w0 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, %x + ret i32 %1 +} + +define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_dupe: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.4s, #1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_dupe: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: sdiv w8, w8, w8 +; CHECK-GI-NEXT: sdiv w9, w9, w9 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w10 +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: sdiv w8, w11, w11 +; CHECK-GI-NEXT: mov v0.s[2], w10 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, %x + ret <4 x i32> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pos0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v1.2d, 
#0x0000ff000000ff +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ushr v0.4s, v0.4s, #2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pos0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #2 +; CHECK-GI-NEXT: ret + %1 = and <4 x i32> %x, + %2 = sdiv <4 x i32> %1, + ret <4 x i32> %2 +} + +define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pos1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-SD-NEXT: adrp x8, .LCPI11_0 +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] +; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pos1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI11_0 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI11_0] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: neg v2.4s, v3.4s +; CHECK-GI-NEXT: sshl v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mov v1.s[2], w9 +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ret + %1 = and <4 x i32> %x, + %2 = sdiv <4 x i32> %1, + ret <4 x i32> %2 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2a: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0 +; CHECK-SD-NEXT: usra v0.4s, v1.4s, #30 +; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2a: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sshr v1.4s, v0.4s, #31 +; CHECK-GI-NEXT: usra v0.4s, v1.4s, #30 +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #2 +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2a_neg: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0 +; CHECK-SD-NEXT: usra v0.4s, v1.4s, #30 +; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #2 +; CHECK-SD-NEXT: neg v0.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2a_neg: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sshr v1.4s, v0.4s, #31 +; CHECK-GI-NEXT: usra v0.4s, v1.4s, #30 +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #2 +; CHECK-GI-NEXT: neg v0.4s, v0.4s +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v16i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI14_0 +; CHECK-SD-NEXT: cmlt v1.16b, v0.16b, #0 +; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] +; CHECK-SD-NEXT: adrp x8, .LCPI14_1 +; CHECK-SD-NEXT: movi v4.2d, #0xffffffffffffff00 +; CHECK-SD-NEXT: ushl v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI14_1] +; CHECK-SD-NEXT: add v1.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-SD-NEXT: sshl v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: 
adrp x8, .LCPI14_1 +; CHECK-GI-NEXT: sshr v2.16b, v0.16b, #7 +; CHECK-GI-NEXT: adrp x9, .LCPI14_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_1] +; CHECK-GI-NEXT: adrp x8, .LCPI14_2 +; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI14_0] +; CHECK-GI-NEXT: neg v1.16b, v1.16b +; CHECK-GI-NEXT: shl v3.16b, v3.16b, #7 +; CHECK-GI-NEXT: ushl v1.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI14_2] +; CHECK-GI-NEXT: neg v2.16b, v2.16b +; CHECK-GI-NEXT: add v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: sshl v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: sshr v2.16b, v3.16b, #7 +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <16 x i8> %x, + ret <16 x i8> %1 +} + +define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v8i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI15_1 +; CHECK-SD-NEXT: cmlt v1.8h, v0.8h, #0 +; CHECK-SD-NEXT: adrp x9, .LCPI15_3 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] +; CHECK-SD-NEXT: adrp x8, .LCPI15_2 +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI15_3] +; CHECK-SD-NEXT: ushl v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI15_2] +; CHECK-SD-NEXT: adrp x8, .LCPI15_0 +; CHECK-SD-NEXT: add v1.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: sshl v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v8i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI15_1 +; CHECK-GI-NEXT: sshr v2.8h, v0.8h, #15 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_1] +; CHECK-GI-NEXT: adrp x8, .LCPI15_0 +; CHECK-GI-NEXT: ldr d3, [x8, :lo12:.LCPI15_0] +; CHECK-GI-NEXT: adrp x8, .LCPI15_2 +; CHECK-GI-NEXT: neg v1.8h, v1.8h +; CHECK-GI-NEXT: ushl v1.8h, v2.8h, v1.8h +; CHECK-GI-NEXT: ushll v2.8h, v3.8b, #0 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI15_2] +; CHECK-GI-NEXT: neg v3.8h, v3.8h +; CHECK-GI-NEXT: add v1.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shl v2.8h, v2.8h, #15 +; CHECK-GI-NEXT: sshl v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: sshr v2.8h, v2.8h, #15 +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v16i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI16_1 +; CHECK-SD-NEXT: cmlt v2.8h, v0.8h, #0 +; CHECK-SD-NEXT: cmlt v3.8h, v1.8h, #0 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI16_1] +; CHECK-SD-NEXT: adrp x8, .LCPI16_2 +; CHECK-SD-NEXT: ushl v2.8h, v2.8h, v4.8h +; CHECK-SD-NEXT: ushl v3.8h, v3.8h, v4.8h +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI16_2] +; CHECK-SD-NEXT: adrp x8, .LCPI16_0 +; CHECK-SD-NEXT: ldr q5, [x8, :lo12:.LCPI16_0] +; CHECK-SD-NEXT: adrp x8, .LCPI16_3 +; CHECK-SD-NEXT: add v2.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: add v3.8h, v1.8h, v3.8h +; CHECK-SD-NEXT: and v0.16b, v0.16b, v5.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v5.16b +; CHECK-SD-NEXT: sshl v2.8h, v2.8h, v4.8h +; CHECK-SD-NEXT: sshl v3.8h, v3.8h, v4.8h +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI16_3] +; CHECK-SD-NEXT: and v2.16b, v2.16b, v4.16b +; CHECK-SD-NEXT: and v3.16b, v3.16b, v4.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI16_1 +; 
CHECK-GI-NEXT: sshr v3.8h, v0.8h, #15 +; CHECK-GI-NEXT: sshr v4.8h, v1.8h, #15 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] +; CHECK-GI-NEXT: adrp x8, .LCPI16_0 +; CHECK-GI-NEXT: ldr d5, [x8, :lo12:.LCPI16_0] +; CHECK-GI-NEXT: adrp x8, .LCPI16_2 +; CHECK-GI-NEXT: neg v2.8h, v2.8h +; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-GI-NEXT: ushl v3.8h, v3.8h, v2.8h +; CHECK-GI-NEXT: ushl v2.8h, v4.8h, v2.8h +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI16_2] +; CHECK-GI-NEXT: shl v5.8h, v5.8h, #15 +; CHECK-GI-NEXT: neg v4.8h, v4.8h +; CHECK-GI-NEXT: add v3.8h, v0.8h, v3.8h +; CHECK-GI-NEXT: add v2.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: sshl v3.8h, v3.8h, v4.8h +; CHECK-GI-NEXT: sshl v2.8h, v2.8h, v4.8h +; CHECK-GI-NEXT: sshr v4.8h, v5.8h, #15 +; CHECK-GI-NEXT: bif v0.16b, v3.16b, v4.16b +; CHECK-GI-NEXT: bif v1.16b, v2.16b, v4.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <16 x i16> %x, + ret <16 x i16> %1 +} + +define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v32i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI17_1 +; CHECK-SD-NEXT: cmlt v4.8h, v0.8h, #0 +; CHECK-SD-NEXT: cmlt v5.8h, v1.8h, #0 +; CHECK-SD-NEXT: cmlt v7.8h, v2.8h, #0 +; CHECK-SD-NEXT: cmlt v16.8h, v3.8h, #0 +; CHECK-SD-NEXT: ldr q6, [x8, :lo12:.LCPI17_1] +; CHECK-SD-NEXT: adrp x8, .LCPI17_2 +; CHECK-SD-NEXT: ushl v4.8h, v4.8h, v6.8h +; CHECK-SD-NEXT: ushl v5.8h, v5.8h, v6.8h +; CHECK-SD-NEXT: ushl v7.8h, v7.8h, v6.8h +; CHECK-SD-NEXT: ushl v6.8h, v16.8h, v6.8h +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI17_2] +; CHECK-SD-NEXT: adrp x8, .LCPI17_0 +; CHECK-SD-NEXT: add v4.8h, v0.8h, v4.8h +; CHECK-SD-NEXT: add v5.8h, v1.8h, v5.8h +; CHECK-SD-NEXT: ldr q17, [x8, :lo12:.LCPI17_0] +; CHECK-SD-NEXT: add v7.8h, v2.8h, v7.8h +; CHECK-SD-NEXT: add v6.8h, v3.8h, v6.8h +; CHECK-SD-NEXT: adrp x8, .LCPI17_3 +; CHECK-SD-NEXT: and v0.16b, v0.16b, v17.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v17.16b +; CHECK-SD-NEXT: and v2.16b, v2.16b, v17.16b +; CHECK-SD-NEXT: sshl v4.8h, v4.8h, v16.8h +; CHECK-SD-NEXT: sshl v5.8h, v5.8h, v16.8h +; CHECK-SD-NEXT: and v3.16b, v3.16b, v17.16b +; CHECK-SD-NEXT: sshl v7.8h, v7.8h, v16.8h +; CHECK-SD-NEXT: sshl v6.8h, v6.8h, v16.8h +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI17_3] +; CHECK-SD-NEXT: and v4.16b, v4.16b, v16.16b +; CHECK-SD-NEXT: and v5.16b, v5.16b, v16.16b +; CHECK-SD-NEXT: and v7.16b, v7.16b, v16.16b +; CHECK-SD-NEXT: and v6.16b, v6.16b, v16.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v4.16b +; CHECK-SD-NEXT: orr v1.16b, v1.16b, v5.16b +; CHECK-SD-NEXT: orr v2.16b, v2.16b, v7.16b +; CHECK-SD-NEXT: orr v3.16b, v3.16b, v6.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v32i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI17_1 +; CHECK-GI-NEXT: sshr v5.8h, v0.8h, #15 +; CHECK-GI-NEXT: sshr v6.8h, v1.8h, #15 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI17_1] +; CHECK-GI-NEXT: adrp x8, .LCPI17_0 +; CHECK-GI-NEXT: sshr v7.8h, v2.8h, #15 +; CHECK-GI-NEXT: sshr v16.8h, v3.8h, #15 +; CHECK-GI-NEXT: ldr d17, [x8, :lo12:.LCPI17_0] +; CHECK-GI-NEXT: adrp x8, .LCPI17_2 +; CHECK-GI-NEXT: neg v4.8h, v4.8h +; CHECK-GI-NEXT: ushll v17.8h, v17.8b, #0 +; CHECK-GI-NEXT: ushl v5.8h, v5.8h, v4.8h +; CHECK-GI-NEXT: ushl v6.8h, v6.8h, v4.8h +; CHECK-GI-NEXT: ushl v7.8h, v7.8h, v4.8h +; CHECK-GI-NEXT: ushl v4.8h, v16.8h, v4.8h +; CHECK-GI-NEXT: ldr q16, [x8, :lo12:.LCPI17_2] +; CHECK-GI-NEXT: shl v17.8h, v17.8h, #15 +; CHECK-GI-NEXT: neg v16.8h, v16.8h +; CHECK-GI-NEXT: add v5.8h, v0.8h, v5.8h +; CHECK-GI-NEXT: add v6.8h, v1.8h, 
v6.8h +; CHECK-GI-NEXT: add v7.8h, v2.8h, v7.8h +; CHECK-GI-NEXT: add v4.8h, v3.8h, v4.8h +; CHECK-GI-NEXT: sshr v17.8h, v17.8h, #15 +; CHECK-GI-NEXT: sshl v5.8h, v5.8h, v16.8h +; CHECK-GI-NEXT: sshl v6.8h, v6.8h, v16.8h +; CHECK-GI-NEXT: sshl v7.8h, v7.8h, v16.8h +; CHECK-GI-NEXT: sshl v4.8h, v4.8h, v16.8h +; CHECK-GI-NEXT: bif v0.16b, v5.16b, v17.16b +; CHECK-GI-NEXT: bif v1.16b, v6.16b, v17.16b +; CHECK-GI-NEXT: bif v2.16b, v7.16b, v17.16b +; CHECK-GI-NEXT: bif v3.16b, v4.16b, v17.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <32 x i16> %x, + ret <32 x i16> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI18_0 +; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] +; CHECK-SD-NEXT: adrp x8, .LCPI18_1 +; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] +; CHECK-SD-NEXT: adrp x8, .LCPI18_2 +; CHECK-SD-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI18_2] +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: sshr v3.4s, v0.4s, #31 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI18_0 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] +; CHECK-GI-NEXT: adrp x8, .LCPI18_1 +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: neg v2.4s, v2.4s +; CHECK-GI-NEXT: ushl v2.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI18_1] +; CHECK-GI-NEXT: mov v1.s[2], w9 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: sshl v2.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v8i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI19_0 +; CHECK-SD-NEXT: cmlt v2.4s, v0.4s, #0 +; CHECK-SD-NEXT: cmlt v3.4s, v1.4s, #0 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI19_0] +; CHECK-SD-NEXT: adrp x8, .LCPI19_1 +; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v4.4s +; CHECK-SD-NEXT: ushl v3.4s, v3.4s, v4.4s +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI19_1] +; CHECK-SD-NEXT: adrp x8, .LCPI19_2 +; CHECK-SD-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-SD-NEXT: sshl v2.4s, v2.4s, v4.4s +; CHECK-SD-NEXT: sshl v3.4s, v3.4s, v4.4s +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI19_2] +; CHECK-SD-NEXT: bif v0.16b, v2.16b, v4.16b +; CHECK-SD-NEXT: bif v1.16b, v3.16b, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v8i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: sshr v4.4s, v0.4s, #31 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI19_0 +; CHECK-GI-NEXT: sshr v5.4s, v1.4s, #31 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI19_0] +; CHECK-GI-NEXT: adrp x8, .LCPI19_1 +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: ushl v4.4s, v4.4s, v3.4s +; CHECK-GI-NEXT: ushl v3.4s, v5.4s, v3.4s +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI19_1] +; CHECK-GI-NEXT: mov v2.h[2], 
w9 +; CHECK-GI-NEXT: neg v5.4s, v5.4s +; CHECK-GI-NEXT: add v4.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: mov v2.h[3], w9 +; CHECK-GI-NEXT: sshl v4.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: sshl v3.4s, v3.4s, v5.4s +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: shl v2.4s, v2.4s, #31 +; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v4.16b, v2.16b +; CHECK-GI-NEXT: bif v1.16b, v3.16b, v2.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i32> %x, + ret <8 x i32> %1 +} + +define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v16i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI20_0 +; CHECK-SD-NEXT: cmlt v4.4s, v0.4s, #0 +; CHECK-SD-NEXT: cmlt v5.4s, v1.4s, #0 +; CHECK-SD-NEXT: cmlt v7.4s, v2.4s, #0 +; CHECK-SD-NEXT: cmlt v16.4s, v3.4s, #0 +; CHECK-SD-NEXT: ldr q6, [x8, :lo12:.LCPI20_0] +; CHECK-SD-NEXT: adrp x8, .LCPI20_1 +; CHECK-SD-NEXT: ushl v4.4s, v4.4s, v6.4s +; CHECK-SD-NEXT: ushl v5.4s, v5.4s, v6.4s +; CHECK-SD-NEXT: ushl v7.4s, v7.4s, v6.4s +; CHECK-SD-NEXT: ushl v6.4s, v16.4s, v6.4s +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI20_1] +; CHECK-SD-NEXT: adrp x8, .LCPI20_2 +; CHECK-SD-NEXT: add v4.4s, v0.4s, v4.4s +; CHECK-SD-NEXT: add v5.4s, v1.4s, v5.4s +; CHECK-SD-NEXT: add v7.4s, v2.4s, v7.4s +; CHECK-SD-NEXT: add v6.4s, v3.4s, v6.4s +; CHECK-SD-NEXT: sshl v4.4s, v4.4s, v16.4s +; CHECK-SD-NEXT: sshl v5.4s, v5.4s, v16.4s +; CHECK-SD-NEXT: sshl v7.4s, v7.4s, v16.4s +; CHECK-SD-NEXT: sshl v6.4s, v6.4s, v16.4s +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI20_2] +; CHECK-SD-NEXT: bif v0.16b, v4.16b, v16.16b +; CHECK-SD-NEXT: bif v1.16b, v5.16b, v16.16b +; CHECK-SD-NEXT: bif v2.16b, v7.16b, v16.16b +; CHECK-SD-NEXT: bif v3.16b, v6.16b, v16.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: sshr v6.4s, v0.4s, #31 +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI20_0 +; CHECK-GI-NEXT: sshr v7.4s, v1.4s, #31 +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI20_0] +; CHECK-GI-NEXT: sshr v16.4s, v2.4s, #31 +; CHECK-GI-NEXT: sshr v17.4s, v3.4s, #31 +; CHECK-GI-NEXT: adrp x8, .LCPI20_1 +; CHECK-GI-NEXT: mov v4.h[1], w9 +; CHECK-GI-NEXT: neg v5.4s, v5.4s +; CHECK-GI-NEXT: ushl v6.4s, v6.4s, v5.4s +; CHECK-GI-NEXT: ushl v7.4s, v7.4s, v5.4s +; CHECK-GI-NEXT: ushl v16.4s, v16.4s, v5.4s +; CHECK-GI-NEXT: mov v4.h[2], w9 +; CHECK-GI-NEXT: ushl v5.4s, v17.4s, v5.4s +; CHECK-GI-NEXT: ldr q17, [x8, :lo12:.LCPI20_1] +; CHECK-GI-NEXT: neg v17.4s, v17.4s +; CHECK-GI-NEXT: add v6.4s, v0.4s, v6.4s +; CHECK-GI-NEXT: add v7.4s, v1.4s, v7.4s +; CHECK-GI-NEXT: add v16.4s, v2.4s, v16.4s +; CHECK-GI-NEXT: add v5.4s, v3.4s, v5.4s +; CHECK-GI-NEXT: mov v4.h[3], w9 +; CHECK-GI-NEXT: sshl v6.4s, v6.4s, v17.4s +; CHECK-GI-NEXT: sshl v7.4s, v7.4s, v17.4s +; CHECK-GI-NEXT: sshl v16.4s, v16.4s, v17.4s +; CHECK-GI-NEXT: sshl v5.4s, v5.4s, v17.4s +; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: shl v4.4s, v4.4s, #31 +; CHECK-GI-NEXT: sshr v4.4s, v4.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v6.16b, v4.16b +; CHECK-GI-NEXT: bif v1.16b, v7.16b, v4.16b +; CHECK-GI-NEXT: bif v2.16b, v16.16b, v4.16b +; CHECK-GI-NEXT: bif v3.16b, v5.16b, v4.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <16 x i32> %x, + ret <16 x i32> %1 +} + +define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v2i64: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: adrp x8, .LCPI21_0 +; CHECK-SD-NEXT: cmlt v1.2d, v0.2d, #0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI21_0] +; CHECK-SD-NEXT: adrp x8, .LCPI21_1 +; CHECK-SD-NEXT: ushl v1.2d, v1.2d, v2.2d +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI21_1] +; CHECK-SD-NEXT: adrp x8, .LCPI21_2 +; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: sshl v1.2d, v1.2d, v2.2d +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI21_2] +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v2i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI21_1 +; CHECK-GI-NEXT: sshr v2.2d, v0.2d, #63 +; CHECK-GI-NEXT: adrp x9, .LCPI21_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI21_1] +; CHECK-GI-NEXT: adrp x8, .LCPI21_2 +; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI21_0] +; CHECK-GI-NEXT: neg v1.2d, v1.2d +; CHECK-GI-NEXT: shl v3.2d, v3.2d, #63 +; CHECK-GI-NEXT: ushl v1.2d, v2.2d, v1.2d +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI21_2] +; CHECK-GI-NEXT: neg v2.2d, v2.2d +; CHECK-GI-NEXT: add v1.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: sshl v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: sshr v2.2d, v3.2d, #63 +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <2 x i64> %x, + ret <2 x i64> %1 +} + +define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v4i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI22_0 +; CHECK-SD-NEXT: cmlt v2.2d, v0.2d, #0 +; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI22_0] +; CHECK-SD-NEXT: adrp x8, .LCPI22_3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] +; CHECK-SD-NEXT: adrp x8, .LCPI22_1 +; CHECK-SD-NEXT: ushl v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: cmlt v3.2d, v1.2d, #0 +; CHECK-SD-NEXT: add v2.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: ushl v3.2d, v3.2d, v4.2d +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI22_1] +; CHECK-SD-NEXT: adrp x8, .LCPI22_2 +; CHECK-SD-NEXT: sshl v2.2d, v2.2d, v4.2d +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI22_2] +; CHECK-SD-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-SD-NEXT: adrp x8, .LCPI22_4 +; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI22_4] +; CHECK-SD-NEXT: bif v0.16b, v2.16b, v4.16b +; CHECK-SD-NEXT: sshl v1.2d, v1.2d, v3.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v4i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI22_2 +; CHECK-GI-NEXT: sshr v3.2d, v0.2d, #63 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI22_2] +; CHECK-GI-NEXT: adrp x8, .LCPI22_1 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI22_1] +; CHECK-GI-NEXT: adrp x8, .LCPI22_4 +; CHECK-GI-NEXT: neg v2.2d, v2.2d +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI22_4] +; CHECK-GI-NEXT: adrp x8, .LCPI22_0 +; CHECK-GI-NEXT: neg v4.2d, v4.2d +; CHECK-GI-NEXT: ldr q6, [x8, :lo12:.LCPI22_0] +; CHECK-GI-NEXT: adrp x8, .LCPI22_3 +; CHECK-GI-NEXT: neg v5.2d, v5.2d +; CHECK-GI-NEXT: ushl v2.2d, v3.2d, v2.2d +; CHECK-GI-NEXT: sshr v3.2d, v1.2d, #63 +; CHECK-GI-NEXT: shl v6.2d, v6.2d, #63 +; CHECK-GI-NEXT: add v2.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: ushl v3.2d, v3.2d, v4.2d +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] +; CHECK-GI-NEXT: sshl v2.2d, v2.2d, v5.2d +; CHECK-GI-NEXT: sshr v5.2d, v6.2d, #63 +; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: neg v3.2d, v4.2d +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v5.16b +; CHECK-GI-NEXT: sshl v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i64> %x, + ret <4 x i64> %1 +} + +define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v8i64: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: adrp x8, .LCPI23_0 +; CHECK-SD-NEXT: cmlt v4.2d, v0.2d, #0 +; CHECK-SD-NEXT: cmlt v6.2d, v2.2d, #0 +; CHECK-SD-NEXT: ldr q5, [x8, :lo12:.LCPI23_0] +; CHECK-SD-NEXT: adrp x8, .LCPI23_3 +; CHECK-SD-NEXT: cmlt v7.2d, v3.2d, #0 +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI23_3] +; CHECK-SD-NEXT: adrp x8, .LCPI23_1 +; CHECK-SD-NEXT: ushl v4.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: ushl v5.2d, v6.2d, v5.2d +; CHECK-SD-NEXT: cmlt v6.2d, v1.2d, #0 +; CHECK-SD-NEXT: ldr q17, [x8, :lo12:.LCPI23_1] +; CHECK-SD-NEXT: ushl v7.2d, v7.2d, v16.2d +; CHECK-SD-NEXT: adrp x8, .LCPI23_2 +; CHECK-SD-NEXT: add v4.2d, v0.2d, v4.2d +; CHECK-SD-NEXT: add v5.2d, v2.2d, v5.2d +; CHECK-SD-NEXT: ushl v6.2d, v6.2d, v16.2d +; CHECK-SD-NEXT: ldr q16, [x8, :lo12:.LCPI23_2] +; CHECK-SD-NEXT: adrp x8, .LCPI23_4 +; CHECK-SD-NEXT: add v3.2d, v3.2d, v7.2d +; CHECK-SD-NEXT: sshl v4.2d, v4.2d, v17.2d +; CHECK-SD-NEXT: sshl v5.2d, v5.2d, v17.2d +; CHECK-SD-NEXT: add v1.2d, v1.2d, v6.2d +; CHECK-SD-NEXT: ldr q6, [x8, :lo12:.LCPI23_4] +; CHECK-SD-NEXT: bif v0.16b, v4.16b, v16.16b +; CHECK-SD-NEXT: bif v2.16b, v5.16b, v16.16b +; CHECK-SD-NEXT: sshl v1.2d, v1.2d, v6.2d +; CHECK-SD-NEXT: sshl v3.2d, v3.2d, v6.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v8i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: sshr v7.2d, v0.2d, #63 +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI23_1 +; CHECK-GI-NEXT: sshr v16.2d, v1.2d, #63 +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI23_1] +; CHECK-GI-NEXT: sshr v17.2d, v2.2d, #63 +; CHECK-GI-NEXT: sshr v18.2d, v3.2d, #63 +; CHECK-GI-NEXT: adrp x8, .LCPI23_3 +; CHECK-GI-NEXT: mov v4.h[1], w9 +; CHECK-GI-NEXT: neg v5.2d, v5.2d +; CHECK-GI-NEXT: ldr q19, [x8, :lo12:.LCPI23_3] +; CHECK-GI-NEXT: neg v19.2d, v19.2d +; CHECK-GI-NEXT: ushl v7.2d, v7.2d, v5.2d +; CHECK-GI-NEXT: ushl v5.2d, v17.2d, v5.2d +; CHECK-GI-NEXT: mov v4.h[2], w9 +; CHECK-GI-NEXT: add v7.2d, v0.2d, v7.2d +; CHECK-GI-NEXT: add v5.2d, v2.2d, v5.2d +; CHECK-GI-NEXT: mov v4.h[3], w9 +; CHECK-GI-NEXT: adrp x9, .LCPI23_0 +; CHECK-GI-NEXT: ldr q6, [x9, :lo12:.LCPI23_0] +; CHECK-GI-NEXT: adrp x9, .LCPI23_2 +; CHECK-GI-NEXT: sshl v7.2d, v7.2d, v19.2d +; CHECK-GI-NEXT: ldr q20, [x9, :lo12:.LCPI23_2] +; CHECK-GI-NEXT: sshl v5.2d, v5.2d, v19.2d +; CHECK-GI-NEXT: neg v6.2d, v6.2d +; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: neg v20.2d, v20.2d +; CHECK-GI-NEXT: ushl v16.2d, v16.2d, v6.2d +; CHECK-GI-NEXT: ushl v6.2d, v18.2d, v6.2d +; CHECK-GI-NEXT: ushll v17.2d, v4.2s, #0 +; CHECK-GI-NEXT: ushll2 v18.2d, v4.4s, #0 +; CHECK-GI-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-GI-NEXT: add v16.2d, v1.2d, v16.2d +; CHECK-GI-NEXT: add v6.2d, v3.2d, v6.2d +; CHECK-GI-NEXT: shl v17.2d, v17.2d, #63 +; CHECK-GI-NEXT: shl v18.2d, v18.2d, #63 +; CHECK-GI-NEXT: shl v4.2d, v4.2d, #63 +; CHECK-GI-NEXT: sshl v16.2d, v16.2d, v20.2d +; CHECK-GI-NEXT: sshl v6.2d, v6.2d, v20.2d +; CHECK-GI-NEXT: sshr v17.2d, v17.2d, #63 +; CHECK-GI-NEXT: sshr v18.2d, v18.2d, #63 +; CHECK-GI-NEXT: sshr v4.2d, v4.2d, #63 +; CHECK-GI-NEXT: bif v0.16b, v7.16b, v17.16b +; CHECK-GI-NEXT: bif v1.16b, v16.16b, v18.16b +; CHECK-GI-NEXT: bif v2.16b, v5.16b, v4.16b +; CHECK-GI-NEXT: bif v3.16b, v6.16b, v18.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i64> %x, + ret <8 x i64> %1 +} + +define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI24_0 +; CHECK-SD-NEXT: 
cmlt v1.4s, v0.4s, #0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI24_0] +; CHECK-SD-NEXT: adrp x8, .LCPI24_1 +; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI24_1] +; CHECK-SD-NEXT: adrp x8, .LCPI24_2 +; CHECK-SD-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI24_2] +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: movi v1.2d, #0xffffffff00000000 +; CHECK-SD-NEXT: neg v2.4s, v0.4s +; CHECK-SD-NEXT: bit v0.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w9, #0 // =0x0 +; CHECK-GI-NEXT: adrp x10, .LCPI24_0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: ldr q2, [x10, :lo12:.LCPI24_0] +; CHECK-GI-NEXT: sshr v3.4s, v0.4s, #31 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: adrp x10, .LCPI24_1 +; CHECK-GI-NEXT: neg v2.4s, v2.4s +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov v4.s[1], w8 +; CHECK-GI-NEXT: ushl v2.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: ldr q3, [x10, :lo12:.LCPI24_1] +; CHECK-GI-NEXT: mov v1.s[2], w9 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mov v4.d[1], v4.d[0] +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: sshl v2.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: shl v1.4s, v4.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: neg v2.4s, v0.4s +; CHECK-GI-NEXT: bit v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <4 x i32> %x, + ret <4 x i32> %1 +} + +; PR37119 +define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) { +; CHECK-SD-LABEL: non_splat_minus_one_divisor_0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umov w9, v0.b[0] +; CHECK-SD-NEXT: mov w8, wzr +; CHECK-SD-NEXT: umov w10, v0.b[1] +; CHECK-SD-NEXT: sub w9, w8, w9, sxtb +; CHECK-SD-NEXT: sub w10, w8, w10, sxtb +; CHECK-SD-NEXT: fmov s1, w9 +; CHECK-SD-NEXT: smov w9, v0.b[2] +; CHECK-SD-NEXT: mov v1.b[1], w10 +; CHECK-SD-NEXT: umov w10, v0.b[3] +; CHECK-SD-NEXT: mov v1.b[2], w9 +; CHECK-SD-NEXT: sub w9, w8, w10, sxtb +; CHECK-SD-NEXT: umov w10, v0.b[4] +; CHECK-SD-NEXT: mov v1.b[3], w9 +; CHECK-SD-NEXT: sub w9, w8, w10, sxtb +; CHECK-SD-NEXT: umov w10, v0.b[5] +; CHECK-SD-NEXT: mov v1.b[4], w9 +; CHECK-SD-NEXT: sub w9, w8, w10, sxtb +; CHECK-SD-NEXT: umov w10, v0.b[7] +; CHECK-SD-NEXT: mov v1.b[5], w9 +; CHECK-SD-NEXT: smov w9, v0.b[6] +; CHECK-SD-NEXT: mov v1.b[6], w9 +; CHECK-SD-NEXT: sub w9, w8, w10, sxtb +; CHECK-SD-NEXT: umov w10, v0.b[8] +; CHECK-SD-NEXT: mov v1.b[7], w9 +; CHECK-SD-NEXT: sub w8, w8, w10, sxtb +; CHECK-SD-NEXT: mov v1.b[8], w8 +; CHECK-SD-NEXT: smov w8, v0.b[9] +; CHECK-SD-NEXT: mov v1.b[9], w8 +; CHECK-SD-NEXT: smov w8, v0.b[10] +; CHECK-SD-NEXT: mov v1.b[10], w8 +; CHECK-SD-NEXT: smov w8, v0.b[11] +; CHECK-SD-NEXT: mov v1.b[11], w8 +; CHECK-SD-NEXT: smov w8, v0.b[12] +; CHECK-SD-NEXT: mov v1.b[12], w8 +; CHECK-SD-NEXT: smov w8, v0.b[13] +; CHECK-SD-NEXT: mov v1.b[13], w8 +; CHECK-SD-NEXT: smov w8, v0.b[14] +; CHECK-SD-NEXT: mov v1.b[14], w8 +; CHECK-SD-NEXT: smov w8, v0.b[15] +; CHECK-SD-NEXT: mov v1.b[15], w8 +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: non_splat_minus_one_divisor_0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI25_0 +; CHECK-GI-NEXT: neg v2.16b, v0.16b +; CHECK-GI-NEXT: ldr q1, [x8, 
:lo12:.LCPI25_0] +; CHECK-GI-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-GI-NEXT: sshr v1.16b, v1.16b, #7 +; CHECK-GI-NEXT: bit v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ret + %div = sdiv <16 x i8> %A, + ret <16 x i8> %div +} + +define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { +; CHECK-SD-LABEL: non_splat_minus_one_divisor_1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI26_1 +; CHECK-SD-NEXT: cmlt v1.16b, v0.16b, #0 +; CHECK-SD-NEXT: adrp x9, .LCPI26_3 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI26_1] +; CHECK-SD-NEXT: adrp x8, .LCPI26_2 +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI26_3] +; CHECK-SD-NEXT: adrp x9, .LCPI26_5 +; CHECK-SD-NEXT: ushl v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI26_2] +; CHECK-SD-NEXT: adrp x8, .LCPI26_0 +; CHECK-SD-NEXT: add v1.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: sshl v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI26_0] +; CHECK-SD-NEXT: adrp x8, .LCPI26_4 +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI26_4] +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI26_5] +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: neg v1.16b, v0.16b +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: non_splat_minus_one_divisor_1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI26_2 +; CHECK-GI-NEXT: sshr v2.16b, v0.16b, #7 +; CHECK-GI-NEXT: adrp x9, .LCPI26_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI26_2] +; CHECK-GI-NEXT: adrp x8, .LCPI26_3 +; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI26_1] +; CHECK-GI-NEXT: neg v1.16b, v1.16b +; CHECK-GI-NEXT: shl v3.16b, v3.16b, #7 +; CHECK-GI-NEXT: ushl v1.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI26_3] +; CHECK-GI-NEXT: adrp x8, .LCPI26_0 +; CHECK-GI-NEXT: neg v2.16b, v2.16b +; CHECK-GI-NEXT: add v1.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: sshl v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: sshr v2.16b, v3.16b, #7 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI26_0] +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: shl v1.16b, v3.16b, #7 +; CHECK-GI-NEXT: sshr v1.16b, v1.16b, #7 +; CHECK-GI-NEXT: neg v2.16b, v0.16b +; CHECK-GI-NEXT: bit v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ret + %div = sdiv <16 x i8> %A, + ret <16 x i8> %div +} + +define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) { +; CHECK-SD-LABEL: non_splat_minus_one_divisor_2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI27_0 +; CHECK-SD-NEXT: cmlt v1.4s, v0.4s, #0 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI27_0] +; CHECK-SD-NEXT: adrp x8, .LCPI27_1 +; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI27_1] +; CHECK-SD-NEXT: adrp x8, .LCPI27_2 +; CHECK-SD-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI27_2] +; CHECK-SD-NEXT: adrp x8, .LCPI27_3 +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI27_3] +; CHECK-SD-NEXT: neg v1.4s, v0.4s +; CHECK-SD-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: non_splat_minus_one_divisor_2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: adrp x9, .LCPI27_0 +; CHECK-GI-NEXT: mov w10, #0 // =0x0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: ldr q2, [x9, :lo12:.LCPI27_0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: sshr v3.4s, v0.4s, #31 +; 
CHECK-GI-NEXT: adrp x9, .LCPI27_1 +; CHECK-GI-NEXT: neg v2.4s, v2.4s +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v4.s[1], w10 +; CHECK-GI-NEXT: ushl v2.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI27_1] +; CHECK-GI-NEXT: mov v1.s[2], w10 +; CHECK-GI-NEXT: mov v4.s[2], w10 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mov v1.s[3], w10 +; CHECK-GI-NEXT: sshl v2.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov v4.s[3], w8 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: shl v1.4s, v4.4s, #31 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31 +; CHECK-GI-NEXT: neg v2.4s, v0.4s +; CHECK-GI-NEXT: bit v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ret + %div = sdiv <4 x i32> %A, + ret <4 x i32> %div +} + +define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) { +; CHECK-LABEL: combine_vec_sdiv_nonuniform: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI28_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI28_0] +; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI29_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI29_0] +; CHECK-SD-NEXT: adrp x8, .LCPI29_1 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI29_1] +; CHECK-SD-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI29_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI29_1] +; CHECK-GI-NEXT: adrp x8, .LCPI29_0 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI29_0] +; CHECK-GI-NEXT: neg v1.8h, v1.8h +; CHECK-GI-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform3: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI30_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI30_0] +; CHECK-SD-NEXT: adrp x8, .LCPI30_1 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI30_1] +; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform3: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI30_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI30_1] +; CHECK-GI-NEXT: adrp x8, .LCPI30_0 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI30_0] +; CHECK-GI-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: neg v1.8h, v2.8h +; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-GI-NEXT: ret + %1 = 
sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI31_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI31_0] +; CHECK-SD-NEXT: adrp x8, .LCPI31_1 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI31_1] +; CHECK-SD-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI31_1 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI31_1] +; CHECK-GI-NEXT: adrp x8, .LCPI31_0 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI31_0] +; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: neg v1.8h, v2.8h +; CHECK-GI-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform5: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI32_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI32_0] +; CHECK-SD-NEXT: adrp x8, .LCPI32_1 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI32_1] +; CHECK-SD-NEXT: adrp x8, .LCPI32_2 +; CHECK-SD-NEXT: mla v1.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI32_2] +; CHECK-SD-NEXT: sshl v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform5: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI32_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI32_2] +; CHECK-GI-NEXT: adrp x8, .LCPI32_1 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI32_1] +; CHECK-GI-NEXT: adrp x8, .LCPI32_0 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI32_0] +; CHECK-GI-NEXT: mla v1.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: neg v0.8h, v3.8h +; CHECK-GI-NEXT: sshl v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform6: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI33_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI33_0] +; CHECK-SD-NEXT: adrp x8, .LCPI33_1 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI33_1] +; CHECK-SD-NEXT: adrp x8, .LCPI33_2 +; CHECK-SD-NEXT: mla v1.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI33_2] +; CHECK-SD-NEXT: adrp x8, .LCPI33_3 +; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI33_3] +; CHECK-SD-NEXT: sshl v0.8h, v1.8h, v0.8h +; CHECK-SD-NEXT: ushr v1.8h, v0.8h, #15 +; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform6: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, 
.LCPI33_3 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI33_3] +; CHECK-GI-NEXT: adrp x8, .LCPI33_2 +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_2] +; CHECK-GI-NEXT: adrp x8, .LCPI33_1 +; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI33_1] +; CHECK-GI-NEXT: adrp x8, .LCPI33_0 +; CHECK-GI-NEXT: mla v1.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: neg v0.8h, v3.8h +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0] +; CHECK-GI-NEXT: sshl v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: ushr v1.8h, v0.8h, #15 +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) { +; CHECK-SD-LABEL: combine_vec_sdiv_nonuniform7: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: umov w9, v0.h[0] +; CHECK-SD-NEXT: mov w8, wzr +; CHECK-SD-NEXT: umov w10, v0.h[1] +; CHECK-SD-NEXT: umov w11, v0.h[2] +; CHECK-SD-NEXT: sub w9, w8, w9, sxth +; CHECK-SD-NEXT: sub w10, w8, w10, sxth +; CHECK-SD-NEXT: fmov s1, w9 +; CHECK-SD-NEXT: sub w9, w8, w11, sxth +; CHECK-SD-NEXT: mov v1.h[1], w10 +; CHECK-SD-NEXT: umov w10, v0.h[3] +; CHECK-SD-NEXT: mov v1.h[2], w9 +; CHECK-SD-NEXT: sub w8, w8, w10, sxth +; CHECK-SD-NEXT: mov v1.h[3], w8 +; CHECK-SD-NEXT: smov w8, v0.h[4] +; CHECK-SD-NEXT: mov v1.h[4], w8 +; CHECK-SD-NEXT: smov w8, v0.h[5] +; CHECK-SD-NEXT: mov v1.h[5], w8 +; CHECK-SD-NEXT: smov w8, v0.h[6] +; CHECK-SD-NEXT: mov v1.h[6], w8 +; CHECK-SD-NEXT: smov w8, v0.h[7] +; CHECK-SD-NEXT: mov v1.h[7], w8 +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_nonuniform7: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI34_0 +; CHECK-GI-NEXT: neg v2.8h, v0.8h +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI34_0] +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: shl v1.8h, v1.8h, #15 +; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #15 +; CHECK-GI-NEXT: bit v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: ret + %1 = sdiv <8 x i16> %x, + ret <8 x i16> %1 +} + +define <16 x i8> @pr38658(<16 x i8> %x) { +; CHECK-SD-LABEL: pr38658: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI35_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI35_0] +; CHECK-SD-NEXT: adrp x8, .LCPI35_1 +; CHECK-SD-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI35_1] +; CHECK-SD-NEXT: sshl v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 +; CHECK-SD-NEXT: ushr v2.16b, v0.16b, #7 +; CHECK-SD-NEXT: mov v1.b[15], v2.b[15] +; CHECK-SD-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pr38658: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI35_2 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI35_2] +; CHECK-GI-NEXT: adrp x8, .LCPI35_1 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_1] +; CHECK-GI-NEXT: adrp x8, .LCPI35_0 +; CHECK-GI-NEXT: add v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: neg v1.16b, v2.16b +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_0] +; CHECK-GI-NEXT: sshl v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ushr v1.16b, v0.16b, #7 +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret 
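; editor note (illustrative, not part of the generated checks): the CHECK-GI sequence above is the vector form of the magic-number expansion — smull/smull2 followed by uzp2 acts as a lane-wise multiply-high, and the neg feeding sshl is how AArch64 expresses a per-lane arithmetic right shift (sshl by a negative amount).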
+ %1 = sdiv <16 x i8> %x, + ret <16 x i8> %1 +} + +define i1 @bool_sdiv(i1 %x, i1 %y) { +; CHECK-SD-LABEL: bool_sdiv: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w0, w0, #0x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bool_sdiv: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sbfx w8, w0, #0, #1 +; CHECK-GI-NEXT: sbfx w9, w1, #0, #1 +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: and w0, w8, #0x1 +; CHECK-GI-NEXT: ret + %r = sdiv i1 %x, %y + ret i1 %r +} + +define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) { +; CHECK-SD-LABEL: boolvec_sdiv: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: boolvec_sdiv: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[0] +; CHECK-GI-NEXT: umov w10, v1.h[1] +; CHECK-GI-NEXT: umov w11, v1.h[2] +; CHECK-GI-NEXT: umov w12, v1.h[3] +; CHECK-GI-NEXT: sbfx w8, w8, #0, #1 +; CHECK-GI-NEXT: sbfx w9, w9, #0, #1 +; CHECK-GI-NEXT: sbfx w10, w10, #0, #1 +; CHECK-GI-NEXT: sbfx w11, w11, #0, #1 +; CHECK-GI-NEXT: sbfx w12, w12, #0, #1 +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: sbfx w9, w9, #0, #1 +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: sbfx w10, w10, #0, #1 +; CHECK-GI-NEXT: sdiv w10, w10, w11 +; CHECK-GI-NEXT: umov w11, v0.h[3] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: sbfx w11, w11, #0, #1 +; CHECK-GI-NEXT: sdiv w8, w11, w12 +; CHECK-GI-NEXT: mov v0.h[2], w10 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %r = sdiv <4 x i1> %x, %y + ret <4 x i1> %r +} + +define i32 @combine_sdiv_two(i32 %x) { +; CHECK-SD-LABEL: combine_sdiv_two: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, w0, lsr #31 +; CHECK-SD-NEXT: asr w0, w8, #1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_sdiv_two: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: asr w8, w0, #31 +; CHECK-GI-NEXT: add w8, w0, w8, lsr #31 +; CHECK-GI-NEXT: asr w0, w8, #1 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, 2 + ret i32 %1 +} + +define i32 @combine_sdiv_negtwo(i32 %x) { +; CHECK-SD-LABEL: combine_sdiv_negtwo: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, w0, lsr #31 +; CHECK-SD-NEXT: neg w0, w8, asr #1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_sdiv_negtwo: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: asr w8, w0, #31 +; CHECK-GI-NEXT: add w8, w0, w8, lsr #31 +; CHECK-GI-NEXT: neg w0, w8, asr #1 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, -2 + ret i32 %1 +} + +define i8 @combine_i8_sdiv_pow2(i8 %x) { +; CHECK-SD-LABEL: combine_i8_sdiv_pow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxtb w8, w0 +; CHECK-SD-NEXT: ubfx w8, w8, #11, #4 +; CHECK-SD-NEXT: add w8, w0, w8 +; CHECK-SD-NEXT: sbfx w0, w8, #4, #4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i8_sdiv_pow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sbfx w8, w0, #7, #1 +; CHECK-GI-NEXT: ubfx w8, w8, #4, #4 +; CHECK-GI-NEXT: add w8, w0, w8 +; CHECK-GI-NEXT: sbfx w0, w8, #4, #4 +; CHECK-GI-NEXT: ret + %1 = sdiv i8 %x, 16 + ret i8 %1 +} + +define i8 @combine_i8_sdiv_negpow2(i8 %x) { +; CHECK-SD-LABEL: combine_i8_sdiv_negpow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxtb w8, w0 +; CHECK-SD-NEXT: ubfx w8, w8, #9, #6 +; CHECK-SD-NEXT: add w8, w0, w8 +; CHECK-SD-NEXT: sxtb w8, w8 +; CHECK-SD-NEXT: neg w0, w8, asr #6 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i8_sdiv_negpow2: +; CHECK-GI: 
// %bb.0: +; CHECK-GI-NEXT: sbfx w8, w0, #7, #1 +; CHECK-GI-NEXT: ubfx w8, w8, #2, #6 +; CHECK-GI-NEXT: add w8, w0, w8 +; CHECK-GI-NEXT: sxtb w8, w8 +; CHECK-GI-NEXT: neg w0, w8, asr #6 +; CHECK-GI-NEXT: ret + %1 = sdiv i8 %x, -64 + ret i8 %1 +} + +define i16 @combine_i16_sdiv_pow2(i16 %x) { +; CHECK-SD-LABEL: combine_i16_sdiv_pow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxth w8, w0 +; CHECK-SD-NEXT: ubfx w8, w8, #27, #4 +; CHECK-SD-NEXT: add w8, w0, w8 +; CHECK-SD-NEXT: sbfx w0, w8, #4, #12 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i16_sdiv_pow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sbfx w8, w0, #15, #1 +; CHECK-GI-NEXT: ubfx w8, w8, #12, #4 +; CHECK-GI-NEXT: add w8, w0, w8 +; CHECK-GI-NEXT: sbfx w0, w8, #4, #12 +; CHECK-GI-NEXT: ret + %1 = sdiv i16 %x, 16 + ret i16 %1 +} + +define i16 @combine_i16_sdiv_negpow2(i16 %x) { +; CHECK-SD-LABEL: combine_i16_sdiv_negpow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxth w8, w0 +; CHECK-SD-NEXT: lsr w8, w8, #23 +; CHECK-SD-NEXT: add w8, w0, w8, uxtb +; CHECK-SD-NEXT: sxth w8, w8 +; CHECK-SD-NEXT: neg w0, w8, asr #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i16_sdiv_negpow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sbfx w8, w0, #15, #1 +; CHECK-GI-NEXT: ubfx w8, w8, #8, #8 +; CHECK-GI-NEXT: add w8, w0, w8 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: neg w0, w8, asr #8 +; CHECK-GI-NEXT: ret + %1 = sdiv i16 %x, -256 + ret i16 %1 +} + +define i32 @combine_i32_sdiv_pow2(i32 %x) { +; CHECK-SD-LABEL: combine_i32_sdiv_pow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, #15 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: csel w8, w8, w0, lt +; CHECK-SD-NEXT: asr w0, w8, #4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i32_sdiv_pow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: asr w8, w0, #31 +; CHECK-GI-NEXT: add w8, w0, w8, lsr #28 +; CHECK-GI-NEXT: asr w0, w8, #4 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, 16 + ret i32 %1 +} + +define i32 @combine_i32_sdiv_negpow2(i32 %x) { +; CHECK-SD-LABEL: combine_i32_sdiv_negpow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, #255 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: csel w8, w8, w0, lt +; CHECK-SD-NEXT: neg w0, w8, asr #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i32_sdiv_negpow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: asr w8, w0, #31 +; CHECK-GI-NEXT: add w8, w0, w8, lsr #24 +; CHECK-GI-NEXT: neg w0, w8, asr #8 +; CHECK-GI-NEXT: ret + %1 = sdiv i32 %x, -256 + ret i32 %1 +} + +define i64 @combine_i64_sdiv_pow2(i64 %x) { +; CHECK-SD-LABEL: combine_i64_sdiv_pow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add x8, x0, #15 +; CHECK-SD-NEXT: cmp x0, #0 +; CHECK-SD-NEXT: csel x8, x8, x0, lt +; CHECK-SD-NEXT: asr x0, x8, #4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i64_sdiv_pow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: asr x8, x0, #63 +; CHECK-GI-NEXT: add x8, x0, x8, lsr #60 +; CHECK-GI-NEXT: asr x0, x8, #4 +; CHECK-GI-NEXT: ret + %1 = sdiv i64 %x, 16 + ret i64 %1 +} + +define i64 @combine_i64_sdiv_negpow2(i64 %x) { +; CHECK-SD-LABEL: combine_i64_sdiv_negpow2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add x8, x0, #255 +; CHECK-SD-NEXT: cmp x0, #0 +; CHECK-SD-NEXT: csel x8, x8, x0, lt +; CHECK-SD-NEXT: neg x0, x8, asr #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_i64_sdiv_negpow2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: asr x8, x0, #63 +; CHECK-GI-NEXT: add x8, x0, x8, lsr #56 +; CHECK-GI-NEXT: neg x0, x8, asr #8 +; CHECK-GI-NEXT: ret + %1 = sdiv i64 %x, -256 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/AArch64/select_const.ll 
b/llvm/test/CodeGen/AArch64/select_const.ll index 801093daef70d..0a73aed803415 100644 --- a/llvm/test/CodeGen/AArch64/select_const.ll +++ b/llvm/test/CodeGen/AArch64/select_const.ll @@ -461,15 +461,10 @@ define i8 @sel_constants_udiv_constant(i1 %cond) { ; CHECK-GI-LABEL: sel_constants_udiv_constant: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: and w8, w0, #0x1 -; CHECK-GI-NEXT: mov w9, #-4 // =0xfffffffc -; CHECK-GI-NEXT: mov w10, #23 // =0x17 +; CHECK-GI-NEXT: mov w9, #50 // =0x32 +; CHECK-GI-NEXT: mov w10, #4 // =0x4 ; CHECK-GI-NEXT: tst w8, #0x1 -; CHECK-GI-NEXT: csel w8, w9, w10, ne -; CHECK-GI-NEXT: mov w9, #205 // =0xcd -; CHECK-GI-NEXT: and w8, w8, #0xff -; CHECK-GI-NEXT: mul w8, w8, w9 -; CHECK-GI-NEXT: lsr w8, w8, #8 -; CHECK-GI-NEXT: lsr w0, w8, #2 +; CHECK-GI-NEXT: csel w0, w9, w10, ne ; CHECK-GI-NEXT: ret %sel = select i1 %cond, i8 -4, i8 23 %bo = udiv i8 %sel, 5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir index 96a776f6fbb69..cc4581195af45 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir @@ -778,8 +778,8 @@ body: | ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 50 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 21 - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT %cond(s1), [[C]], [[C1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[SELECT]](s32) + ; CHECK-NEXT: %udiv:_(s32) = G_SELECT %cond(s1), [[C]], [[C1]] + ; CHECK-NEXT: S_ENDPGM 0, implicit %udiv(s32) %reg:_(s32) = COPY $vgpr0 %zero:_(s32) = G_CONSTANT i32 0 %cond:_(s1) = G_ICMP intpred(eq), %reg, %zero diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index 2fa5492c8a2b7..8981b11a90286 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -308,30 +308,12 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) { ; CHECK-LABEL: v_sdiv_i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; CHECK-NEXT: v_mov_b32_e32 v1, 0xd9528441 +; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 20, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 0xffed2705, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result 
= sdiv i32 %num, 1235195 ret i32 %result @@ -387,46 +369,17 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-LABEL: v_sdiv_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 -; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; CGP-NEXT: v_mov_b32_e32 v2, 0xd9528441 +; CGP-NEXT: v_mul_hi_i32 v3, v0, v2 +; CGP-NEXT: v_mul_hi_i32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CGP-NEXT: v_ashrrev_i32_e32 v0, 20, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v1, 20, v1 +; CGP-NEXT: v_lshrrev_b32_e32 v2, 31, v0 +; CGP-NEXT: v_lshrrev_b32_e32 v3, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v3, v5 -; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[6:7] -; CGP-NEXT: v_add_i32_e32 v8, vcc, 0xffed2705, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, ret <2 x i32> %result diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll index 389d59298505d..4b999b892ed35 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll @@ -375,14 +375,22 @@ define i16 @udiv16_constant_add(i16 %a) nounwind { define i32 @sdiv_constant_no_srai(i32 %a) nounwind { ; RV32-LABEL: sdiv_constant_no_srai: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 3 -; RV32-NEXT: div a0, a0, a1 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1366 +; RV32-NEXT: mulh a0, a0, a1 +; RV32-NEXT: srli a1, a0, 31 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: sdiv_constant_no_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 3 -; RV64-NEXT: divw a0, a0, a1 +; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: addi a1, a1, 1366 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: srai a0, a0, 32 +; RV64-NEXT: srliw a1, a0, 31 +; RV64-NEXT: addw a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i32 %a, 3 ret i32 %1 @@ -392,14 +400,24 @@ define i32 @sdiv_constant_no_srai(i32 %a) 
nounwind { define i32 @sdiv_constant_srai(i32 %a) nounwind { ; RV32-LABEL: sdiv_constant_srai: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 5 -; RV32-NEXT: div a0, a0, a1 +; RV32-NEXT: lui a1, 419430 +; RV32-NEXT: addi a1, a1, 1639 +; RV32-NEXT: mulh a0, a0, a1 +; RV32-NEXT: srai a0, a0, 1 +; RV32-NEXT: srli a1, a0, 31 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: sdiv_constant_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 5 -; RV64-NEXT: divw a0, a0, a1 +; RV64-NEXT: lui a1, 419430 +; RV64-NEXT: addi a1, a1, 1639 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: srai a0, a0, 32 +; RV64-NEXT: sraiw a0, a0, 1 +; RV64-NEXT: srliw a1, a0, 31 +; RV64-NEXT: addw a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i32 %a, 5 ret i32 %1 @@ -409,14 +427,26 @@ define i32 @sdiv_constant_srai(i32 %a) nounwind { define i32 @sdiv_constant_add_srai(i32 %a) nounwind { ; RV32-LABEL: sdiv_constant_add_srai: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 7 -; RV32-NEXT: div a0, a0, a1 +; RV32-NEXT: lui a1, 599186 +; RV32-NEXT: addi a1, a1, 1171 +; RV32-NEXT: mulh a1, a0, a1 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: srai a0, a0, 2 +; RV32-NEXT: srli a1, a0, 31 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: sdiv_constant_add_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 7 -; RV64-NEXT: divw a0, a0, a1 +; RV64-NEXT: lui a1, 599186 +; RV64-NEXT: addi a1, a1, 1171 +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: mul a1, a2, a1 +; RV64-NEXT: srai a1, a1, 32 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: sraiw a0, a0, 2 +; RV64-NEXT: srliw a1, a0, 31 +; RV64-NEXT: addw a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i32 %a, 7 ret i32 %1 @@ -426,14 +456,26 @@ define i32 @sdiv_constant_add_srai(i32 %a) nounwind { define i32 @sdiv_constant_sub_srai(i32 %a) nounwind { ; RV32-LABEL: sdiv_constant_sub_srai: ; RV32: # %bb.0: -; RV32-NEXT: li a1, -7 -; RV32-NEXT: div a0, a0, a1 +; RV32-NEXT: lui a1, 449390 +; RV32-NEXT: addi a1, a1, -1171 +; RV32-NEXT: mulh a1, a0, a1 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: srai a0, a1, 2 +; RV32-NEXT: srli a1, a0, 31 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: sdiv_constant_sub_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, -7 -; RV64-NEXT: divw a0, a0, a1 +; RV64-NEXT: lui a1, 449390 +; RV64-NEXT: addi a1, a1, -1171 +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: mul a1, a2, a1 +; RV64-NEXT: srai a1, a1, 32 +; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: sraiw a0, a1, 2 +; RV64-NEXT: srliw a1, a0, 31 +; RV64-NEXT: addw a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i32 %a, -7 ret i32 %1 @@ -453,8 +495,11 @@ define i64 @sdiv64_constant_no_srai(i64 %a) nounwind { ; ; RV64-LABEL: sdiv64_constant_no_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 3 -; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: lui a1, %hi(.LCPI12_0) +; RV64-NEXT: ld a1, %lo(.LCPI12_0)(a1) +; RV64-NEXT: mulh a0, a0, a1 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i64 %a, 3 ret i64 %1 @@ -474,8 +519,12 @@ define i64 @sdiv64_constant_srai(i64 %a) nounwind { ; ; RV64-LABEL: sdiv64_constant_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 5 -; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: lui a1, %hi(.LCPI13_0) +; RV64-NEXT: ld a1, %lo(.LCPI13_0)(a1) +; RV64-NEXT: mulh a0, a0, a1 +; RV64-NEXT: srai a0, a0, 1 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i64 %a, 5 ret i64 %1 @@ -495,8 +544,19 @@ define i64 @sdiv64_constant_add_srai(i64 %a) nounwind { ; ; RV64-LABEL: sdiv64_constant_add_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 15 -; RV64-NEXT: div a0, a0, a1 +; 
RV64-NEXT: lui a1, 1017993 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1911 +; RV64-NEXT: mulh a1, a0, a1 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: srai a0, a0, 3 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i64 %a, 15 ret i64 %1 @@ -516,8 +576,19 @@ define i64 @sdiv64_constant_sub_srai(i64 %a) nounwind { ; ; RV64-LABEL: sdiv64_constant_sub_srai: ; RV64: # %bb.0: -; RV64-NEXT: li a1, -3 -; RV64-NEXT: div a0, a0, a1 +; RV64-NEXT: lui a1, 21845 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: mulh a1, a0, a1 +; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: srai a0, a1, 1 +; RV64-NEXT: srli a1, a0, 63 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i64 %a, -3 ret i64 %1 @@ -526,32 +597,52 @@ define i64 @sdiv64_constant_sub_srai(i64 %a) nounwind { define i8 @sdiv8_constant_no_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_no_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 3 +; RV32IM-NEXT: li a1, 86 ; RV32IM-NEXT: slli a0, a0, 24 ; RV32IM-NEXT: srai a0, a0, 24 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 24 +; RV32IM-NEXT: zext.b a1, a0 +; RV32IM-NEXT: srli a1, a1, 7 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv8_constant_no_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 3 +; RV32IMZB-NEXT: li a1, 86 ; RV32IMZB-NEXT: sext.b a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 8 +; RV32IMZB-NEXT: zext.b a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 7 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv8_constant_no_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 3 +; RV64IM-NEXT: li a1, 86 ; RV64IM-NEXT: slli a0, a0, 56 ; RV64IM-NEXT: srai a0, a0, 56 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 56 +; RV64IM-NEXT: zext.b a1, a0 +; RV64IM-NEXT: srli a1, a1, 7 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv8_constant_no_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 3 +; RV64IMZB-NEXT: li a1, 86 ; RV64IMZB-NEXT: sext.b a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: srai a0, a0, 8 +; RV64IMZB-NEXT: zext.b a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 7 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i8 %a, 3 ret i8 %1 @@ -560,32 +651,60 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind { define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 5 +; RV32IM-NEXT: li a1, 103 ; RV32IM-NEXT: slli a0, a0, 24 ; RV32IM-NEXT: srai a0, a0, 24 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 24 +; RV32IM-NEXT: slli a0, a0, 24 +; RV32IM-NEXT: srai a0, a0, 25 +; RV32IM-NEXT: zext.b a1, a0 +; RV32IM-NEXT: srli a1, a1, 7 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv8_constant_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 5 +; 
RV32IMZB-NEXT: li a1, 103 ; RV32IMZB-NEXT: sext.b a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 8 +; RV32IMZB-NEXT: sext.b a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 1 +; RV32IMZB-NEXT: zext.b a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 7 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv8_constant_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 5 +; RV64IM-NEXT: li a1, 103 ; RV64IM-NEXT: slli a0, a0, 56 ; RV64IM-NEXT: srai a0, a0, 56 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 56 +; RV64IM-NEXT: slli a0, a0, 56 +; RV64IM-NEXT: srai a0, a0, 57 +; RV64IM-NEXT: zext.b a1, a0 +; RV64IM-NEXT: srli a1, a1, 7 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv8_constant_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 5 +; RV64IMZB-NEXT: li a1, 103 +; RV64IMZB-NEXT: sext.b a0, a0 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: srai a0, a0, 8 ; RV64IMZB-NEXT: sext.b a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 1 +; RV64IMZB-NEXT: zext.b a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 7 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i8 %a, 5 ret i8 %1 @@ -594,32 +713,64 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { define i8 @sdiv8_constant_add_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_add_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 7 +; RV32IM-NEXT: li a1, -109 +; RV32IM-NEXT: slli a2, a0, 24 +; RV32IM-NEXT: srai a2, a2, 24 +; RV32IM-NEXT: mul a1, a2, a1 +; RV32IM-NEXT: slli a1, a1, 16 +; RV32IM-NEXT: srai a1, a1, 24 +; RV32IM-NEXT: add a0, a1, a0 ; RV32IM-NEXT: slli a0, a0, 24 -; RV32IM-NEXT: srai a0, a0, 24 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: srai a0, a0, 26 +; RV32IM-NEXT: zext.b a1, a0 +; RV32IM-NEXT: srli a1, a1, 7 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv8_constant_add_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 7 +; RV32IMZB-NEXT: li a1, -109 +; RV32IMZB-NEXT: sext.b a2, a0 +; RV32IMZB-NEXT: mul a1, a2, a1 +; RV32IMZB-NEXT: sext.h a1, a1 +; RV32IMZB-NEXT: srai a1, a1, 8 +; RV32IMZB-NEXT: add a0, a1, a0 ; RV32IMZB-NEXT: sext.b a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 2 +; RV32IMZB-NEXT: zext.b a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 7 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv8_constant_add_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 7 +; RV64IM-NEXT: li a1, -109 +; RV64IM-NEXT: slli a2, a0, 56 +; RV64IM-NEXT: srai a2, a2, 56 +; RV64IM-NEXT: mul a1, a2, a1 +; RV64IM-NEXT: slli a1, a1, 48 +; RV64IM-NEXT: srai a1, a1, 56 +; RV64IM-NEXT: add a0, a1, a0 ; RV64IM-NEXT: slli a0, a0, 56 -; RV64IM-NEXT: srai a0, a0, 56 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: srai a0, a0, 58 +; RV64IM-NEXT: zext.b a1, a0 +; RV64IM-NEXT: srli a1, a1, 7 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv8_constant_add_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 7 +; RV64IMZB-NEXT: li a1, -109 +; RV64IMZB-NEXT: sext.b a2, a0 +; RV64IMZB-NEXT: mul a1, a2, a1 +; RV64IMZB-NEXT: sext.h a1, a1 +; RV64IMZB-NEXT: srai a1, a1, 8 +; RV64IMZB-NEXT: add a0, a1, a0 ; RV64IMZB-NEXT: sext.b a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 2 +; RV64IMZB-NEXT: zext.b a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 7 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 
= sdiv i8 %a, 7 ret i8 %1 @@ -628,32 +779,64 @@ define i8 @sdiv8_constant_add_srai(i8 %a) nounwind { define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_constant_sub_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, -7 -; RV32IM-NEXT: slli a0, a0, 24 -; RV32IM-NEXT: srai a0, a0, 24 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: li a1, 109 +; RV32IM-NEXT: slli a2, a0, 24 +; RV32IM-NEXT: srai a2, a2, 24 +; RV32IM-NEXT: mul a1, a2, a1 +; RV32IM-NEXT: slli a1, a1, 16 +; RV32IM-NEXT: srai a1, a1, 24 +; RV32IM-NEXT: sub a1, a1, a0 +; RV32IM-NEXT: slli a1, a1, 24 +; RV32IM-NEXT: srai a0, a1, 26 +; RV32IM-NEXT: zext.b a1, a0 +; RV32IM-NEXT: srli a1, a1, 7 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv8_constant_sub_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, -7 -; RV32IMZB-NEXT: sext.b a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: li a1, 109 +; RV32IMZB-NEXT: sext.b a2, a0 +; RV32IMZB-NEXT: mul a1, a2, a1 +; RV32IMZB-NEXT: sext.h a1, a1 +; RV32IMZB-NEXT: srai a1, a1, 8 +; RV32IMZB-NEXT: sub a1, a1, a0 +; RV32IMZB-NEXT: sext.b a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 2 +; RV32IMZB-NEXT: zext.b a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 7 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv8_constant_sub_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, -7 -; RV64IM-NEXT: slli a0, a0, 56 -; RV64IM-NEXT: srai a0, a0, 56 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: li a1, 109 +; RV64IM-NEXT: slli a2, a0, 56 +; RV64IM-NEXT: srai a2, a2, 56 +; RV64IM-NEXT: mul a1, a2, a1 +; RV64IM-NEXT: slli a1, a1, 48 +; RV64IM-NEXT: srai a1, a1, 56 +; RV64IM-NEXT: subw a1, a1, a0 +; RV64IM-NEXT: slli a1, a1, 56 +; RV64IM-NEXT: srai a0, a1, 58 +; RV64IM-NEXT: zext.b a1, a0 +; RV64IM-NEXT: srli a1, a1, 7 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv8_constant_sub_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, -7 -; RV64IMZB-NEXT: sext.b a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: li a1, 109 +; RV64IMZB-NEXT: sext.b a2, a0 +; RV64IMZB-NEXT: mul a1, a2, a1 +; RV64IMZB-NEXT: sext.h a1, a1 +; RV64IMZB-NEXT: srai a1, a1, 8 +; RV64IMZB-NEXT: sub a1, a1, a0 +; RV64IMZB-NEXT: sext.b a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 2 +; RV64IMZB-NEXT: zext.b a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 7 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i8 %a, -7 ret i8 %1 @@ -662,32 +845,54 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind { define i16 @sdiv16_constant_no_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_no_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 3 +; RV32IM-NEXT: lui a1, 5 ; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: addi a1, a1, 1366 ; RV32IM-NEXT: srai a0, a0, 16 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: srai a0, a0, 16 +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srli a1, a1, 16 +; RV32IM-NEXT: srli a1, a1, 15 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv16_constant_no_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 3 +; RV32IMZB-NEXT: lui a1, 5 +; RV32IMZB-NEXT: addi a1, a1, 1366 ; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 16 +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 15 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv16_constant_no_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 3 +; RV64IM-NEXT: lui a1, 5 ; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: addi 
a1, a1, 1366 ; RV64IM-NEXT: srai a0, a0, 48 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: sraiw a0, a0, 16 +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: srli a1, a1, 48 +; RV64IM-NEXT: srli a1, a1, 15 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv16_constant_no_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 3 +; RV64IMZB-NEXT: lui a1, 5 +; RV64IMZB-NEXT: addi a1, a1, 1366 ; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: sraiw a0, a0, 16 +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 15 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i16 %a, 3 ret i16 %1 @@ -696,32 +901,62 @@ define i16 @sdiv16_constant_no_srai(i16 %a) nounwind { define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 5 +; RV32IM-NEXT: lui a1, 6 ; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: addi a1, a1, 1639 ; RV32IM-NEXT: srai a0, a0, 16 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: srai a0, a0, 16 +; RV32IM-NEXT: slli a0, a0, 16 +; RV32IM-NEXT: srai a0, a0, 17 +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srli a1, a1, 16 +; RV32IM-NEXT: srli a1, a1, 15 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv16_constant_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 5 +; RV32IMZB-NEXT: lui a1, 6 +; RV32IMZB-NEXT: addi a1, a1, 1639 ; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: mul a0, a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 16 +; RV32IMZB-NEXT: sext.h a0, a0 +; RV32IMZB-NEXT: srai a0, a0, 1 +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 15 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv16_constant_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 5 +; RV64IM-NEXT: lui a1, 6 ; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: addi a1, a1, 1639 ; RV64IM-NEXT: srai a0, a0, 48 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: sraiw a0, a0, 16 +; RV64IM-NEXT: slli a0, a0, 48 +; RV64IM-NEXT: srai a0, a0, 49 +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: srli a1, a1, 48 +; RV64IM-NEXT: srli a1, a1, 15 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv16_constant_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 5 +; RV64IMZB-NEXT: lui a1, 6 +; RV64IMZB-NEXT: addi a1, a1, 1639 +; RV64IMZB-NEXT: sext.h a0, a0 +; RV64IMZB-NEXT: mul a0, a0, a1 +; RV64IMZB-NEXT: sraiw a0, a0, 16 ; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 1 +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 15 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i16 %a, 5 ret i16 %1 @@ -730,32 +965,66 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { define i16 @sdiv16_constant_add_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_add_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, 15 +; RV32IM-NEXT: lui a1, 1048569 +; RV32IM-NEXT: slli a2, a0, 16 +; RV32IM-NEXT: addi a1, a1, -1911 +; RV32IM-NEXT: srai a2, a2, 16 +; RV32IM-NEXT: mul a1, a2, a1 +; RV32IM-NEXT: srai a1, a1, 16 +; RV32IM-NEXT: add a0, a1, a0 ; RV32IM-NEXT: slli a0, a0, 16 -; RV32IM-NEXT: srai a0, a0, 16 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: srai a0, a0, 19 +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srli a1, a1, 16 +; RV32IM-NEXT: srli a1, a1, 15 +; RV32IM-NEXT: add a0, a0, a1 ; 
RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv16_constant_add_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, 15 +; RV32IMZB-NEXT: lui a1, 1048569 +; RV32IMZB-NEXT: addi a1, a1, -1911 +; RV32IMZB-NEXT: sext.h a2, a0 +; RV32IMZB-NEXT: mul a1, a2, a1 +; RV32IMZB-NEXT: srai a1, a1, 16 +; RV32IMZB-NEXT: add a0, a1, a0 ; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 3 +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 15 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv16_constant_add_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, 15 +; RV64IM-NEXT: lui a1, 1048569 +; RV64IM-NEXT: slli a2, a0, 48 +; RV64IM-NEXT: addi a1, a1, -1911 +; RV64IM-NEXT: srai a2, a2, 48 +; RV64IM-NEXT: mul a1, a2, a1 +; RV64IM-NEXT: sraiw a1, a1, 16 +; RV64IM-NEXT: add a0, a1, a0 ; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 48 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: srai a0, a0, 51 +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: srli a1, a1, 48 +; RV64IM-NEXT: srli a1, a1, 15 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv16_constant_add_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, 15 +; RV64IMZB-NEXT: lui a1, 1048569 +; RV64IMZB-NEXT: addi a1, a1, -1911 +; RV64IMZB-NEXT: sext.h a2, a0 +; RV64IMZB-NEXT: mul a1, a2, a1 +; RV64IMZB-NEXT: sraiw a1, a1, 16 +; RV64IMZB-NEXT: add a0, a1, a0 ; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 3 +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 15 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i16 %a, 15 ret i16 %1 @@ -764,32 +1033,66 @@ define i16 @sdiv16_constant_add_srai(i16 %a) nounwind { define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_constant_sub_srai: ; RV32IM: # %bb.0: -; RV32IM-NEXT: li a1, -15 -; RV32IM-NEXT: slli a0, a0, 16 -; RV32IM-NEXT: srai a0, a0, 16 -; RV32IM-NEXT: div a0, a0, a1 +; RV32IM-NEXT: lui a1, 7 +; RV32IM-NEXT: slli a2, a0, 16 +; RV32IM-NEXT: addi a1, a1, 1911 +; RV32IM-NEXT: srai a2, a2, 16 +; RV32IM-NEXT: mul a1, a2, a1 +; RV32IM-NEXT: srai a1, a1, 16 +; RV32IM-NEXT: sub a1, a1, a0 +; RV32IM-NEXT: slli a1, a1, 16 +; RV32IM-NEXT: srai a0, a1, 19 +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srli a1, a1, 16 +; RV32IM-NEXT: srli a1, a1, 15 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV32IMZB-LABEL: sdiv16_constant_sub_srai: ; RV32IMZB: # %bb.0: -; RV32IMZB-NEXT: li a1, -15 -; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: div a0, a0, a1 +; RV32IMZB-NEXT: lui a1, 7 +; RV32IMZB-NEXT: addi a1, a1, 1911 +; RV32IMZB-NEXT: sext.h a2, a0 +; RV32IMZB-NEXT: mul a1, a2, a1 +; RV32IMZB-NEXT: srai a1, a1, 16 +; RV32IMZB-NEXT: sub a1, a1, a0 +; RV32IMZB-NEXT: sext.h a0, a1 +; RV32IMZB-NEXT: srai a0, a0, 3 +; RV32IMZB-NEXT: zext.h a1, a0 +; RV32IMZB-NEXT: srli a1, a1, 15 +; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; ; RV64IM-LABEL: sdiv16_constant_sub_srai: ; RV64IM: # %bb.0: -; RV64IM-NEXT: li a1, -15 -; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 48 -; RV64IM-NEXT: divw a0, a0, a1 +; RV64IM-NEXT: lui a1, 7 +; RV64IM-NEXT: slli a2, a0, 48 +; RV64IM-NEXT: addi a1, a1, 1911 +; RV64IM-NEXT: srai a2, a2, 48 +; RV64IM-NEXT: mul a1, a2, a1 +; RV64IM-NEXT: sraiw a1, a1, 16 +; RV64IM-NEXT: subw a1, a1, a0 +; RV64IM-NEXT: slli a1, a1, 48 +; RV64IM-NEXT: srai a0, a1, 51 +; RV64IM-NEXT: slli a1, a0, 48 +; RV64IM-NEXT: srli a1, a1, 48 +; RV64IM-NEXT: srli a1, a1, 15 +; RV64IM-NEXT: add a0, a0, a1 
; RV64IM-NEXT: ret ; ; RV64IMZB-LABEL: sdiv16_constant_sub_srai: ; RV64IMZB: # %bb.0: -; RV64IMZB-NEXT: li a1, -15 -; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: divw a0, a0, a1 +; RV64IMZB-NEXT: lui a1, 7 +; RV64IMZB-NEXT: addi a1, a1, 1911 +; RV64IMZB-NEXT: sext.h a2, a0 +; RV64IMZB-NEXT: mul a1, a2, a1 +; RV64IMZB-NEXT: sraiw a1, a1, 16 +; RV64IMZB-NEXT: sub a1, a1, a0 +; RV64IMZB-NEXT: sext.h a0, a1 +; RV64IMZB-NEXT: srai a0, a0, 3 +; RV64IMZB-NEXT: zext.h a1, a0 +; RV64IMZB-NEXT: srli a1, a1, 15 +; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: ret %1 = sdiv i16 %a, -15 ret i16 %1
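
As an aid to reading the RISC-V checks above, here is a minimal standalone sketch — not the LLVM implementation — of the mulh-based expansion they encode, instantiated for division by 7 using the constants visible in the sdiv_constant_add_srai checks (lui 599186 + addi 1171 = 0x92492493, post-shift of 2, "add n" fix-up). The helper names mulhs32 and sdiv7_via_mulh are invented for this example, and two's-complement / C++20 shift semantics are assumed.

    #include <cassert>
    #include <cstdint>

    // High half of the 64-bit signed product; this is what G_SMULH / mulh computes.
    static int32_t mulhs32(int32_t a, int32_t b) {
      return static_cast<int32_t>((static_cast<int64_t>(a) * b) >> 32);
    }

    // x/7 rewritten as multiply-high plus fix-ups, mirroring the checked
    // sequence: mulh, add, srai 2, srli 31, add.
    static int32_t sdiv7_via_mulh(int32_t x) {
      int32_t q = mulhs32(x, static_cast<int32_t>(0x92492493)); // magic multiplier
      q += x;                              // correction: the magic constant is negative
      q >>= 2;                             // post-shift (arithmetic)
      q += static_cast<uint32_t>(q) >> 31; // add 1 for negative q: round toward zero
      return q;
    }

    int main() {
      const int32_t tests[] = {0, 1, 6, 7, 8, 100, -1, -6, -7, -8, -100,
                               INT32_MAX, INT32_MIN};
      for (int32_t x : tests)
        assert(sdiv7_via_mulh(x) == x / 7); // matches C/LLVM truncating division
      return 0;
    }

The i8 and i16 variants in the checks follow the same recipe pre-widened to a legal width, which is why the GISel output sign-extends, multiplies by a small magic constant, and then shifts the product back down before applying the sign fix-up.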