diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 845afa6d4228b..546ecaaccee8a 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2229,6 +2229,7 @@ class LSRInstance {
   void NarrowSearchSpaceByDeletingCostlyFormulas();
   void NarrowSearchSpaceByPickingWinnerRegs();
   void NarrowSearchSpaceUsingHeuristics();
+  bool SortLSRUses();
   void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                     Cost &SolutionCost,
                     SmallVectorImpl<const Formula *> &Workspace,
@@ -5368,6 +5369,45 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
     NarrowSearchSpaceByPickingWinnerRegs();
 }
 
+/// Sort LSRUses to address a side effect of the compile-time optimization done
+/// in SolveRecurse, which filters out formulae not including required
+/// registers. That optimization makes the best solution found sensitive to the
+/// order in which LSRUses are processed, so it's important to ensure that this
+/// order isn't random, to avoid fluctuations and sub-optimal results.
+///
+/// Also check that all LSRUses have formulae, as otherwise the situation is
+/// unsolvable.
+bool LSRInstance::SortLSRUses() {
+  SmallVector<LSRUse *> NewOrder;
+  for (LSRUse &LU : Uses) {
+    if (LU.Formulae.empty()) {
+      return false;
+    }
+    NewOrder.push_back(&LU);
+  }
+
+  stable_sort(NewOrder, [](const LSRUse *L, const LSRUse *R) {
+    auto CalcKey = [](const LSRUse *LU) {
+      // LSRUses with many registers and formulae go first, to avoid shrinking
+      // the set of considered solutions too aggressively.
+      return std::tuple(LU->Regs.size(), LU->Formulae.size(), LU->Kind,
+                        LU->MinOffset.getKnownMinValue(),
+                        LU->MaxOffset.getKnownMinValue(),
+                        LU->AllFixupsOutsideLoop, LU->RigidFormula);
+    };
+    return CalcKey(L) > CalcKey(R);
+  });
+
+  SmallVector<LSRUse> NewUses;
+  for (LSRUse *LU : NewOrder)
+    NewUses.push_back(std::move(*LU));
+  Uses = std::move(NewUses);
+
+  LLVM_DEBUG(dbgs() << "\nAfter sorting:\n"; print_uses(dbgs()));
+
+  return true;
+}
+
 /// This is the recursive solver.
 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                                Cost &SolutionCost,
@@ -5387,6 +5427,10 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
 
   const LSRUse &LU = Uses[Workspace.size()];
 
+  assert(!LU.Formulae.empty() &&
+         "LSRUse w/o formulae leads to unsolvable situation so it "
+         "shouldn't be here");
+
   // If this use references any register that's already a part of the
   // in-progress solution, consider it a requirement that a formula must
   // reference that register in order to be considered. This prunes out
@@ -5398,54 +5442,73 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
   SmallPtrSet<const SCEV *, 16> NewRegs;
   Cost NewCost(L, SE, TTI, AMK);
-  for (const Formula &F : LU.Formulae) {
-    // Ignore formulae which may not be ideal in terms of register reuse of
-    // ReqRegs. The formula should use all required registers before
-    // introducing new ones.
-    // This can sometimes (notably when trying to favour postinc) lead to
-    // sub-optimal decisions. There it is best left to the cost modelling to
-    // get correct.
-    if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
-      int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
-      for (const SCEV *Reg : ReqRegs) {
-        if ((F.ScaledReg && F.ScaledReg == Reg) ||
-            is_contained(F.BaseRegs, Reg)) {
-          --NumReqRegsToFind;
-          if (NumReqRegsToFind == 0)
-            break;
-        }
-      }
-      if (NumReqRegsToFind != 0) {
-        // If none of the formulae satisfied the required registers, then we could
-        // clear ReqRegs and try again. Currently, we simply give up in this case.
-        continue;
-      }
-    }
-
-    // Evaluate the cost of the current formula. If it's already worse than
-    // the current best, prune the search at that point.
-    NewCost = CurCost;
-    NewRegs = CurRegs;
-    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
-    if (NewCost.isLess(SolutionCost)) {
-      Workspace.push_back(&F);
-      if (Workspace.size() != Uses.size()) {
-        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
-                     NewRegs, VisitedRegs);
-        if (F.getNumRegs() == 1 && Workspace.size() == 1)
-          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
-      } else {
-        LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
-                   dbgs() << ".\nRegs:\n";
-                   for (const SCEV *S : NewRegs) dbgs()
-                       << "- " << *S << "\n";
-                   dbgs() << '\n');
-
-        SolutionCost = NewCost;
-        Solution = Workspace;
-      }
-      Workspace.pop_back();
-    }
-  }
+  bool FormulaeTested = false;
+  unsigned NumReqRegsToIgnore = 0;
+
+  while (!FormulaeTested) {
+    assert((!NumReqRegsToIgnore || NumReqRegsToIgnore < ReqRegs.size()) &&
+           "at least one formula should have at least one required register");
+
+    for (const Formula &F : LU.Formulae) {
+      // Ignore formulae which may not be ideal in terms of register reuse of
+      // ReqRegs. The formula should use required registers before introducing
+      // new ones. First try the most aggressive option (where the maximum
+      // number of required registers must be used) and then gradually weaken
+      // the requirement while no formula satisfies it.
+      //
+      // This can sometimes (notably when trying to favour postinc) lead to
+      // sub-optimal decisions. There it is best left to the cost modeling to
+      // get correct.
+      if (!ReqRegs.empty() &&
+          (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address)) {
+        unsigned NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
+        bool ReqRegsFound = false;
+        for (const SCEV *Reg : ReqRegs) {
+          if ((F.ScaledReg && F.ScaledReg == Reg) ||
+              is_contained(F.BaseRegs, Reg)) {
+            ReqRegsFound = true;
+            if (--NumReqRegsToFind == NumReqRegsToIgnore)
+              break;
+          }
+        }
+        if (!ReqRegsFound || NumReqRegsToFind != NumReqRegsToIgnore) {
+          continue;
+        }
+      }
+
+      // Evaluate the cost of the current formula. If it's already worse than
+      // the current best, prune the search at that point.
+      FormulaeTested = true;
+      NewCost = CurCost;
+      NewRegs = CurRegs;
+      NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
+      if (NewCost.isLess(SolutionCost)) {
+        Workspace.push_back(&F);
+        if (Workspace.size() != Uses.size()) {
+          SolveRecurse(Solution, SolutionCost, Workspace, NewCost, NewRegs,
+                       VisitedRegs);
+          if (F.getNumRegs() == 1 && Workspace.size() == 1)
+            VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
+        } else {
+          LLVM_DEBUG({
+            dbgs() << "New best at ";
+            NewCost.print(dbgs());
+            dbgs() << ".\nRegs:\n";
+            for (const SCEV *S : NewRegs)
+              dbgs() << "- " << *S << "\n";
+            dbgs() << '\n';
+          });
+          SolutionCost = NewCost;
+          Solution = Workspace;
+        }
+        Workspace.pop_back();
+      }
+    }
+
+    // If no formula had the necessary number of required registers, weaken
+    // the requirement and retry.
+    NumReqRegsToIgnore++;
+  }
 }
@@ -6180,7 +6243,11 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
   NarrowSearchSpaceUsingHeuristics();
 
   SmallVector<const Formula *, 8> Solution;
-  Solve(Solution);
+  bool LSRUsesConsistent = SortLSRUses();
+
+  if (LSRUsesConsistent) {
+    Solve(Solution);
+  }
 
   // Release memory that is no longer needed.
   Factors.clear();
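
For context: the determinism fix in SortLSRUses above is an instance of a general technique, stable-sorting by a descending composite key so that a greedy search visits items in a fixed, meaningful order. Below is a minimal standalone sketch of that idea; the Use struct and its fields are hypothetical stand-ins for LSRUse, not LLVM's actual types.

// sort_key_sketch.cpp -- illustrative only; `Use` is a made-up stand-in.
#include <algorithm>
#include <cstddef>
#include <tuple>
#include <vector>

struct Use {
  std::size_t NumRegs;     // plays the role of LU->Regs.size()
  std::size_t NumFormulae; // plays the role of LU->Formulae.size()
  int Kind;                // further tie-breaker, like LU->Kind
};

void sortUses(std::vector<Use> &Uses) {
  // A stable sort keeps the relative order of equal keys, so the result is
  // fully deterministic. Comparing tuples with > puts uses with more
  // registers and formulae first, mirroring CalcKey(L) > CalcKey(R) above.
  std::stable_sort(Uses.begin(), Uses.end(), [](const Use &L, const Use &R) {
    return std::tuple(L.NumRegs, L.NumFormulae, L.Kind) >
           std::tuple(R.NumRegs, R.NumFormulae, R.Kind);
  });
}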
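
The reworked SolveRecurse loop likewise follows a common pattern: apply the strictest filter first, then weaken it one notch at a time until at least one candidate passes. Here is a self-contained sketch of that control flow, assuming a hypothetical pickCandidate helper and a Covers table rather than LSR's real data structures.

// relaxation_sketch.cpp -- illustrative only; names are hypothetical.
#include <cstddef>
#include <optional>
#include <vector>

// Covers[I] holds how many of the Required items candidate I uses.
// Returns the first candidate covering at least Required - Ignored items,
// starting from the strictest demand (Ignored == 0) and relaxing until some
// candidate passes -- the same shape as the NumReqRegsToIgnore loop above.
std::optional<std::size_t>
pickCandidate(const std::vector<std::size_t> &Covers, std::size_t Required) {
  for (std::size_t Ignored = 0; Ignored <= Required; ++Ignored) {
    for (std::size_t I = 0; I < Covers.size(); ++I)
      if (Covers[I] + Ignored >= Required)
        return I; // a candidate qualified at this strictness
    // No candidate qualified; weaken the requirement and retry.
  }
  return std::nullopt; // only reachable when Covers is empty
}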
diff --git a/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
index 99e0d06bf4218..b54492b38ea79 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
@@ -16,14 +16,14 @@ define i32 @a(i32 %x, ptr nocapture readonly %y, ptr nocapture readonly %z) {
 ; CHECK-IMPLICIT: .p2align 5
 ; CHECK-NEXT: .LBB0_8: // %for.body
 ; CHECK-OBJ;Disassembly of section .text:
-; CHECK-OBJ: 88: 8b0a002a add
+; CHECK-OBJ: 88: 8b0b002b add
 ; CHECK-OBJ-IMPLICIT-NEXT: 8c: d503201f nop
 ; CHECK-OBJ-IMPLICIT-NEXT: 90: d503201f nop
 ; CHECK-OBJ-IMPLICIT-NEXT: 94: d503201f nop
 ; CHECK-OBJ-IMPLICIT-NEXT: 98: d503201f nop
 ; CHECK-OBJ-IMPLICIT-NEXT: 9c: d503201f nop
-; CHECK-OBJ-IMPLICIT-NEXT: a0: b840454b ldr
-; CHECK-OBJ-EXPLICIT-NEXT: 8c: b840454b ldr
+; CHECK-OBJ-IMPLICIT-NEXT: a0: b8404569 ldr
+; CHECK-OBJ-EXPLICIT-NEXT: 8c: b8404569 ldr
 entry:
   %cmp10 = icmp sgt i32 %x, 0
   br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
index 4c8e589391c3a..8ffdc1e8dcb41 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
@@ -33,17 +33,17 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    b.eq .LBB0_8
 ; CHECK-NEXT:  .LBB0_6: // %for.body.preheader1
-; CHECK-NEXT:    lsl x10, x9, #1
+; CHECK-NEXT:    lsl x11, x9, #1
 ; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    add x9, x1, x10
-; CHECK-NEXT:    add x10, x0, x10
+; CHECK-NEXT:    add x10, x1, x11
+; CHECK-NEXT:    add x11, x0, x11
 ; CHECK-NEXT:  .LBB0_7: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr h1, [x10], #2
-; CHECK-NEXT:    ldr h2, [x9]
+; CHECK-NEXT:    ldr h1, [x11], #2
+; CHECK-NEXT:    ldr h2, [x10]
 ; CHECK-NEXT:    subs x8, x8, #1
 ; CHECK-NEXT:    fmadd h1, h1, h0, h2
-; CHECK-NEXT:    str h1, [x9], #2
+; CHECK-NEXT:    str h1, [x10], #2
 ; CHECK-NEXT:    b.ne .LBB0_7
 ; CHECK-NEXT:  .LBB0_8: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
index f6bbdf5d95d87..cb281518d1fd5 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
@@ -36,23 +36,23 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
 ; CHECK-NEXT:  .LBB0_5: // %vector.ph
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    dup v0.8h, w15
-; CHECK-NEXT:    mov x16, x14
-; CHECK-NEXT:    mov x17, x13
-; CHECK-NEXT:    mov x18, x12
+; CHECK-NEXT:    mov x16, x12
+; CHECK-NEXT: 
mov x17, x14 +; CHECK-NEXT: mov x18, x13 ; CHECK-NEXT: .LBB0_6: // %vector.body ; CHECK-NEXT: // Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldp q1, q4, [x16, #-16] -; CHECK-NEXT: subs x18, x18, #16 -; CHECK-NEXT: ldp q3, q2, [x17, #-32] -; CHECK-NEXT: add x16, x16, #32 -; CHECK-NEXT: ldp q6, q5, [x17] +; CHECK-NEXT: ldp q1, q4, [x17, #-16] +; CHECK-NEXT: subs x16, x16, #16 +; CHECK-NEXT: ldp q3, q2, [x18, #-32] +; CHECK-NEXT: add x17, x17, #32 +; CHECK-NEXT: ldp q6, q5, [x18] ; CHECK-NEXT: smlal2 v2.4s, v0.8h, v1.8h ; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h ; CHECK-NEXT: smlal2 v5.4s, v0.8h, v4.8h ; CHECK-NEXT: smlal v6.4s, v0.4h, v4.4h -; CHECK-NEXT: stp q3, q2, [x17, #-32] -; CHECK-NEXT: stp q6, q5, [x17], #64 +; CHECK-NEXT: stp q3, q2, [x18, #-32] +; CHECK-NEXT: stp q6, q5, [x18], #64 ; CHECK-NEXT: b.ne .LBB0_6 ; CHECK-NEXT: // %bb.7: // %middle.block ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 2a37183c47d51..bc78558ec398d 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1666,15 +1666,15 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-NEXT: ldr q0, [x8, lCPI17_0@PAGEOFF] ; CHECK-NEXT: Lloh21: ; CHECK-NEXT: ldr q1, [x9, lCPI17_1@PAGEOFF] -; CHECK-NEXT: add x8, x1, #64 -; CHECK-NEXT: add x9, x0, #8 +; CHECK-NEXT: add x8, x0, #8 +; CHECK-NEXT: add x9, x1, #64 ; CHECK-NEXT: LBB17_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp d2, d3, [x9, #-8] +; CHECK-NEXT: ldp d2, d3, [x8, #-8] ; CHECK-NEXT: subs x10, x10, #16 -; CHECK-NEXT: ldp q7, q5, [x8, #-32] -; CHECK-NEXT: add x9, x9, #16 -; CHECK-NEXT: ldp q17, q6, [x8, #-64] +; CHECK-NEXT: ldp q7, q5, [x9, #-32] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: ldp q17, q6, [x9, #-64] ; CHECK-NEXT: tbl.16b v4, { v2 }, v1 ; CHECK-NEXT: tbl.16b v2, { v2 }, v0 ; CHECK-NEXT: tbl.16b v16, { v3 }, v1 @@ -1682,17 +1682,17 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-NEXT: uaddw2.2d v5, v5, v4 ; CHECK-NEXT: uaddw2.2d v6, v6, v2 ; CHECK-NEXT: uaddw.2d v4, v7, v4 -; CHECK-NEXT: ldp q18, q7, [x8, #32] +; CHECK-NEXT: ldp q18, q7, [x9, #32] ; CHECK-NEXT: uaddw.2d v2, v17, v2 -; CHECK-NEXT: stp q4, q5, [x8, #-32] +; CHECK-NEXT: stp q4, q5, [x9, #-32] ; CHECK-NEXT: uaddw2.2d v5, v7, v16 -; CHECK-NEXT: stp q2, q6, [x8, #-64] +; CHECK-NEXT: stp q2, q6, [x9, #-64] ; CHECK-NEXT: uaddw.2d v16, v18, v16 -; CHECK-NEXT: ldp q7, q6, [x8] -; CHECK-NEXT: stp q16, q5, [x8, #32] +; CHECK-NEXT: ldp q7, q6, [x9] +; CHECK-NEXT: stp q16, q5, [x9, #32] ; CHECK-NEXT: uaddw2.2d v4, v6, v3 ; CHECK-NEXT: uaddw.2d v2, v7, v3 -; CHECK-NEXT: stp q2, q4, [x8], #128 +; CHECK-NEXT: stp q2, q4, [x9], #128 ; CHECK-NEXT: b.ne LBB17_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret @@ -1708,31 +1708,31 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: adrp x9, .LCPI17_1 ; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI17_1 ; CHECK-BE-NEXT: ld1 { v1.16b }, [x9] -; CHECK-BE-NEXT: add x9, x1, #64 -; CHECK-BE-NEXT: add x10, x0, #8 +; CHECK-BE-NEXT: add x9, x0, #8 +; CHECK-BE-NEXT: add x10, x1, #64 ; CHECK-BE-NEXT: .LBB17_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ld1 { v2.8b }, [x10] -; CHECK-BE-NEXT: sub x11, x10, #8 -; CHECK-BE-NEXT: add x15, x9, #32 +; CHECK-BE-NEXT: ld1 { v2.8b }, [x9] 
+; CHECK-BE-NEXT: sub x11, x9, #8 +; CHECK-BE-NEXT: add x15, x10, #32 ; CHECK-BE-NEXT: ld1 { v3.8b }, [x11] ; CHECK-BE-NEXT: ld1 { v16.2d }, [x15] -; CHECK-BE-NEXT: sub x11, x9, #64 -; CHECK-BE-NEXT: sub x12, x9, #32 -; CHECK-BE-NEXT: ld1 { v6.2d }, [x9] +; CHECK-BE-NEXT: sub x11, x10, #64 +; CHECK-BE-NEXT: sub x12, x10, #32 +; CHECK-BE-NEXT: ld1 { v6.2d }, [x10] ; CHECK-BE-NEXT: ld1 { v21.2d }, [x11] ; CHECK-BE-NEXT: tbl v4.16b, { v2.16b }, v1.16b ; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v0.16b ; CHECK-BE-NEXT: ld1 { v19.2d }, [x12] ; CHECK-BE-NEXT: tbl v5.16b, { v3.16b }, v1.16b ; CHECK-BE-NEXT: tbl v3.16b, { v3.16b }, v0.16b -; CHECK-BE-NEXT: sub x13, x9, #16 -; CHECK-BE-NEXT: sub x14, x9, #48 -; CHECK-BE-NEXT: add x16, x9, #48 -; CHECK-BE-NEXT: add x17, x9, #16 +; CHECK-BE-NEXT: sub x13, x10, #16 +; CHECK-BE-NEXT: sub x14, x10, #48 +; CHECK-BE-NEXT: add x16, x10, #48 +; CHECK-BE-NEXT: add x17, x10, #16 ; CHECK-BE-NEXT: ld1 { v22.2d }, [x13] ; CHECK-BE-NEXT: subs x8, x8, #16 -; CHECK-BE-NEXT: add x10, x10, #16 +; CHECK-BE-NEXT: add x9, x9, #16 ; CHECK-BE-NEXT: rev32 v7.8b, v4.8b ; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8 ; CHECK-BE-NEXT: rev32 v17.8b, v2.8b @@ -1753,8 +1753,8 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: uaddw v3.2d, v21.2d, v3.2s ; CHECK-BE-NEXT: st1 { v7.2d }, [x15] ; CHECK-BE-NEXT: ld1 { v7.2d }, [x17] -; CHECK-BE-NEXT: st1 { v6.2d }, [x9] -; CHECK-BE-NEXT: add x9, x9, #128 +; CHECK-BE-NEXT: st1 { v6.2d }, [x10] +; CHECK-BE-NEXT: add x10, x10, #128 ; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s ; CHECK-BE-NEXT: st1 { v5.2d }, [x12] ; CHECK-BE-NEXT: uaddw v5.2d, v22.2d, v17.2s diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index ecbf5dfeb3af1..d0a6255cd0395 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -24,27 +24,27 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_not_b32 s10, s5 -; GFX9-NEXT: s_mul_i32 s9, s6, s5 +; GFX9-NEXT: s_not_b32 s10, s3 +; GFX9-NEXT: s_mul_i32 s9, s6, s3 ; GFX9-NEXT: s_mul_i32 s10, s6, s10 -; GFX9-NEXT: s_add_i32 s11, s5, 1 +; GFX9-NEXT: s_add_i32 s11, s3, 1 ; GFX9-NEXT: s_sub_i32 s9, s7, s9 ; GFX9-NEXT: s_add_i32 s10, s7, s10 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6 -; GFX9-NEXT: s_cselect_b32 s11, s11, s5 +; GFX9-NEXT: s_cselect_b32 s11, s11, s3 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9 ; GFX9-NEXT: s_add_i32 s10, s11, 1 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6 ; GFX9-NEXT: s_cselect_b32 s9, s10, s11 -; GFX9-NEXT: s_add_u32 s10, s0, s2 -; GFX9-NEXT: s_addc_u32 s11, s1, s3 +; GFX9-NEXT: s_add_u32 s10, s0, s4 +; GFX9-NEXT: s_addc_u32 s11, s1, s5 ; GFX9-NEXT: s_add_i32 s7, s7, 1 -; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_add_u32 s4, s4, 4 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_add_u32 s2, s2, 4 +; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_addc_u32 s3, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x1000 ; GFX9-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -72,27 +72,27 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: .LBB0_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b32 s10, s5 -; 
GFX10-NEXT: s_mul_i32 s9, s6, s5 +; GFX10-NEXT: s_not_b32 s10, s3 +; GFX10-NEXT: s_mul_i32 s9, s6, s3 ; GFX10-NEXT: s_mul_i32 s10, s6, s10 ; GFX10-NEXT: s_sub_i32 s9, s7, s9 -; GFX10-NEXT: s_add_i32 s11, s5, 1 +; GFX10-NEXT: s_add_i32 s11, s3, 1 ; GFX10-NEXT: s_add_i32 s10, s7, s10 ; GFX10-NEXT: s_cmp_ge_u32 s9, s6 -; GFX10-NEXT: s_cselect_b32 s11, s11, s5 +; GFX10-NEXT: s_cselect_b32 s11, s11, s3 ; GFX10-NEXT: s_cselect_b32 s9, s10, s9 ; GFX10-NEXT: s_add_i32 s10, s11, 1 ; GFX10-NEXT: s_cmp_ge_u32 s9, s6 ; GFX10-NEXT: s_cselect_b32 s9, s10, s11 -; GFX10-NEXT: s_add_u32 s10, s0, s2 -; GFX10-NEXT: s_addc_u32 s11, s1, s3 +; GFX10-NEXT: s_add_u32 s10, s0, s4 +; GFX10-NEXT: s_addc_u32 s11, s1, s5 ; GFX10-NEXT: s_add_i32 s7, s7, 1 -; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: s_add_u32 s4, s4, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s2, 4 +; GFX10-NEXT: s_add_u32 s2, s2, s8 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x1000 ; GFX10-NEXT: global_store_dword v0, v1, s[10:11] ; GFX10-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 @@ -123,28 +123,27 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB0_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s10, s5 -; GFX11-NEXT: s_mul_i32 s9, s6, s5 +; GFX11-NEXT: s_not_b32 s10, s3 +; GFX11-NEXT: s_mul_i32 s9, s6, s3 ; GFX11-NEXT: s_mul_i32 s10, s6, s10 ; GFX11-NEXT: s_sub_i32 s9, s7, s9 -; GFX11-NEXT: s_add_i32 s11, s5, 1 +; GFX11-NEXT: s_add_i32 s11, s3, 1 ; GFX11-NEXT: s_add_i32 s10, s7, s10 ; GFX11-NEXT: s_cmp_ge_u32 s9, s6 -; GFX11-NEXT: s_cselect_b32 s11, s11, s5 +; GFX11-NEXT: s_cselect_b32 s11, s11, s3 ; GFX11-NEXT: s_cselect_b32 s9, s10, s9 ; GFX11-NEXT: s_add_i32 s10, s11, 1 ; GFX11-NEXT: s_cmp_ge_u32 s9, s6 ; GFX11-NEXT: s_cselect_b32 s9, s10, s11 -; GFX11-NEXT: s_add_u32 s10, s0, s2 -; GFX11-NEXT: s_addc_u32 s11, s1, s3 +; GFX11-NEXT: s_add_u32 s10, s0, s4 +; GFX11-NEXT: s_addc_u32 s11, s1, s5 ; GFX11-NEXT: s_add_i32 s7, s7, 1 -; GFX11-NEXT: s_add_u32 s4, s4, s8 +; GFX11-NEXT: s_add_u32 s4, s4, 4 ; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s2, s2, 4 +; GFX11-NEXT: s_add_u32 s2, s2, s8 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x1000 ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] ; GFX11-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 @@ -187,8 +186,8 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_not_b32 s10, s5 -; GFX9-NEXT: s_mul_i32 s9, s6, s5 +; GFX9-NEXT: s_not_b32 s10, s3 +; GFX9-NEXT: s_mul_i32 s9, s6, s3 ; GFX9-NEXT: s_mul_i32 s10, s6, s10 ; GFX9-NEXT: s_sub_i32 s9, s7, s9 ; GFX9-NEXT: s_add_i32 s10, s7, s10 @@ -197,15 +196,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_sub_i32 s10, s9, s6 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9 -; GFX9-NEXT: s_add_u32 s10, s0, s2 -; GFX9-NEXT: s_addc_u32 s11, s1, s3 +; GFX9-NEXT: s_add_u32 s10, s0, s4 +; GFX9-NEXT: s_addc_u32 s11, s1, s5 ; GFX9-NEXT: s_add_i32 s7, s7, 1 -; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_add_u32 s4, 
s4, 4 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_add_u32 s2, s2, 4 +; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_addc_u32 s3, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x1000 ; GFX9-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -232,9 +231,9 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-NEXT: .LBB1_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_not_b32 s9, s5 +; GFX10-NEXT: s_not_b32 s9, s3 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mul_i32 s10, s6, s5 +; GFX10-NEXT: s_mul_i32 s10, s6, s3 ; GFX10-NEXT: s_mul_i32 s9, s6, s9 ; GFX10-NEXT: s_sub_i32 s10, s7, s10 ; GFX10-NEXT: s_add_i32 s9, s7, s9 @@ -243,15 +242,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_sub_i32 s10, s9, s6 ; GFX10-NEXT: s_cmp_ge_u32 s9, s6 ; GFX10-NEXT: s_cselect_b32 s9, s10, s9 -; GFX10-NEXT: s_add_u32 s10, s0, s2 -; GFX10-NEXT: s_addc_u32 s11, s1, s3 +; GFX10-NEXT: s_add_u32 s10, s0, s4 +; GFX10-NEXT: s_addc_u32 s11, s1, s5 ; GFX10-NEXT: s_add_i32 s7, s7, 1 -; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: s_add_u32 s4, s4, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s2, 4 +; GFX10-NEXT: s_add_u32 s2, s2, s8 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x1000 ; GFX10-NEXT: global_store_dword v0, v1, s[10:11] ; GFX10-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 @@ -282,9 +281,8 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB1_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s9, s5 -; GFX11-NEXT: s_mul_i32 s10, s6, s5 +; GFX11-NEXT: s_not_b32 s9, s3 +; GFX11-NEXT: s_mul_i32 s10, s6, s3 ; GFX11-NEXT: s_mul_i32 s9, s6, s9 ; GFX11-NEXT: s_sub_i32 s10, s7, s10 ; GFX11-NEXT: s_add_i32 s9, s7, s9 @@ -294,15 +292,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_sub_i32 s10, s9, s6 ; GFX11-NEXT: s_cmp_ge_u32 s9, s6 ; GFX11-NEXT: s_cselect_b32 s9, s10, s9 -; GFX11-NEXT: s_add_u32 s10, s0, s2 -; GFX11-NEXT: s_addc_u32 s11, s1, s3 +; GFX11-NEXT: s_add_u32 s10, s0, s4 +; GFX11-NEXT: s_addc_u32 s11, s1, s5 ; GFX11-NEXT: s_add_i32 s7, s7, 1 -; GFX11-NEXT: s_add_u32 s4, s4, s8 +; GFX11-NEXT: s_add_u32 s4, s4, 4 ; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s2, s2, 4 +; GFX11-NEXT: s_add_u32 s2, s2, s8 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000 +; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x1000 ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] ; GFX11-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 9cc42ac448067..9adad3e6ddb18 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -12628,8 +12628,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: 
v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo -; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 ; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(35) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 @@ -14296,7 +14296,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x3a +; ALIGNED-NEXT: s_clause 0x39 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 @@ -14355,55 +14355,54 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(56) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(55) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(54) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(50) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(48) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(47) -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: 
buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 @@ -14412,27 +14411,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(24) +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(22) +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(17) +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; 
ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -14442,13 +14441,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 @@ -14457,13 +14456,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -14741,10 +14740,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 @@ -14756,8 +14753,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -14770,85 +14765,90 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v127 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v125, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:159 ; ALIGNED-NEXT: buffer_load_ubyte v106, v4, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v120, 8, v121 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v109, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v107, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v94, 8, v105 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v93, 8, v105 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v91 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:160 ; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v73, v4, s[0:3], 0 offen offset:162 ; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:163 ; ALIGNED-NEXT: buffer_load_ubyte v88, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:166 ; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v75 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v76 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v77 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 @@ -15046,19 +15046,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; 
ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v123, v3, 16, v2 +; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: s_waitcnt vmcnt(23) +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(28) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v25, 8, v27 -; ALIGNED-NEXT: s_waitcnt vmcnt(21) +; ALIGNED-NEXT: s_waitcnt vmcnt(26) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v24, 8, v26 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v43, v12, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v57, v8, 8, v10 ; ALIGNED-NEXT: v_lshl_or_b32 v104, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v21, 8, v22 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v20 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v76, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 ; ALIGNED-NEXT: v_lshl_or_b32 v101, v3, 16, v2 @@ -15066,60 +15078,46 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v84, v43, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v43, v9, 8, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v57, 16, v43 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v43, v5, 8, v6 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v57, v7, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v57, 16, v43 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v43, v43, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v57, 
off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v57, v57, 8, v127 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v57, v57, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 ; ALIGNED-NEXT: v_lshl_or_b32 v43, v90, 8, v78 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v124, 8, v91 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v124, 8, v92 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v107, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v43, v109, 8, v122 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v108, 8, v110 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v108, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v91, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v43, v92, 8, v95 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v93, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v94, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 @@ -15158,7 +15156,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v16 offset:246 ; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 ; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:504 ; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 @@ -15288,12 +15286,12 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:174 ; 
ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:172 ; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:168 -; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:162 +; ALIGNED-NEXT: flat_store_byte v[2:3], v73 offset:162 ; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:163 ; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:161 ; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:167 -; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:165 -; ALIGNED-NEXT: flat_store_byte v[2:3], v76 offset:166 +; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:165 +; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:166 ; ALIGNED-NEXT: flat_store_byte v[2:3], v88 offset:164 ; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload @@ -15308,18 +15306,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:154 +; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:154 ; ALIGNED-NEXT: flat_store_byte v[2:3], v106 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:159 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:157 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:158 -; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:156 +; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:153 +; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:159 +; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:157 +; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:158 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:156 ; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:147 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload @@ -15329,10 +15325,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload @@ -15823,28 +15819,30 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656
 ; ALIGNED-NEXT: flat_store_byte v[2:3], v90 offset:10
-; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:11
-; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:13
-; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:9
+; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:11
+; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:13
+; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:9
 ; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:15
-; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:14
-; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:12
+; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:14
+; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:12
 ; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:8
-; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:3
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1
 ; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:7
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:6
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
@@ -16005,8 +16003,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:44
 ; UNROLL3-NEXT: v_add_co_u32 v15, vcc_lo, v0, s4
 ; UNROLL3-NEXT: v_add_co_ci_u32_e64 v16, null, s5, v1, vcc_lo
-; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2
 ; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0
+; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2
 ; UNROLL3-NEXT: s_addc_u32 s5, s5, -1
 ; UNROLL3-NEXT: s_waitcnt vmcnt(4)
 ; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[7:10] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
index 272daa9dd0b59..9f627820369eb 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -331,9 +331,9 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v10, v1
 ; CHECK-NEXT: v_mov_b32_e32 v12, v6
+; CHECK-NEXT: v_mov_b32_e32 v4, v2
 ; CHECK-NEXT: v_mov_b32_e32 v9, v0
 ; CHECK-NEXT: v_mov_b32_e32 v11, v5
-; CHECK-NEXT: v_mov_b32_e32 v4, v2
 ; CHECK-NEXT: s_mov_b32 s9, 0
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop
@@ -378,9 +378,9 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: .LBB2_9: ; %Flow32
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
 ; CHECK-NEXT: ; implicit-def: $vgpr2
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT: s_cbranch_execz .LBB2_2
@@ -389,11 +389,11 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: s_cbranch_execz .LBB2_13
 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
 ; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4
-; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
-; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v4, v1, s4
 ; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v4, s4
+; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB2_12: ; %memmove_bwd_residual_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -623,9 +623,9 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v10, v1
 ; CHECK-NEXT: v_mov_b32_e32 v12, v6
+; CHECK-NEXT: v_mov_b32_e32 v4, v2
 ; CHECK-NEXT: v_mov_b32_e32 v9, v0
 ; CHECK-NEXT: v_mov_b32_e32 v11, v5
-; CHECK-NEXT: v_mov_b32_e32 v4, v2
 ; CHECK-NEXT: s_mov_b32 s9, 0
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB4_5: ; %memmove_fwd_main_loop
@@ -674,9 +674,9 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: .LBB4_9: ; %Flow32
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
 ; CHECK-NEXT: ; implicit-def: $vgpr2
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT: s_cbranch_execz .LBB4_2
@@ -685,11 +685,11 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: s_cbranch_execz .LBB4_13
 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
 ; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4
-; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
-; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v4, v1, s4
 ; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v4, s4
+; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB4_12: ; %memmove_bwd_residual_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1079,8 +1079,8 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
-; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
+; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
 ; CHECK-NEXT: .LBB7_5: ; %loop-memcpy-residual
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ds_read_u8 v7, v2
@@ -1296,8 +1296,8 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
-; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
+; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
 ; CHECK-NEXT: .LBB9_5: ; %loop-memcpy-residual
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen
@@ -1354,9 +1354,9 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v10, v2
 ; CHECK-NEXT: v_mov_b32_e32 v12, v8
+; CHECK-NEXT: v_mov_b32_e32 v4, v0
 ; CHECK-NEXT: v_mov_b32_e32 v9, v1
 ; CHECK-NEXT: v_mov_b32_e32 v11, v7
-; CHECK-NEXT: v_mov_b32_e32 v4, v0
 ; CHECK-NEXT: s_mov_b32 s9, 0
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop
@@ -1379,31 +1379,31 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: s_cbranch_execz .LBB10_9
 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: v_add_co_u32 v1, s5, v1, v7
+; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v8, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: s_mov_b32 s9, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3
-; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v8, s5
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB10_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flat_load_ubyte v2, v[0:1]
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2]
 ; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v6, s5
-; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s5
+; CHECK-NEXT: v_add_co_u32 v1, s5, v1, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s5
 ; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6]
 ; CHECK-NEXT: s_or_b32 s9, s6, s9
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b8 v3, v2
-; CHECK-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; CHECK-NEXT: ds_write_b8 v0, v3
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT: s_cbranch_execnz .LBB10_8
 ; CHECK-NEXT: .LBB10_9: ; %Flow32
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
+; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2
 ; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
 ; CHECK-NEXT: ; implicit-def: $vgpr0
-; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2
 ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT: s_cbranch_execz .LBB10_2
@@ -1412,11 +1412,11 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: s_cbranch_execz .LBB10_13
 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
 ; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4
-; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
-; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v4, v2, s4
 ; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v4, s4
+; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB10_12: ; %memmove_bwd_residual_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1507,8 +1507,8 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
-; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: .LBB11_5: ; %loop-memcpy-residual
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
@@ -1701,8 +1701,8 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
-; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: .LBB13_5: ; %loop-memcpy-residual
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
@@ -1829,9 +1829,9 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
 ; CHECK-NEXT: v_mov_b32_e32 v10, v2
 ; CHECK-NEXT: v_mov_b32_e32 v12, v8
+; CHECK-NEXT: v_mov_b32_e32 v4, v0
 ; CHECK-NEXT: v_mov_b32_e32 v9, v1
 ; CHECK-NEXT: v_mov_b32_e32 v11, v7
-; CHECK-NEXT: v_mov_b32_e32 v4, v0
 ; CHECK-NEXT: s_mov_b32 s8, 0
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB15_5: ; %memmove_fwd_main_loop
@@ -1857,31 +1857,31 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: s_cbranch_execz .LBB15_9
 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: v_add_co_u32 v1, s5, v1, v7
+; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v8, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: s_mov_b32 s9, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3
-; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v8, s5
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB15_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flat_load_ubyte v2, v[0:1]
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2]
 ; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v6, s5
-; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s5
+; CHECK-NEXT: v_add_co_u32 v1, s5, v1, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s5
 ; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6]
 ; CHECK-NEXT: s_or_b32 s9, s6, s9
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen
-; CHECK-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT: s_cbranch_execnz .LBB15_8
 ; CHECK-NEXT: .LBB15_9: ; %Flow32
 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
+; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2
 ; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
 ; CHECK-NEXT: ; implicit-def: $vgpr0
-; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2
 ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT: s_cbranch_execz .LBB15_2
@@ -1890,11 +1890,11 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: s_cbranch_execz .LBB15_13
 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
 ; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4
-; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
-; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v4, v2, s4
 ; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v4, s4
+; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB15_12: ; %memmove_bwd_residual_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1991,8 +1991,8 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
-; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: .LBB16_5: ; %loop-memcpy-residual
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
@@ -2129,8 +2129,8 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
-; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT: .LBB18_5: ; %loop-memcpy-residual
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 803cae4a7f9cd..e78c31e22e87c 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -63,10 +63,10 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6
 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v0
 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6
-; GFX9-NEXT: v_lshl_add_u32 v6, v4, 2, v3
+; GFX9-NEXT: v_add_u32_e32 v6, v17, v12
 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v2
-; GFX9-NEXT: v_add_u32_e32 v9, v17, v12
+; GFX9-NEXT: v_lshl_add_u32 v8, v4, 2, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 2, v2
 ; GFX9-NEXT: s_mov_b64 s[10:11], 0
 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
 ; GFX9-NEXT: ; implicit-def: $vgpr3
@@ -77,16 +77,16 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
 ; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac
 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5
-; GFX9-NEXT: v_add_u32_e32 v19, v3, v16
-; GFX9-NEXT: v_add_u32_e32 v3, v9, v0
-; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18
+; GFX9-NEXT: v_add_u32_e32 v3, v3, v16
+; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, v13
+; GFX9-NEXT: v_mul_lo_u32 v3, v3, v15
 ; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18
-; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v13
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4]
 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14
 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18
+; GFX9-NEXT: v_add3_u32 v3, v6, v0, v3
 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v18, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
 ; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4]
 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18
@@ -96,8 +96,8 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
 ; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
-; GFX9-NEXT: ds_write_b32 v6, v3
-; GFX9-NEXT: v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT: ds_write_b32 v8, v3
+; GFX9-NEXT: v_add_u32_e32 v8, v8, v9
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GFX9-NEXT: s_cbranch_execnz .LBB1_2
 ; GFX9-NEXT: .LBB1_3: ; %Flow2
diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
index 944951d3a536a..b10d8551a2eef 100644
--- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
@@ -11,13 +11,13 @@ define amdgpu_kernel void @simple_barrier(ptr addrspace(1) %arg) {
 ; CHECK-LABEL: @simple_barrier(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0:![0-9]+]]
 ; CHECK-NEXT: fence syncscope("workgroup") release
 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT: fence syncscope("workgroup") acquire
 ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
-; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
-; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]]
+; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber [[META0]]
 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
@@ -55,17 +55,17 @@ bb:
 define amdgpu_kernel void @memory_phi_no_clobber(ptr addrspace(1) %arg, i1 %cond) {
 ; CHECK-LABEL: @memory_phi_no_clobber(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT: br i1 %cond, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: if.then:
 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
-; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: if.else:
 ; CHECK-NEXT: fence syncscope("workgroup") release
-; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0
+; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform [[META0]]
 ; CHECK: if.end:
-; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
-; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]]
+; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber [[META0]]
 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
@@ -121,17 +121,17 @@ if.end:
 define amdgpu_kernel void @memory_phi_clobber1(ptr addrspace(1) %arg, i1 %cond) {
 ; CHECK-LABEL: @memory_phi_clobber1(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT: br i1 %cond, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: if.then:
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
 ; CHECK-NEXT: store i32 1, ptr addrspace(1) [[GEP]], align 4
-; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: if.else:
 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
-; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0
+; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform [[META0]]
 ; CHECK: if.end:
-; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]]
 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
@@ -190,17 +190,17 @@ if.end:
 define amdgpu_kernel void @memory_phi_clobber2(ptr addrspace(1) %arg, i1 %cond) {
 ; CHECK-LABEL: @memory_phi_clobber2(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT: br i1 %cond, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: if.then:
 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
-; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: if.else:
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
 ; CHECK-NEXT: store i32 1, ptr addrspace(1) [[GEP]], align 4
-; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0
+; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform [[META0]]
 ; CHECK: if.end:
-; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]]
 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
@@ -259,16 +259,16 @@ if.end:
 define amdgpu_kernel void @no_clobbering_loop1(ptr addrspace(1) %arg, i1 %cc) {
 ; CHECK-LABEL: @no_clobbering_loop1(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: while.cond:
-; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
-; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]]
+; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber [[META0]]
 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
 ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
-; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: end:
 ; CHECK-NEXT: ret void
 ;
@@ -316,18 +316,18 @@ end:
 define amdgpu_kernel void @no_clobbering_loop2(ptr addrspace(1) noalias %arg, ptr addrspace(1) noalias %out, i32 %n) {
 ; CHECK-LABEL: @no_clobbering_loop2(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: while.cond:
 ; CHECK-NEXT: [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
 ; CHECK-NEXT: [[ACC:%.*]] = phi i32 [ [[I]], [[BB]] ], [ [[I3:%.*]], [[WHILE_COND]] ]
-; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i32 [[C]], !amdgpu.uniform !0
-; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i32 [[C]], !amdgpu.uniform [[META0]]
+; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber [[META0]]
 ; CHECK-NEXT: [[I3]] = add i32 [[I2]], [[ACC]]
 ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[C]], 1
 ; CHECK-NEXT: [[CC:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: end:
 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT: ret void
@@ -341,12 +341,12 @@ define amdgpu_kernel void @no_clobbering_loop2(ptr addrspace(1) noalias %arg, pt
 ; GCN-NEXT: .LBB5_1: ; %while.cond
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: s_load_dword s5, s[0:1], 0x0
+; GCN-NEXT: s_add_i32 s6, s6, -1
 ; GCN-NEXT: ; wave barrier
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_add_i32 s4, s5, s4
 ; GCN-NEXT: s_add_u32 s0, s0, 4
 ; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_add_i32 s6, s6, -1
 ; GCN-NEXT: s_cmp_eq_u32 s6, 0
 ; GCN-NEXT: s_cbranch_scc1 .LBB5_1
 ; GCN-NEXT: ; %bb.2: ; %end
@@ -377,16 +377,16 @@ end:
 define amdgpu_kernel void @clobbering_loop(ptr addrspace(1) %arg, ptr addrspace(1) %out, i1 %cc) {
 ; CHECK-LABEL: @clobbering_loop(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: while.cond:
-; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform [[META0]]
 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT:%.*]], i64 1
 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
 ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
-; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
+; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform [[META0]]
 ; CHECK: end:
 ; CHECK-NEXT: ret void
 ;
@@ -433,10 +433,10 @@ end:
 define amdgpu_kernel void @clobber_by_atomic_load(ptr addrspace(1) %arg) {
 ; CHECK-LABEL: @clobber_by_atomic_load(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2, !amdgpu.uniform !0
-; CHECK-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[GEP]] seq_cst, align 4, !amdgpu.noclobber !0
-; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3, !amdgpu.uniform !0
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2, !amdgpu.uniform [[META0]]
+; CHECK-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[GEP]] seq_cst, align 4, !amdgpu.noclobber [[META0]]
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3, !amdgpu.uniform [[META0]]
 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 4
@@ -477,7 +477,7 @@ define protected amdgpu_kernel void @no_alias_store(ptr addrspace(1) %in, ptr ad
 ; CHECK-NEXT: fence syncscope("workgroup") release
 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT: fence syncscope("workgroup") acquire
-; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]]
 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -543,7 +543,7 @@ define protected amdgpu_kernel void @no_alias_volatile_store(ptr addrspace(1) %i
 ; CHECK-NEXT: fence syncscope("workgroup") release
 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT: fence syncscope("workgroup") acquire
-; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]]
 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -573,7 +573,7 @@ define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(ptr addrspace(1
 ; CHECK-LABEL: @no_alias_atomic_rmw_relaxed(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 monotonic, align 4
-; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]]
 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -603,7 +603,7 @@ define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(ptr addrspace(1) %i
 ; CHECK-NEXT: fence syncscope("workgroup") release
 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT: fence syncscope("workgroup") acquire
-; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]]
 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -640,7 +640,7 @@ define protected amdgpu_kernel void @no_alias_atomic_rmw(ptr addrspace(1) %in, p
 ; CHECK-NEXT: fence syncscope("workgroup") release
 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT: fence syncscope("workgroup") acquire
-; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]]
 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -791,7 +791,7 @@ define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(ptr
 ; CHECK-NEXT: fence syncscope("workgroup") release
 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT: fence syncscope("workgroup") acquire
-; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber [[META0]]
 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll b/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
index 71a1678669a80..456468d793c09 100644
--- a/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
+++ b/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
@@ -22,14 +22,12 @@ define void @main(i8 %val8) nounwind "frame-pointer"="none" {
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ldr r1, [r0, #-21]
 ; CHECK-NEXT: ldr r2, [r0, #-17]
+; CHECK-NEXT: adds r0, #12
 ; CHECK-NEXT: muls r1, r2, r1
 ; CHECK-NEXT: cmp r1, #0
-; CHECK-NEXT: it ne
-; CHECK-NEXT: bxne lr
-; CHECK-NEXT: LBB0_2: @ %_Z14printIsNotZeroi.exit17.for.body_crit_edge
-; CHECK-NEXT: @ in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: adds r0, #12
-; CHECK-NEXT: b LBB0_1
+; CHECK-NEXT: beq LBB0_1
+; CHECK-NEXT: @ %bb.2: @ %if.then.i16
+; CHECK-NEXT: bx lr
 for.body.lr.ph:
 br label %for.body
diff --git a/llvm/test/CodeGen/ARM/dsp-loop-indexing.ll b/llvm/test/CodeGen/ARM/dsp-loop-indexing.ll
index 9fb64471e9881..54b265ec22546 100644
--- a/llvm/test/CodeGen/ARM/dsp-loop-indexing.ll
+++ b/llvm/test/CodeGen/ARM/dsp-loop-indexing.ll
@@ -80,12 +80,12 @@ exit:
 ; CHECK-DEFAULT: str{{.*}}, #-4]
 ; CHECK-DEFAULT: sub{{.*}}, #8
-; CHECK-COMPLEX: ldr{{.*}} lsl #2]
-; CHECK-COMPLEX: ldr{{.*}} lsl #2]
-; CHECK-COMPLEX: str{{.*}} lsl #2]
-; CHECK-COMPLEX: ldr{{.*}} lsl #2]
-; CHECK-COMPLEX: ldr{{.*}} lsl #2]
-; CHECK-COMPLEX: str{{.*}} lsl #2]
+; CHECK-COMPLEX: ldr{{.*}} [[MEM:\[r[0-9]+\]]]
+; CHECK-COMPLEX: ldr{{.*}} [[MEM:\[r[0-9]+\]]]
+; CHECK-COMPLEX: str{{.*}} [[MEM:\[r[0-9]+\]]]
+; CHECK-COMPLEX: ldr{{.*}} #-4]
+; CHECK-COMPLEX: ldr{{.*}} #-4]
+; CHECK-COMPLEX: str{{.*}} #-4]
 ; DISABLED-NOT: ldr{{.*}}]!
 ; DISABLED-NOT: str{{.*}}]!
diff --git a/llvm/test/CodeGen/ARM/fpclamptosat.ll b/llvm/test/CodeGen/ARM/fpclamptosat.ll
index 8ab56b228d2a7..c63f0a6313698 100644
--- a/llvm/test/CodeGen/ARM/fpclamptosat.ll
+++ b/llvm/test/CodeGen/ARM/fpclamptosat.ll
@@ -3658,25 +3658,28 @@ define void @unroll_maxmin(ptr nocapture %0, ptr nocapture readonly %1, i32 %2)
 ;
 ; VFP2-LABEL: unroll_maxmin:
 ; VFP2: @ %bb.0:
-; VFP2-NEXT: subs r1, #8
+; VFP2-NEXT: .save {r7, lr}
+; VFP2-NEXT: push {r7, lr}
 ; VFP2-NEXT: subs r0, #8
 ; VFP2-NEXT: vldr s0, .LCPI54_0
-; VFP2-NEXT: mov.w r2, #1024
+; VFP2-NEXT: mvn r2, #7
+; VFP2-NEXT: movw r12, #4088
 ; VFP2-NEXT: .LBB54_1: @ =>This Inner Loop Header: Depth=1
-; VFP2-NEXT: vldr s2, [r1, #8]
-; VFP2-NEXT: subs r2, #2
+; VFP2-NEXT: add.w lr, r1, r2
+; VFP2-NEXT: adds r2, #8
+; VFP2-NEXT: cmp r2, r12
+; VFP2-NEXT: vldr s2, [lr, #8]
 ; VFP2-NEXT: vmul.f32 s2, s2, s0
 ; VFP2-NEXT: vcvt.s32.f32 s2, s2
 ; VFP2-NEXT: vmov r3, s2
 ; VFP2-NEXT: str r3, [r0, #8]!
-; VFP2-NEXT: vldr s2, [r1, #12]
-; VFP2-NEXT: add.w r1, r1, #8
+; VFP2-NEXT: vldr s2, [lr, #12]
 ; VFP2-NEXT: vmul.f32 s2, s2, s0
 ; VFP2-NEXT: vcvt.s32.f32 s2, s2
 ; VFP2-NEXT: vstr s2, [r0, #4]
 ; VFP2-NEXT: bne .LBB54_1
 ; VFP2-NEXT: @ %bb.2:
-; VFP2-NEXT: bx lr
+; VFP2-NEXT: pop {r7, pc}
 ; VFP2-NEXT: .p2align 2
 ; VFP2-NEXT: @ %bb.3:
 ; VFP2-NEXT: .LCPI54_0:
@@ -3687,17 +3690,18 @@ define void @unroll_maxmin(ptr nocapture %0, ptr nocapture readonly %1, i32 %2)
 ; FULL-NEXT: .save {r7, lr}
 ; FULL-NEXT: push {r7, lr}
 ; FULL-NEXT: mov.w lr, #512
-; FULL-NEXT: subs r1, #8
 ; FULL-NEXT: subs r0, #8
 ; FULL-NEXT: vldr s0, .LCPI54_0
+; FULL-NEXT: mvn r2, #7
 ; FULL-NEXT: .LBB54_1: @ =>This Inner Loop Header: Depth=1
-; FULL-NEXT: vldr s2, [r1, #8]
+; FULL-NEXT: add.w r12, r1, r2
+; FULL-NEXT: adds r2, #8
+; FULL-NEXT: vldr s2, [r12, #8]
 ; FULL-NEXT: vmul.f32 s2, s2, s0
 ; FULL-NEXT: vcvt.s32.f32 s2, s2
-; FULL-NEXT: vmov r2, s2
-; FULL-NEXT: str r2, [r0, #8]!
-; FULL-NEXT: vldr s2, [r1, #12]
-; FULL-NEXT: adds r1, #8
+; FULL-NEXT: vmov r3, s2
+; FULL-NEXT: str r3, [r0, #8]!
+; FULL-NEXT: vldr s2, [r12, #12]
 ; FULL-NEXT: vmul.f32 s2, s2, s0
 ; FULL-NEXT: vcvt.s32.f32 s2, s2
 ; FULL-NEXT: vstr s2, [r0, #4]
@@ -3829,25 +3833,28 @@ define void @unroll_minmax(ptr nocapture %0, ptr nocapture readonly %1, i32 %2)
 ;
 ; VFP2-LABEL: unroll_minmax:
 ; VFP2: @ %bb.0:
-; VFP2-NEXT: subs r1, #8
+; VFP2-NEXT: .save {r7, lr}
+; VFP2-NEXT: push {r7, lr}
 ; VFP2-NEXT: subs r0, #8
 ; VFP2-NEXT: vldr s0, .LCPI55_0
-; VFP2-NEXT: mov.w r2, #1024
+; VFP2-NEXT: mvn r2, #7
+; VFP2-NEXT: movw r12, #4088
 ; VFP2-NEXT: .LBB55_1: @ =>This Inner Loop Header: Depth=1
-; VFP2-NEXT: vldr s2, [r1, #8]
-; VFP2-NEXT: subs r2, #2
+; VFP2-NEXT: add.w lr, r1, r2
+; VFP2-NEXT: adds r2, #8
+; VFP2-NEXT: cmp r2, r12
+; VFP2-NEXT: vldr s2, [lr, #8]
 ; VFP2-NEXT: vmul.f32 s2, s2, s0
 ; VFP2-NEXT: vcvt.s32.f32 s2, s2
 ; VFP2-NEXT: vmov r3, s2
 ; VFP2-NEXT: str r3, [r0, #8]!
-; VFP2-NEXT: vldr s2, [r1, #12]
-; VFP2-NEXT: add.w r1, r1, #8
+; VFP2-NEXT: vldr s2, [lr, #12]
 ; VFP2-NEXT: vmul.f32 s2, s2, s0
 ; VFP2-NEXT: vcvt.s32.f32 s2, s2
 ; VFP2-NEXT: vstr s2, [r0, #4]
 ; VFP2-NEXT: bne .LBB55_1
 ; VFP2-NEXT: @ %bb.2:
-; VFP2-NEXT: bx lr
+; VFP2-NEXT: pop {r7, pc}
 ; VFP2-NEXT: .p2align 2
 ; VFP2-NEXT: @ %bb.3:
 ; VFP2-NEXT: .LCPI55_0:
@@ -3858,17 +3865,18 @@ define void @unroll_minmax(ptr nocapture %0, ptr nocapture readonly %1, i32 %2)
 ; FULL-NEXT: .save {r7, lr}
 ; FULL-NEXT: push {r7, lr}
 ; FULL-NEXT: mov.w lr, #512
-; FULL-NEXT: subs r1, #8
 ; FULL-NEXT: subs r0, #8
 ; FULL-NEXT: vldr s0, .LCPI55_0
+; FULL-NEXT: mvn r2, #7
 ; FULL-NEXT: .LBB55_1: @ =>This Inner Loop Header: Depth=1
-; FULL-NEXT: vldr s2, [r1, #8]
+; FULL-NEXT: add.w r12, r1, r2
+; FULL-NEXT: adds r2, #8
+; FULL-NEXT: vldr s2, [r12, #8]
 ; FULL-NEXT: vmul.f32 s2, s2, s0
 ; FULL-NEXT: vcvt.s32.f32 s2, s2
-; FULL-NEXT: vmov r2, s2
-; FULL-NEXT: str r2, [r0, #8]!
-; FULL-NEXT: vldr s2, [r1, #12]
-; FULL-NEXT: adds r1, #8
+; FULL-NEXT: vmov r3, s2
+; FULL-NEXT: str r3, [r0, #8]!
+; FULL-NEXT: vldr s2, [r12, #12]
 ; FULL-NEXT: vmul.f32 s2, s2, s0
 ; FULL-NEXT: vcvt.s32.f32 s2, s2
 ; FULL-NEXT: vstr s2, [r0, #4]
diff --git a/llvm/test/CodeGen/ARM/loop-indexing.ll b/llvm/test/CodeGen/ARM/loop-indexing.ll
index bb859b202bbc0..b48f2a406b195 100644
--- a/llvm/test/CodeGen/ARM/loop-indexing.ll
+++ b/llvm/test/CodeGen/ARM/loop-indexing.ll
@@ -740,7 +740,7 @@ for.cond.cleanup: ; preds = %for.cond1.for.cond.
 ; CHECK-DEFAULT-NOT: str{{.*}}]!
 ; TODO: Increased complexity shouldn't prevent indexed accesses.
-; CHECK-COMPLEX-NOT: ldr{{.*}}]!
+; CHECK-COMPLEX: ldr{{.*}}, #4]!
 ; CHECK-COMPLEX-NOT: str{{.*}}]!
 ; DISABLED-NOT: ldr{{.*}}]!
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index b6a00e03a80ab..097fa444c74b1 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -611,9 +611,9 @@ define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
 ; SM20-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM20-NEXT: ld.global.b32 %r7, [%rd7];
 ; SM20-NEXT: add.rn.f32 %r9, %r7, %r9;
-; SM20-NEXT: add.s64 %rd7, %rd7, 4;
 ; SM20-NEXT: add.s32 %r8, %r8, -1;
-; SM20-NEXT: setp.ne.b32 %p1, %r8, 0;
+; SM20-NEXT: add.s64 %rd7, %rd7, 4;
+; SM20-NEXT: setp.ne.s32 %p1, %r8, 0;
 ; SM20-NEXT: @%p1 bra $L__BB18_1;
 ; SM20-NEXT: // %bb.2: // %exit
 ; SM20-NEXT: st.global.b32 [%rd2], %r9;
@@ -636,9 +636,9 @@ define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
 ; SM35-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM35-NEXT: ld.global.nc.b32 %r7, [%rd7];
 ; SM35-NEXT: add.rn.f32 %r9, %r7, %r9;
-; SM35-NEXT: add.s64 %rd7, %rd7, 4;
 ; SM35-NEXT: add.s32 %r8, %r8, -1;
-; SM35-NEXT: setp.ne.b32 %p1, %r8, 0;
+; SM35-NEXT: add.s64 %rd7, %rd7, 4;
+; SM35-NEXT: setp.ne.s32 %p1, %r8, 0;
 ; SM35-NEXT: @%p1 bra $L__BB18_1;
 ; SM35-NEXT: // %bb.2: // %exit
 ; SM35-NEXT: st.global.b32 [%rd2], %r9;
diff --git a/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll b/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll
index 79f2ef3e3746a..1a5986711f346 100644
--- a/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll
+++ b/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll
@@ -9,16 +9,16 @@ define void @foo(ptr readonly %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6
 ; CHECK-NEXT: bgelr 0
 ; CHECK-NEXT: # %bb.1: # %.preheader
 ; CHECK-NEXT: std 27, -40(1) # 8-byte Folded Spill
-; CHECK-NEXT: addi 27, 5, 2
+; CHECK-NEXT: addi 27, 5, 3
 ; CHECK-NEXT: std 28, -32(1) # 8-byte Folded Spill
-; CHECK-NEXT: addi 28, 5, 3
+; CHECK-NEXT: addi 28, 5, 2
 ; CHECK-NEXT: std 30, -16(1) # 8-byte Folded Spill
 ; CHECK-NEXT: addi 30, 5, 1
-; CHECK-NEXT: mulld 12, 8, 5
-; CHECK-NEXT: mulld 0, 9, 8
+; CHECK-NEXT: mulld 11, 8, 5
+; CHECK-NEXT: mulld 12, 9, 8
 ; CHECK-NEXT: std 29, -24(1) # 8-byte Folded Spill
 ; CHECK-NEXT: addi 29, 3, 16
-; CHECK-NEXT: sldi 11, 10, 3
+; CHECK-NEXT: sldi 0, 10, 3
 ; CHECK-NEXT: std 22, -80(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 23, -72(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 24, -64(1) # 8-byte Folded Spill
@@ -31,10 +31,10 @@ define void @foo(ptr readonly %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: .LBB0_2:
 ; CHECK-NEXT: add 5, 5, 9
-; CHECK-NEXT: add 12, 12, 0
-; CHECK-NEXT: add 30, 30, 0
-; CHECK-NEXT: add 28, 28, 0
-; CHECK-NEXT: add 8, 8, 0
+; CHECK-NEXT: add 11, 11, 12
+; CHECK-NEXT: add 30, 30, 12
+; CHECK-NEXT: add 28, 28, 12
+; CHECK-NEXT: add 8, 8, 12
 ; CHECK-NEXT: cmpd 5, 7
 ; CHECK-NEXT: bge 0, .LBB0_6
 ; CHECK-NEXT: .LBB0_3: # =>This Loop Header: Depth=1
@@ -43,24 +43,24 @@ define void @foo(ptr readonly %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6
 ; CHECK-NEXT: cmpd 6, 27
 ; CHECK-NEXT: bge 0, .LBB0_2
 ; CHECK-NEXT: # %bb.4:
-; CHECK-NEXT: add 25, 6, 12
-; CHECK-NEXT: add 24, 6, 8
-; CHECK-NEXT: sldi 26, 6, 3
-; CHECK-NEXT: sldi 23, 25, 3
-; CHECK-NEXT: add 25, 6, 30
-; CHECK-NEXT: sldi 24, 24, 3
-; CHECK-NEXT: add 26, 4, 26
-; CHECK-NEXT: sldi 22, 25, 3
-; CHECK-NEXT: add 25, 6, 28
-; CHECK-NEXT: add 24, 29, 24
-; CHECK-NEXT: add 23, 3, 23
+; CHECK-NEXT: add 26, 6, 11
+; CHECK-NEXT: add 25, 6, 8
+; CHECK-NEXT: sldi 24, 6, 3
+; CHECK-NEXT: sldi 23, 26, 3
+; CHECK-NEXT: add 26, 6, 30
 ; CHECK-NEXT: sldi 25, 25, 3
-; CHECK-NEXT: add 22, 3, 22
+; CHECK-NEXT: add 24, 4, 24
+; CHECK-NEXT: sldi 22, 26, 3
+; CHECK-NEXT: add 26, 6, 28
 ; CHECK-NEXT: add 25, 29, 25
+; CHECK-NEXT: add 23, 3, 23
+; CHECK-NEXT: sldi 26, 26, 3
+; CHECK-NEXT: add 22, 3, 22
+; CHECK-NEXT: add 26, 29, 26
 ; CHECK-NEXT: .p2align 5
 ; CHECK-NEXT: .LBB0_5: # Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: lfd 0, 0(26)
+; CHECK-NEXT: lfd 0, 0(24)
 ; CHECK-NEXT: lfd 1, 0(23)
 ; CHECK-NEXT: add 6, 6, 10
 ; CHECK-NEXT: cmpd 6, 27
@@ -70,7 +70,7 @@ define void @foo(ptr readonly %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6
 ; CHECK-NEXT: lfd 1, 16(23)
 ; CHECK-NEXT: xsadddp 0, 0, 1
 ; CHECK-NEXT: lfd 1, 24(23)
-; CHECK-NEXT: add 23, 23, 11
+; CHECK-NEXT: add 23, 23, 0
 ; CHECK-NEXT: xsadddp 0, 0, 1
 ; CHECK-NEXT: lfd 1, 0(22)
 ; CHECK-NEXT: xsadddp 0, 0, 1
@@ -79,16 +79,16 @@ define void @foo(ptr readonly %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6
 ; CHECK-NEXT: lfd 1, 16(22)
 ; CHECK-NEXT: xsadddp 0, 0, 1
 ; CHECK-NEXT: lfd 1, 24(22)
-; CHECK-NEXT: add 22, 22, 11
+; CHECK-NEXT: add 22, 22, 0
 ; CHECK-NEXT: xsadddp 0, 0, 1
-; CHECK-NEXT: lfd 1, -16(24)
+; CHECK-NEXT: lfd 1, -16(26)
 ; CHECK-NEXT: xsadddp 0, 0, 1
-; CHECK-NEXT: lfd 1, -8(24)
+; CHECK-NEXT: lfd 1, -8(26)
 ; CHECK-NEXT: xsadddp 0, 0, 1
-; CHECK-NEXT: lfd 1, 0(24)
+; CHECK-NEXT: lfd 1, 0(26)
 ; CHECK-NEXT: xsadddp 0, 0, 1
-; CHECK-NEXT: lfd 1, 8(24)
-; CHECK-NEXT: add 24, 24, 11
+; CHECK-NEXT: lfd 1, 8(26)
+; CHECK-NEXT: add 26, 26, 0
 ; CHECK-NEXT: xsadddp 0, 0, 1
 ; CHECK-NEXT: lfd 1, -16(25)
 ; CHECK-NEXT: xsadddp 0, 0, 1
@@ -97,10 +97,10 @@ define void @foo(ptr readonly %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6
 ; CHECK-NEXT: lfd 1, 0(25)
 ; CHECK-NEXT: xsadddp 0, 0, 1
 ; CHECK-NEXT: lfd 1, 8(25)
-; CHECK-NEXT: add 25, 25, 11
+; CHECK-NEXT: add 25, 25, 0
 ; CHECK-NEXT: xsadddp 0, 0, 1
-; CHECK-NEXT: stfd 0, 0(26)
-; CHECK-NEXT: add 26, 26, 11
+; CHECK-NEXT: stfd 0, 0(24)
+; CHECK-NEXT: add 24, 24, 0
 ; CHECK-NEXT: blt 0, .LBB0_5
 ; CHECK-NEXT: b .LBB0_2
 ; CHECK-NEXT: .LBB0_6:
diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
index 9f62477ae01df..e3825afd5e778 100644
--- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
@@ -63,97 +63,99 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
 ; CHECK-NEXT: addi 3, 3, 1
 ; CHECK-NEXT: mr 11, 7
 ; CHECK-NEXT: ld 23, 688(1)
-; CHECK-NEXT: ld 7, 728(1)
+; CHECK-NEXT: ld 2, 760(1)
 ; CHECK-NEXT: std 18, 432(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 19, 440(1) # 8-byte Folded Spill
 ; CHECK-NEXT: mr 18, 6
 ; CHECK-NEXT: li 6, 9
 ; CHECK-NEXT: ld 19, 768(1)
-; CHECK-NEXT: ld 2, 760(1)
+; CHECK-NEXT: ld 7, 728(1)
 ; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 27, 504(1) # 8-byte Folded Spill
 ; CHECK-NEXT: cmpldi 3, 9
 ; CHECK-NEXT: ld 27, 816(1)
 ; CHECK-NEXT: ld 26, 808(1)
+; CHECK-NEXT: std 16, 416(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 17, 424(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 17, 752(1)
+; CHECK-NEXT: ld 16, 744(1)
 ; CHECK-NEXT: std 14, 400(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 15, 408(1) # 8-byte Folded Spill
 ; CHECK-NEXT: ld 15, 736(1)
 ; CHECK-NEXT: lxv 39, 0(8)
-; CHECK-NEXT: std 30, 528(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 31, 536(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 30, 704(1)
-; CHECK-NEXT: lxv 38, 0(9)
 ; CHECK-NEXT: std 20, 448(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 21, 456(1) # 8-byte Folded Spill
 ; CHECK-NEXT: ld 21, 784(1)
 ; CHECK-NEXT: ld 20, 776(1)
+; CHECK-NEXT: std 30, 528(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 31, 536(1) # 8-byte Folded Spill
+; CHECK-NEXT: iselgt 3, 3, 6
+; CHECK-NEXT: ld 6, 696(1)
+; CHECK-NEXT: ld 30, 712(1)
 ; CHECK-NEXT: std 24, 480(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 25, 488(1) # 8-byte Folded Spill
-; CHECK-NEXT: iselgt 3, 3, 6
-; CHECK-NEXT: ld 6, 720(1)
 ; CHECK-NEXT: ld 24, 792(1)
+; CHECK-NEXT: ld 25, 800(1)
 ; CHECK-NEXT: std 10, 72(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 7, 80(1) # 8-byte Folded Spill
-; CHECK-NEXT: addi 3, 3, -2
+; CHECK-NEXT: lxv 7, 0(2)
 ; CHECK-NEXT: lxv 6, 0(19)
-; CHECK-NEXT: lxv 11, 0(7)
 ; CHECK-NEXT: std 5, 200(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 23, 40(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 6, 48(1) # 8-byte Folded Spill
 ; CHECK-NEXT: ld 5, 840(1)
-; CHECK-NEXT: lxv 12, 0(6)
-; CHECK-NEXT: rldicl 12, 3, 61, 3
+; CHECK-NEXT: lxv 37, 0(6)
+; CHECK-NEXT: std 17, 104(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 2, 112(1) # 8-byte Folded Spill
+; CHECK-NEXT: lxv 11, 0(7)
+; CHECK-NEXT: lxv 4, 0(21)
+; CHECK-NEXT: addi 3, 3, -2
+; CHECK-NEXT: lxv 38, 0(9)
 ; CHECK-NEXT: std 19, 120(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 20, 128(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 21, 136(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 24, 144(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 4, 0(21)
-; CHECK-NEXT: ld 25, 800(1)
 ; CHECK-NEXT: lxv 33, 0(10)
 ; CHECK-NEXT: lxv 32, 0(23)
-; CHECK-NEXT: lxv 36, 0(30)
-; CHECK-NEXT: std 16, 416(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 17, 424(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 17, 752(1)
-; CHECK-NEXT: ld 16, 744(1)
+; CHECK-NEXT: std 21, 136(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 24, 144(1) # 8-byte Folded Spill
+; CHECK-NEXT: lxv 13, 0(30)
+; CHECK-NEXT: lxv 10, 0(15)
 ; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 29, 712(1)
-; CHECK-NEXT: ld 28, 696(1)
+; CHECK-NEXT: ld 29, 720(1)
+; CHECK-NEXT: ld 28, 704(1)
+; CHECK-NEXT: rldicl 12, 3, 61, 3
 ; CHECK-NEXT: std 8, 56(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 9, 64(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 37, 0(28)
-; CHECK-NEXT: lxv 13, 0(29)
+; CHECK-NEXT: lxv 36, 0(28)
+; CHECK-NEXT: lxv 12, 0(29)
 ; CHECK-NEXT: mr 8, 29
 ; CHECK-NEXT: mr 9, 30
 ; CHECK-NEXT: mr 10, 28
 ; CHECK-NEXT: std 25, 152(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 26, 160(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 10, 0(15)
 ; CHECK-NEXT: lxv 9, 0(16)
+; CHECK-NEXT: lxv 8, 0(17)
 ; CHECK-NEXT: li 28, 1
 ; CHECK-NEXT: stfd 26, 544(1) # 8-byte Folded Spill
 ; CHECK-NEXT: stfd 27, 552(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 8, 0(17)
-; CHECK-NEXT: lxv 7, 0(2)
-; CHECK-NEXT: stfd 28, 560(1) # 8-byte Folded Spill
-; CHECK-NEXT: stfd 29, 568(1) # 8-byte Folded Spill
 ; CHECK-NEXT: lxv 5, 0(20)
 ; CHECK-NEXT: lxv 3, 0(24)
-; CHECK-NEXT: stfd 30, 576(1) # 8-byte Folded Spill
-; CHECK-NEXT: stfd 31, 584(1) # 8-byte Folded Spill
+; CHECK-NEXT: stfd 28, 560(1) # 8-byte Folded Spill
+; CHECK-NEXT: stfd 29, 568(1) # 8-byte Folded Spill
 ; CHECK-NEXT: lxv 2, 0(25)
 ; CHECK-NEXT: lxv 1, 0(26)
+; CHECK-NEXT: stfd 30, 576(1) # 8-byte Folded Spill
+; CHECK-NEXT: stfd 31, 584(1) # 8-byte Folded Spill
+; CHECK-NEXT: lxv 0, 0(27)
 ; CHECK-NEXT: stxv 52, 208(1) # 16-byte Folded Spill
 ; CHECK-NEXT: stxv 53, 224(1) # 16-byte Folded Spill
-; CHECK-NEXT: lxv 0, 0(27)
 ; CHECK-NEXT: stxv 54, 240(1) # 16-byte Folded Spill
 ; CHECK-NEXT: stxv 55, 256(1) # 16-byte Folded Spill
 ; CHECK-NEXT: stxv 56, 272(1) # 16-byte Folded Spill
-; CHECK-NEXT: stxv 57, 288(1) # 16-byte Folded Spill
-; CHECK-NEXT: stxv 58, 304(1) # 16-byte Folded Spill
 ; CHECK-NEXT: std 5, 192(1) # 8-byte Folded Spill
 ; CHECK-NEXT: ld 5, 832(1)
+; CHECK-NEXT: stxv 57, 288(1) # 16-byte Folded Spill
+; CHECK-NEXT: stxv 58, 304(1) # 16-byte Folded Spill
 ; CHECK-NEXT: stxv 59, 320(1) # 16-byte Folded Spill
 ; CHECK-NEXT: stxv 60, 336(1) # 16-byte Folded Spill
 ; CHECK-NEXT: stxv 61, 352(1) # 16-byte Folded Spill
@@ -161,8 +163,6 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
 ; CHECK-NEXT: stxv 63, 384(1) # 16-byte Folded Spill
 ; CHECK-NEXT: std 15, 88(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 16, 96(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 17, 104(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 2, 112(1) # 8-byte Folded Spill
 ; CHECK-NEXT: std 5, 184(1) # 8-byte Folded Spill
 ; CHECK-NEXT: ld 5, 824(1)
 ; CHECK-NEXT: std 5, 176(1) # 8-byte Folded Spill
@@ -171,17 +171,17 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
 ; CHECK-NEXT: li 27, 0
 ; CHECK-NEXT: ld 7, 176(1) # 8-byte Folded Reload
 ; CHECK-NEXT: mulli 6, 5, 40
-; CHECK-NEXT: sldi 0, 5, 4
 ; CHECK-NEXT: extswsli 14, 5, 3
+; CHECK-NEXT: sldi 0, 5, 4
+; CHECK-NEXT: mulli 2, 5, 48
 ; CHECK-NEXT: lxv 40, 0(7)
 ; CHECK-NEXT: ld 7, 184(1) # 8-byte Folded Reload
 ; CHECK-NEXT: add 31, 14, 22
 ; CHECK-NEXT: add 11, 0, 22
 ; CHECK-NEXT: mr 26, 22
+; CHECK-NEXT: addi 31, 31, 32
 ; CHECK-NEXT: addi 3, 11, 32
 ; CHECK-NEXT: addi 11, 12, 1
-; CHECK-NEXT: mulli 12, 5, 48
-; CHECK-NEXT: addi 31, 31, 32
 ; CHECK-NEXT: add 19, 22, 6
 ; CHECK-NEXT: sldi 6, 5, 5
 ; CHECK-NEXT: mulli 5, 5, 24
@@ -196,14 +196,14 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
 ; CHECK-NEXT: .LBB0_3: # %_loop_2_do_.lr.ph
 ; CHECK-NEXT: # =>This Loop Header: Depth=1
 ; CHECK-NEXT: # Child Loop BB0_4 Depth 2
-; CHECK-NEXT: maddld 5, 12, 27, 0
+; CHECK-NEXT: maddld 5, 2, 27, 0
 ; CHECK-NEXT: mr 6, 18
 ; CHECK-NEXT: mr 29, 21
 ; CHECK-NEXT: mr 30, 20
-; CHECK-NEXT: mr 2, 19
+; CHECK-NEXT: mr 12, 19
 ; CHECK-NEXT: mtctr 11
 ; CHECK-NEXT: add 25, 22, 5
-; CHECK-NEXT: maddld 5, 12, 27, 14
+; CHECK-NEXT: maddld 5, 2, 27, 14
 ; CHECK-NEXT: add 24, 22, 5
 ; CHECK-NEXT: mr 5, 26
 ; CHECK-NEXT: .p2align 5
@@ -218,14 +218,14 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
 ; CHECK-NEXT: lxvp 48, 0(25)
 ; CHECK-NEXT: lxvp 50, 0(29)
 ; CHECK-NEXT: lxvp 62, 0(30)
-; CHECK-NEXT: lxvp 60, 0(2)
+; CHECK-NEXT: lxvp 60, 0(12)
 ; CHECK-NEXT: lxvp 58, 32(6)
 ; CHECK-NEXT: lxvp 56, 32(5)
 ; CHECK-NEXT: lxvp 54, 32(24)
 ; CHECK-NEXT: lxvp 52, 32(25)
 ; CHECK-NEXT: lxvp 30, 32(29)
 ; CHECK-NEXT: lxvp 28, 32(30)
-; CHECK-NEXT: lxvp 26, 32(2)
+; CHECK-NEXT: lxvp 26, 32(12)
 ; CHECK-NEXT: xvmaddadp 33, 49, 35
 ; CHECK-NEXT: xvmaddadp 32, 51, 35
 ; CHECK-NEXT: xvmaddadp 37, 63, 35
@@ -254,17 +254,17 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
 ; CHECK-NEXT: addi 25, 25, 64
 ; CHECK-NEXT: addi 29, 29, 64
 ; CHECK-NEXT: addi 30, 30, 64
-; CHECK-NEXT: addi 2, 2, 64
+; CHECK-NEXT: addi 12, 12, 64
 ; CHECK-NEXT: bdnz .LBB0_4
 ; CHECK-NEXT: # %bb.5: # %_loop_2_endl_
 ; CHECK-NEXT: #
 ; CHECK-NEXT: addi 28, 28, 6
-; CHECK-NEXT: add 26, 26, 12
-; CHECK-NEXT: add 31, 31, 12
-; CHECK-NEXT: add 19, 19, 12
-; CHECK-NEXT: add 3, 3, 12
-; CHECK-NEXT: add 20, 20, 12
-; CHECK-NEXT: add 21, 21, 12
+; CHECK-NEXT: add 19, 19, 2
+; CHECK-NEXT: add 20, 20, 2
+; CHECK-NEXT: add 21, 21, 2
+; CHECK-NEXT: add 26, 26, 2
+; CHECK-NEXT: add 31, 31, 2
+; CHECK-NEXT: add 3, 3, 2
 ; CHECK-NEXT: addi 27, 27, 1
 ; CHECK-NEXT: cmpld 28, 4
 ; CHECK-NEXT: ble 0, .LBB0_3
@@ -312,13 +312,13 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
 ; CHECK-NEXT: ld 16, 416(1) # 8-byte Folded Reload
 ; CHECK-NEXT: stxv 32, 0(3)
 ; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 37, 0(10)
-; CHECK-NEXT: stxv 36, 0(9)
-; CHECK-NEXT: stxv 13, 0(8)
 ; CHECK-NEXT: ld 15, 408(1) # 8-byte Folded Reload
 ; CHECK-NEXT: ld 14, 400(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 12, 0(3)
+; CHECK-NEXT: stxv 37, 0(3)
 ; CHECK-NEXT: ld 3, 80(1) # 8-byte Folded Reload
+; CHECK-NEXT: stxv 36, 0(10)
+; CHECK-NEXT: stxv 13, 0(9)
+; CHECK-NEXT: stxv 12, 0(8)
 ; CHECK-NEXT: stxv 11, 0(3)
 ; CHECK-NEXT: ld 3, 88(1) # 8-byte Folded Reload
 ; CHECK-NEXT: stxv 10, 0(3)
diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
index 6136c321c08ca..625d53e4a10d4 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
@@ -54,28 +54,31 @@ define void @test2(ptr nocapture noundef %a, i32 noundef signext %n) {
 ; CHECK-NEXT: andi a2, a1, 1
 ; CHECK-NEXT: bne a1, a3, .LBB1_3
 ; CHECK-NEXT: # %bb.2:
-; CHECK-NEXT: li a3, 0
-; CHECK-NEXT: j .LBB1_5
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: bnez a2, .LBB1_6
+; CHECK-NEXT: j .LBB1_7
 ; CHECK-NEXT: .LBB1_3: # %for.body.preheader.new
 ; CHECK-NEXT: li a3, 0
 ; CHECK-NEXT: andi a1, a1, -2
+; CHECK-NEXT: neg a1, a1
 ; CHECK-NEXT: addi a4, a0, 4
 ; CHECK-NEXT: .LBB1_4: # %for.body
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: lw a5, -4(a4)
 ; CHECK-NEXT: lw a6, 0(a4)
-; CHECK-NEXT: addi a3, a3, 2
+; CHECK-NEXT: addi a3, a3, -2
 ; CHECK-NEXT: addi a5, a5, 4
 ; CHECK-NEXT: addi a6, a6, 4
 ; CHECK-NEXT: sw a5, -4(a4)
 ; CHECK-NEXT: sw a6, 0(a4)
 ; CHECK-NEXT: addi a4, a4, 8
 ; CHECK-NEXT: bne a1, a3, .LBB1_4
-; CHECK-NEXT: .LBB1_5: # %for.cond.cleanup.loopexit.unr-lcssa
+; CHECK-NEXT: # %bb.5: # %for.cond.cleanup.loopexit.unr-lcssa.loopexit
+; CHECK-NEXT: neg a1, a3
 ; CHECK-NEXT: beqz a2, .LBB1_7
-; CHECK-NEXT: # %bb.6: # %for.body.epil
-; CHECK-NEXT: slli a3, a3, 2
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: .LBB1_6: # %for.body.epil
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: add a0, a0, a1
 ; CHECK-NEXT: lw a1, 0(a0)
 ; CHECK-NEXT: addi a1, a1, 4
 ; CHECK-NEXT: sw a1, 0(a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 07aa05f609c40..2d82ca5e8e50c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -570,10 +570,10 @@ define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-NEXT: addi a5, a0, 12
 ; CHECK-NEXT: vlse32.v v9, (a5), a4
 ; CHECK-NEXT: addi a2, a2, -8
-; CHECK-NEXT: addi a1, a1, 512
+; CHECK-NEXT: addi a0, a0, 128
 ; CHECK-NEXT: vadd.vv v8, v9, v8
 ; CHECK-NEXT: vsse32.v v8, (a5), a4
-; CHECK-NEXT: addi a0, a0, 128
+; CHECK-NEXT: addi a1, a1, 512
 ; CHECK-NEXT: bnez a2, .LBB11_1
 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
index 7c6c7e90413b1..918411b06a3b8 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
@@ -663,14 +663,14 @@ define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapt
 ; CHECK-NEXT: bic r12, r3, #3
 ; CHECK-NEXT: movs r6, #1
 ; CHECK-NEXT: sub.w r7, r12, #4
-; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: mov r4, r1
+; CHECK-NEXT: mov r5, r0
 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
 ; CHECK-NEXT: mov r6, r2
 ; CHECK-NEXT: .LBB3_4: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r5], #16
-; CHECK-NEXT: vldrw.u32 q1, [r4], #16
+; CHECK-NEXT: vldrw.u32 q0, [r4], #16
+; CHECK-NEXT: vldrw.u32 q1, [r5], #16
 ; CHECK-NEXT: vcvt.f32.s32 q0, q0
 ; CHECK-NEXT: vmul.f32 q0, q1, q0
 ; CHECK-NEXT: vstrb.8 q0, [r6], #16
@@ -688,14 +688,14 @@ define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapt
 ; CHECK-NEXT: add.w r8, r12, r7
 ; CHECK-NEXT: wls lr, r7, .LBB3_10
 ; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader
-; CHECK-NEXT: add.w r6, r0, r12, lsl #2
-; CHECK-NEXT: add.w r7, r1, r12, lsl #2
+; CHECK-NEXT: add.w r6, r1, r12, lsl #2
+; CHECK-NEXT: add.w r7, r0, r12, lsl #2
 ; CHECK-NEXT: add.w r5, r2, r12, lsl #2
 ; CHECK-NEXT: mov r12, r8
 ; CHECK-NEXT: .LBB3_9: @ %for.body.prol
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr r4, [r7], #4
-; CHECK-NEXT: vldmia r6!, {s2}
+; CHECK-NEXT: ldr r4, [r6], #4
+; CHECK-NEXT: vldmia r7!, {s2}
 ; CHECK-NEXT: vmov s0, r4
 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
 ; CHECK-NEXT: vmul.f32 s0, s2, s0
@@ -718,10 +718,10 @@ define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapt
 ; CHECK-NEXT: vldr s0, [r1, #-8]
 ; CHECK-NEXT: adds r7, r0, r3
 ; CHECK-NEXT: adds r6, r2, r3
-; CHECK-NEXT: adds r0, #16
+; CHECK-NEXT: adds r2, #16
 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
 ; CHECK-NEXT: vldr s2, [r7]
-; CHECK-NEXT: adds r2, #16
+; CHECK-NEXT: adds r0, #16
 ; CHECK-NEXT: vmul.f32 s0, s2, s0
 ; CHECK-NEXT: vstr s0, [r6]
 ; CHECK-NEXT: vldr s0, [r1, #-4]
@@ -970,37 +970,37 @@ define arm_aapcs_vfpcc void @half_half_mul(ptr nocapture readonly %a, ptr nocapt
 ; CHECK-NEXT: bic r12, r3, #3
 ; CHECK-NEXT: movs r5, #1
 ; CHECK-NEXT: sub.w r6, r12, #4
-; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r4, r2
 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
-; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: mov r6, r1
 ; CHECK-NEXT: .LBB5_4: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr.w r9, [r4]
-; CHECK-NEXT: ldr r7, [r5]
-; CHECK-NEXT: ldr.w r8, [r4, #4]
+; CHECK-NEXT: ldr.w r9, [r5]
+; CHECK-NEXT: ldr r7, [r6]
+; CHECK-NEXT: ldr.w r8, [r5, #4]
 ; CHECK-NEXT: vmov.32 q0[0], r9
-; CHECK-NEXT: ldr.w r10, [r5, #4]
+; CHECK-NEXT: ldr.w r10, [r6, #4]
 ; CHECK-NEXT: vmov.32 q1[0], r7
 ; CHECK-NEXT: vmov.32 q0[1], r8
-; CHECK-NEXT: adds r4, #8
-; CHECK-NEXT: vmov.32 q1[1], r10
 ; CHECK-NEXT: adds r5, #8
+; CHECK-NEXT: vmov.32 q1[1], r10
+; CHECK-NEXT: adds r6, #8
 ; CHECK-NEXT: vmul.f16 q0, q0, q1
 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-NEXT: vstrb.8 q0, [r6], #16
+; CHECK-NEXT: vstrb.8 q0, [r4], #16
 ; CHECK-NEXT: le lr, .LBB5_4
 ; CHECK-NEXT: @ %bb.5: @ %middle.block
 ; CHECK-NEXT: cmp r12, r3
 ; CHECK-NEXT: beq .LBB5_8
 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader11
 ; CHECK-NEXT: sub.w lr, r3, r12
+; CHECK-NEXT: add.w r2, r2, r12, lsl #2
 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
-; CHECK-NEXT: add.w r2, r2, r12, lsl #2
 ; CHECK-NEXT: .LBB5_7: @ %for.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldr.16 s0, [r1]
@@ -1081,37 +1081,37 @@ define arm_aapcs_vfpcc void @half_half_add(ptr nocapture readonly %a, ptr nocapt
 ; CHECK-NEXT: bic r12, r3, #3
 ; CHECK-NEXT: movs r5, #1
 ; CHECK-NEXT: sub.w r6, r12, #4
-; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r4, r2
 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
-; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: mov r6, r1
 ; CHECK-NEXT: .LBB6_4: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr.w r9, [r4]
-; CHECK-NEXT: ldr r7, [r5]
-; CHECK-NEXT: ldr.w r8, [r4, #4]
+; CHECK-NEXT: ldr.w r9, [r5]
+; CHECK-NEXT: ldr r7, [r6]
+; CHECK-NEXT: ldr.w r8, [r5, #4]
 ; CHECK-NEXT: vmov.32 q0[0], r9
-; CHECK-NEXT: ldr.w r10, [r5, #4]
+; CHECK-NEXT: ldr.w r10, [r6, #4]
 ; CHECK-NEXT: vmov.32 q1[0], r7
 ; CHECK-NEXT: vmov.32 q0[1], r8
-; CHECK-NEXT: adds r4, #8
-; CHECK-NEXT: vmov.32 q1[1], r10
 ; CHECK-NEXT: adds r5, #8
+; CHECK-NEXT: vmov.32 q1[1], r10
+; CHECK-NEXT: adds r6, #8
 ; CHECK-NEXT: vadd.f16 q0, q0, q1
 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-NEXT: vstrb.8 q0, [r6], #16
+; CHECK-NEXT: vstrb.8 q0, [r4], #16
 ; CHECK-NEXT: le lr, .LBB6_4
 ; CHECK-NEXT: @ %bb.5: @ %middle.block
 ; CHECK-NEXT: cmp r12, r3
 ; CHECK-NEXT: beq .LBB6_8
 ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader11
 ; CHECK-NEXT: sub.w lr, r3, r12
+; CHECK-NEXT: add.w r2, r2, r12, lsl #2
 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
-; CHECK-NEXT: add.w r2, r2, r12, lsl #2
 ; CHECK-NEXT: .LBB6_7: @ %for.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldr.16 s0, [r1]
@@ -1192,37 +1192,37 @@ define arm_aapcs_vfpcc void @half_half_sub(ptr nocapture readonly %a, ptr nocapt
 ; CHECK-NEXT: bic r12, r3, #3
 ; CHECK-NEXT: movs r5, #1
 ; CHECK-NEXT: sub.w r6, r12, #4
-; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r4, r2
 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
-; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: mov r6, r1
 ; CHECK-NEXT: .LBB7_4: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr.w r9, [r4]
-; CHECK-NEXT: ldr r7, [r5]
-; CHECK-NEXT: ldr.w r8, [r4, #4]
+; CHECK-NEXT: ldr.w r9, [r5]
+; CHECK-NEXT: ldr r7, [r6]
+; CHECK-NEXT: ldr.w r8, [r5, #4]
 ; CHECK-NEXT: vmov.32 q0[0], r9
-; CHECK-NEXT: ldr.w r10, [r5, #4]
+; CHECK-NEXT: ldr.w r10, [r6, #4]
 ; CHECK-NEXT: vmov.32 q1[0], r7
 ; CHECK-NEXT: vmov.32 q0[1], r8
-; CHECK-NEXT: adds r4, #8
-; CHECK-NEXT: vmov.32 q1[1], r10
 ; CHECK-NEXT: adds r5, #8
+; CHECK-NEXT: vmov.32 q1[1], r10
+; CHECK-NEXT: adds r6, #8
 ; CHECK-NEXT: vsub.f16 q0, q0, q1
 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-NEXT: vstrb.8 q0, [r6], #16
+; CHECK-NEXT: vstrb.8 q0, [r4], #16
 ; CHECK-NEXT: le lr, .LBB7_4
 ; CHECK-NEXT: @ %bb.5: @ %middle.block
 ; CHECK-NEXT: cmp r12, r3
 ; CHECK-NEXT: beq .LBB7_8
 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader11
 ; CHECK-NEXT: sub.w lr, r3, r12
+; CHECK-NEXT: add.w r2, r2, r12, lsl #2
 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1
-; CHECK-NEXT: add.w r2, r2, r12, lsl #2
 ; CHECK-NEXT: .LBB7_7: @ %for.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldr.16 s0, [r1]
@@ -1777,8 +1777,8 @@ define arm_aapcs_vfpcc float @half_short_mac(ptr nocapture readonly %a, ptr noca
 ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
 ; CHECK-NEXT: wls lr, r12, .LBB11_9
 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
-; CHECK-NEXT: add.w r0, r0, r2, lsl #1
 ; CHECK-NEXT: add.w r1, r1, r2, lsl #1
+; CHECK-NEXT: add.w r0, r0, r2, lsl #1
 ; CHECK-NEXT: .LBB11_8: @ %for.body.epil
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ldrsh r2, [r1], #2
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index 8a5a15a57912c..ce2668250bbc7 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -921,33 +921,33 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly
 ; CHECK-NEXT: bic r7, r4, #3
 ; CHECK-NEXT: movs r6, #1
 ; CHECK-NEXT: subs r7, #4
-; CHECK-NEXT: add.w r5, r3, #8
+; CHECK-NEXT: add.w r5, r0, #8
 ; CHECK-NEXT: mov.w r8, #0
 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
-; CHECK-NEXT: add.w r6, r0, #8
+; CHECK-NEXT: add.w r6, r3, #8
 ; CHECK-NEXT: add.w r7, r1, #8
 ; CHECK-NEXT: .LBB9_7: @ %for.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr r9, [r6, #-8]
+; CHECK-NEXT: ldr r9, [r5, #-8]
 ; CHECK-NEXT: add.w r8, r8, #4
 ; CHECK-NEXT: ldr r4, [r7, #-8]
 ; CHECK-NEXT: mla r4, r4, r9, r2
-; CHECK-NEXT: str r4, [r5, #-8]
-; CHECK-NEXT: ldr r9, [r6, #-4]
+; CHECK-NEXT: str r4, [r6, #-8]
+; CHECK-NEXT: ldr r9, [r5, #-4]
 ; CHECK-NEXT: ldr r4, [r7, #-4]
 ; CHECK-NEXT: mla r4, r4, r9, r2
-; CHECK-NEXT: str r4, [r5, #-4]
-; CHECK-NEXT: ldr.w r9, [r6]
+; CHECK-NEXT: str r4, [r6, #-4]
+; CHECK-NEXT: ldr.w r9, [r5]
 ; CHECK-NEXT: ldr r4, [r7]
 ; CHECK-NEXT: mla r4, r4, r9, r2
-; CHECK-NEXT: str r4, [r5]
-; CHECK-NEXT: ldr.w r9, [r6, #4]
-; CHECK-NEXT: adds r6, #16
+; CHECK-NEXT: str r4, [r6]
+; CHECK-NEXT: ldr.w r9, [r5, #4]
+; CHECK-NEXT: adds r5, #16
 ; CHECK-NEXT: ldr r4, [r7, #4]
 ; CHECK-NEXT: adds r7, #16
 ; CHECK-NEXT: mla r4, r4, r9, r2
-; CHECK-NEXT: str r4, [r5, #4]
-; CHECK-NEXT: adds r5, #16
+; CHECK-NEXT: str r4, [r6, #4]
+; CHECK-NEXT: adds r6, #16
 ; CHECK-NEXT: le lr, .LBB9_7
 ; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa
 ; CHECK-NEXT: wls lr, r12, .LBB9_11
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index d076cb00ad7e0..7723f58e78aa8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -34,7 +34,7 @@ define i32 @test(i8 zeroext %var_2, i16 signext %var_15, ptr %arr_60) {
 ; CHECK-NEXT: @ Child Loop BB0_5 Depth 3
 ; CHECK-NEXT: @ Child Loop BB0_7 Depth 3
 ; CHECK-NEXT: @ Child Loop BB0_9 Depth 3
-; CHECK-NEXT: ldr.w r12, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr.w r11, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: mov.w r10, #0
 ; CHECK-NEXT: ldrd r8, r0, [sp] @ 8-byte Folded Reload
 ; CHECK-NEXT: .LBB0_4: @ %for.cond6.preheader
@@ -45,10 +45,10 @@ define i32 @test(i8 zeroext %var_2, i16 signext %var_15, ptr %arr_60) {
 ; CHECK-NEXT: @ Child Loop BB0_9 Depth 3
 ; CHECK-NEXT: movw r3, :lower16:arr_61
 ; CHECK-NEXT: movt r3, :upper16:arr_61
-; CHECK-NEXT: add.w r11, r3, #4
+; CHECK-NEXT: add.w r12, r3, #4
 ; CHECK-NEXT: movs r3, #11
 ; CHECK-NEXT: dls lr, r3
-; CHECK-NEXT: mov r4, r11
+; CHECK-NEXT: mov r4, r12
 ; CHECK-NEXT: mov r3, r8
 ; CHECK-NEXT: .LBB0_5: @ %for.body10
 ; CHECK-NEXT: @ Parent Loop BB0_3 Depth=1
@@ -65,10 +65,10 @@ define i32 @test(i8 zeroext %var_2, i16 signext %var_15, ptr %arr_60) {
 ; CHECK-NEXT: str r6, [r4]
 ; CHECK-NEXT: cset r6, ne
 ; CHECK-NEXT: strb r6, [r5]
-; CHECK-NEXT: add.w r2, r2, #792
-; CHECK-NEXT: ldrb r6, [r3]
 ; CHECK-NEXT: adds r4, #8
+; CHECK-NEXT: ldrb r6, [r3]
 ; CHECK-NEXT: adds r3, #2
+; CHECK-NEXT: add.w r2, r2, #792
 ; CHECK-NEXT: cmp r6, #0
 ; CHECK-NEXT: ite ne
 ; CHECK-NEXT: sxthne r6, r1
@@ -80,7 +80,7 @@ define i32 @test(i8 zeroext %var_2, i16 signext %var_15, ptr %arr_60) {
 ; CHECK-NEXT: @ %bb.6: @ %for.cond.cleanup9
 ; CHECK-NEXT: @ in Loop: Header=BB0_4 Depth=2
 ; CHECK-NEXT: movs r2, #11
-; CHECK-NEXT: mov r4, r11
+; CHECK-NEXT: mov r4, r12
 ; CHECK-NEXT: dls lr, r2
 ; CHECK-NEXT: mov r3, r0
 ; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
@@ -114,28 +114,28 @@ define i32 @test(i8 zeroext %var_2, i16 signext %var_15, ptr %arr_60) {
 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup9.1
 ; CHECK-NEXT: @ in Loop: Header=BB0_4 Depth=2
 ; CHECK-NEXT: movs r2, #11
-; CHECK-NEXT: mov r3, r12
 ; CHECK-NEXT: dls lr, r2
-; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: mov r2, r11
+; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT: .LBB0_9: @ %for.body10.2
 ; CHECK-NEXT: @ Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT: @ Parent Loop BB0_4 Depth=2
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: str r2, [r11, #-4]
-; CHECK-NEXT: add.w r6, r2, #396
-; CHECK-NEXT: ldrb r4, [r3, #-1]
-; CHECK-NEXT: add.w r2, r2, #792
+; CHECK-NEXT: str r3, [r12, #-4]
+; CHECK-NEXT: add.w r6, r3, #396
+; CHECK-NEXT: ldrb r4, [r2, #-1]
+; CHECK-NEXT: add.w r3, r3, #792
 ; CHECK-NEXT: cmp r4, #0
 ; CHECK-NEXT: ite ne
 ; CHECK-NEXT: sxthne r4, r1
 ; CHECK-NEXT: moveq r4, #0
 ; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: str.w r6, [r11]
+; CHECK-NEXT: str.w r6, [r12]
 ; CHECK-NEXT: cset r4, ne
-; CHECK-NEXT: add.w r11, r11, #8
+; CHECK-NEXT: add.w r12, r12, #8
 ; CHECK-NEXT: strb r4, [r5]
-; CHECK-NEXT: ldrb r4, [r3]
-; CHECK-NEXT: adds r3, #2
+; CHECK-NEXT: ldrb r4, [r2]
+; CHECK-NEXT: adds r2, #2
 ; CHECK-NEXT: cmp r4, #0
 ; CHECK-NEXT: ite ne
 ; CHECK-NEXT: sxthne r4, r1
@@ -147,7 +147,7 @@ define i32 @test(i8 zeroext %var_2, i16 signext %var_15, ptr %arr_60) {
 ; CHECK-NEXT: @ %bb.10: @ %for.cond.cleanup9.2
 ; CHECK-NEXT: @ in Loop: Header=BB0_4 Depth=2
 ; CHECK-NEXT: add.w r2, r10, #3
-; CHECK-NEXT: add.w r12, r12, #66
+; CHECK-NEXT: add.w r11, r11, #66
 ; CHECK-NEXT: adds r0, #66
 ; CHECK-NEXT: add.w r8, r8, #66
 ; CHECK-NEXT: uxtb.w r10, r2
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index c8dd949ca9d88..02e09e1d8c3ed 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -671,18 +671,18 @@ define dso_local void @test_nested(ptr noalias nocapture %pInT1, ptr noalias noc
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB14_2 Depth 2
 ; CHECK-NEXT: ldrh r4, [r1]
-; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: mov r6, r12
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: mov r6, r2
 ; CHECK-NEXT: vdup.16 q0, r4
-; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r4, r12
 ; CHECK-NEXT: .LBB14_2: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vldrw.u32 q1, [r5], #16
-; CHECK-NEXT: vldrw.u32 q2, [r4]
-; CHECK-NEXT: subs r6, #8
+; CHECK-NEXT: vldrw.u32 q1, [r6], #16
+; CHECK-NEXT: vldrw.u32 q2, [r5]
+; CHECK-NEXT: subs r4, #8
 ; CHECK-NEXT: vfms.f16 q2, q1, q0
-; CHECK-NEXT: vstrb.8 q2, [r4], #16
+; CHECK-NEXT: vstrb.8 q2, [r5], #16
 ; CHECK-NEXT: bne .LBB14_2
 ; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us
 ; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index 28166e455aba2..bff7e3b21548d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -671,18 +671,18 @@ define dso_local void @test_nested(ptr noalias nocapture %pInT1, ptr noalias noc ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 ; CHECK-NEXT: ldr r4, [r1] -; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r4, r12 ; CHECK-NEXT: .LBB14_2: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrw.u32 q1, [r5], #16 -; CHECK-NEXT: vldrw.u32 q2, [r4] -; CHECK-NEXT: subs r6, #4 +; CHECK-NEXT: vldrw.u32 q1, [r6], #16 +; CHECK-NEXT: vldrw.u32 q2, [r5] +; CHECK-NEXT: subs r4, #4 ; CHECK-NEXT: vfms.f32 q2, q1, q0 -; CHECK-NEXT: vstrb.8 q2, [r4], #16 +; CHECK-NEXT: vstrb.8 q2, [r5], #16 ; CHECK-NEXT: bne .LBB14_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll index da59cb259db61..eb5cedf8e4f86 100644 --- a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll @@ -550,11 +550,11 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: movw r6, :lower16:arr_20 ; CHECK-NEXT: movt r6, :upper16:arr_20 -; CHECK-NEXT: add.w r3, r6, #80 +; CHECK-NEXT: add.w r5, r6, #80 ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: movw r0, :lower16:arr_21 ; CHECK-NEXT: movt r0, :upper16:arr_21 -; CHECK-NEXT: add.w r5, r0, #36 +; CHECK-NEXT: add.w r3, r0, #36 ; CHECK-NEXT: add.w r11, r6, #128 ; CHECK-NEXT: add.w r7, r6, #112 ; CHECK-NEXT: add.w r2, r6, #96 @@ -572,11 +572,11 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: movt r6, :upper16:arr_20 ; CHECK-NEXT: .LBB19_3: @ %for.cond8.preheader ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r8, [r5, #-4] -; CHECK-NEXT: vstrh.16 q1, [r5, #-36] -; CHECK-NEXT: strh.w r9, [r5] -; CHECK-NEXT: vstrh.16 q1, [r5, #-20] -; CHECK-NEXT: vstrw.32 q0, [r3] +; CHECK-NEXT: str r8, [r3, #-4] +; CHECK-NEXT: vstrh.16 q1, [r3, #-36] +; CHECK-NEXT: strh.w r9, [r3] +; CHECK-NEXT: vstrh.16 q1, [r3, #-20] +; CHECK-NEXT: vstrw.32 q0, [r5] ; CHECK-NEXT: vstrh.16 q0, [r12], #152 ; CHECK-NEXT: vstrh.16 q0, [r6], #152 ; CHECK-NEXT: vstrh.16 q0, [r1], #152 @@ -585,9 +585,9 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: vstrh.16 q0, [r2], #152 ; CHECK-NEXT: vstrh.16 q0, [r7], #152 ; CHECK-NEXT: vstrh.16 q0, [r11], #152 -; CHECK-NEXT: strd r9, r10, [r3, #64] -; CHECK-NEXT: adds r5, #38 -; CHECK-NEXT: adds r3, #152 +; CHECK-NEXT: strd r9, r10, [r5, #64] +; CHECK-NEXT: adds r5, #152 +; CHECK-NEXT: adds r3, #38 ; CHECK-NEXT: le lr, .LBB19_3 ; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup6 ; CHECK-NEXT: movw r0, :lower16:arr_22 @@ -641,8 +641,8 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: vstrh.16 q0, [r12], #152 ; CHECK-NEXT: vstrh.16 q0, [r1], #152 ; CHECK-NEXT: strd r10, r11, [r3, #64] -; CHECK-NEXT: adds r0, #38 ; CHECK-NEXT: adds r3, #152 +; CHECK-NEXT: adds r0, #38 ; CHECK-NEXT: le lr, .LBB19_7 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup6.1 ; CHECK-NEXT: movw r0, :lower16:arr_22 @@ -698,8 +698,8 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: vstrh.16 q0, [r12], #152 ; 
CHECK-NEXT: vstrh.16 q0, [r4], #152 ; CHECK-NEXT: strd r10, r11, [r3, #64] -; CHECK-NEXT: adds r1, #38 ; CHECK-NEXT: adds r3, #152 +; CHECK-NEXT: adds r1, #38 ; CHECK-NEXT: le lr, .LBB19_11 ; CHECK-NEXT: @ %bb.12: @ %for.cond.cleanup6.2 ; CHECK-NEXT: movw r0, :lower16:arr_22 @@ -712,25 +712,25 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: vstrb.8 q1, [r0], #16 ; CHECK-NEXT: letp lr, .LBB19_13 ; CHECK-NEXT: .LBB19_14: @ %for.cond.cleanup6.2 -; CHECK-NEXT: movw r2, :lower16:arr_21 -; CHECK-NEXT: movw r1, #5508 -; CHECK-NEXT: movt r2, :upper16:arr_21 +; CHECK-NEXT: movw r1, :lower16:arr_21 +; CHECK-NEXT: movw r0, #5508 +; CHECK-NEXT: movt r1, :upper16:arr_21 ; CHECK-NEXT: movw r7, :lower16:arr_20 -; CHECK-NEXT: add r2, r1 -; CHECK-NEXT: movw r1, #22000 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: movw r1, #21968 ; CHECK-NEXT: movt r7, :upper16:arr_20 +; CHECK-NEXT: adds r2, r7, r1 +; CHECK-NEXT: movw r1, #22000 ; CHECK-NEXT: add.w r12, r7, r1 ; CHECK-NEXT: movw r1, #21984 ; CHECK-NEXT: add.w r8, r7, r1 ; CHECK-NEXT: movw r1, #21952 ; CHECK-NEXT: add.w r9, r7, r1 ; CHECK-NEXT: movw r1, #21936 -; CHECK-NEXT: movw r0, #21968 ; CHECK-NEXT: adds r5, r7, r1 ; CHECK-NEXT: movw r1, #21920 ; CHECK-NEXT: movw r3, #21904 ; CHECK-NEXT: adds r4, r7, r3 -; CHECK-NEXT: add r0, r7 ; CHECK-NEXT: add r1, r7 ; CHECK-NEXT: add.w r3, r7, #22016 ; CHECK-NEXT: add.w r6, r7, #21888 @@ -742,11 +742,11 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: mov.w r7, #327685 ; CHECK-NEXT: .LBB19_15: @ %for.cond8.preheader.3 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r7, [r2, #-4] -; CHECK-NEXT: vstrh.16 q1, [r2, #-36] -; CHECK-NEXT: strh.w r10, [r2] -; CHECK-NEXT: vstrh.16 q1, [r2, #-20] -; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: str r7, [r0, #-4] +; CHECK-NEXT: vstrh.16 q1, [r0, #-36] +; CHECK-NEXT: strh.w r10, [r0] +; CHECK-NEXT: vstrh.16 q1, [r0, #-20] +; CHECK-NEXT: vstrw.32 q0, [r2] ; CHECK-NEXT: vstrh.16 q0, [r4], #152 ; CHECK-NEXT: vstrh.16 q0, [r6], #152 ; CHECK-NEXT: vstrh.16 q0, [r1], #152 @@ -755,9 +755,9 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: vstrh.16 q0, [r8], #152 ; CHECK-NEXT: vstrh.16 q0, [r12], #152 ; CHECK-NEXT: vstrh.16 q0, [r3], #152 -; CHECK-NEXT: strd r10, r11, [r0, #64] -; CHECK-NEXT: adds r2, #38 -; CHECK-NEXT: adds r0, #152 +; CHECK-NEXT: strd r10, r11, [r2, #64] +; CHECK-NEXT: adds r2, #152 +; CHECK-NEXT: adds r0, #38 ; CHECK-NEXT: le lr, .LBB19_15 ; CHECK-NEXT: @ %bb.16: @ %for.cond.cleanup6.3 ; CHECK-NEXT: add sp, #12 diff --git a/llvm/test/CodeGen/WebAssembly/unrolled-mem-indices.ll b/llvm/test/CodeGen/WebAssembly/unrolled-mem-indices.ll index a232853688cb1..9bd04fdd9d6c3 100644 --- a/llvm/test/CodeGen/WebAssembly/unrolled-mem-indices.ll +++ b/llvm/test/CodeGen/WebAssembly/unrolled-mem-indices.ll @@ -341,100 +341,107 @@ define hidden void @runtime(ptr nocapture noundef readonly %arg, ptr nocapture n ; CHECK-NEXT: .local i32, i32, i32, i32, i32 ; CHECK-NEXT: # %bb.0: # %bb ; CHECK-NEXT: block -; CHECK-NEXT: local.get $push32=, 3 -; CHECK-NEXT: i32.eqz $push64=, $pop32 -; CHECK-NEXT: br_if 0, $pop64 # 0: down to label4 +; CHECK-NEXT: local.get $push35=, 3 +; CHECK-NEXT: i32.eqz $push69=, $pop35 +; CHECK-NEXT: br_if 0, $pop69 # 0: down to label4 ; CHECK-NEXT: # %bb.1: # %bb4 -; CHECK-NEXT: local.get $push34=, 3 +; CHECK-NEXT: local.get $push37=, 3 ; CHECK-NEXT: i32.const $push0=, 1 -; CHECK-NEXT: i32.and $push33=, $pop34, $pop0 -; CHECK-NEXT: local.set 4, $pop33 -; CHECK-NEXT: i32.const $push35=, 0 -; CHECK-NEXT: local.set 5, $pop35 +; 
CHECK-NEXT: i32.and $push36=, $pop37, $pop0 +; CHECK-NEXT: local.set 4, $pop36 +; CHECK-NEXT: i32.const $push38=, 0 +; CHECK-NEXT: local.set 5, $pop38 ; CHECK-NEXT: block -; CHECK-NEXT: local.get $push36=, 3 -; CHECK-NEXT: i32.const $push20=, 1 -; CHECK-NEXT: i32.eq $push1=, $pop36, $pop20 +; CHECK-NEXT: local.get $push39=, 3 +; CHECK-NEXT: i32.const $push21=, 1 +; CHECK-NEXT: i32.eq $push1=, $pop39, $pop21 ; CHECK-NEXT: br_if 0, $pop1 # 0: down to label5 ; CHECK-NEXT: # %bb.2: # %bb7 -; CHECK-NEXT: local.get $push38=, 3 -; CHECK-NEXT: i32.const $push2=, -2 -; CHECK-NEXT: i32.and $push37=, $pop38, $pop2 -; CHECK-NEXT: local.set 6, $pop37 -; CHECK-NEXT: i32.const $push39=, 0 -; CHECK-NEXT: local.set 5, $pop39 -; CHECK-NEXT: local.get $push40=, 0 -; CHECK-NEXT: local.set 3, $pop40 -; CHECK-NEXT: local.get $push41=, 1 -; CHECK-NEXT: local.set 7, $pop41 -; CHECK-NEXT: local.get $push42=, 2 -; CHECK-NEXT: local.set 8, $pop42 +; CHECK-NEXT: i32.const $push40=, 0 +; CHECK-NEXT: local.set 6, $pop40 +; CHECK-NEXT: i32.const $push23=, 0 +; CHECK-NEXT: local.get $push41=, 3 +; CHECK-NEXT: i32.const $push22=, -2 +; CHECK-NEXT: i32.and $push2=, $pop41, $pop22 +; CHECK-NEXT: i32.sub $push42=, $pop23, $pop2 +; CHECK-NEXT: local.set 7, $pop42 +; CHECK-NEXT: local.get $push43=, 0 +; CHECK-NEXT: local.set 3, $pop43 +; CHECK-NEXT: local.get $push44=, 1 +; CHECK-NEXT: local.set 5, $pop44 +; CHECK-NEXT: local.get $push45=, 2 +; CHECK-NEXT: local.set 8, $pop45 ; CHECK-NEXT: .LBB3_3: # %bb20 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: loop # label6: -; CHECK-NEXT: local.get $push45=, 8 -; CHECK-NEXT: local.get $push43=, 3 -; CHECK-NEXT: f32.load $push4=, 0($pop43) -; CHECK-NEXT: local.get $push44=, 7 -; CHECK-NEXT: f32.load $push3=, 0($pop44) +; CHECK-NEXT: local.get $push48=, 8 +; CHECK-NEXT: local.get $push46=, 3 +; CHECK-NEXT: f32.load $push4=, 0($pop46) +; CHECK-NEXT: local.get $push47=, 5 +; CHECK-NEXT: f32.load $push3=, 0($pop47) ; CHECK-NEXT: f32.add $push5=, $pop4, $pop3 -; CHECK-NEXT: f32.store 0($pop45), $pop5 -; CHECK-NEXT: local.get $push46=, 8 -; CHECK-NEXT: i32.const $push29=, 4 -; CHECK-NEXT: i32.add $push11=, $pop46, $pop29 -; CHECK-NEXT: local.get $push47=, 3 -; CHECK-NEXT: i32.const $push28=, 4 -; CHECK-NEXT: i32.add $push8=, $pop47, $pop28 +; CHECK-NEXT: f32.store 0($pop48), $pop5 +; CHECK-NEXT: local.get $push49=, 8 +; CHECK-NEXT: i32.const $push32=, 4 +; CHECK-NEXT: i32.add $push11=, $pop49, $pop32 +; CHECK-NEXT: local.get $push50=, 3 +; CHECK-NEXT: i32.const $push31=, 4 +; CHECK-NEXT: i32.add $push8=, $pop50, $pop31 ; CHECK-NEXT: f32.load $push9=, 0($pop8) -; CHECK-NEXT: local.get $push48=, 7 -; CHECK-NEXT: i32.const $push27=, 4 -; CHECK-NEXT: i32.add $push6=, $pop48, $pop27 +; CHECK-NEXT: local.get $push51=, 5 +; CHECK-NEXT: i32.const $push30=, 4 +; CHECK-NEXT: i32.add $push6=, $pop51, $pop30 ; CHECK-NEXT: f32.load $push7=, 0($pop6) ; CHECK-NEXT: f32.add $push10=, $pop9, $pop7 ; CHECK-NEXT: f32.store 0($pop11), $pop10 -; CHECK-NEXT: local.get $push50=, 3 -; CHECK-NEXT: i32.const $push26=, 8 -; CHECK-NEXT: i32.add $push49=, $pop50, $pop26 -; CHECK-NEXT: local.set 3, $pop49 -; CHECK-NEXT: local.get $push52=, 7 -; CHECK-NEXT: i32.const $push25=, 8 -; CHECK-NEXT: i32.add $push51=, $pop52, $pop25 -; CHECK-NEXT: local.set 7, $pop51 -; CHECK-NEXT: local.get $push54=, 8 -; CHECK-NEXT: i32.const $push24=, 8 -; CHECK-NEXT: i32.add $push53=, $pop54, $pop24 -; CHECK-NEXT: local.set 8, $pop53 -; CHECK-NEXT: local.get $push56=, 6 +; CHECK-NEXT: local.get $push53=, 3 +; CHECK-NEXT: 
i32.const $push29=, 8 +; CHECK-NEXT: i32.add $push52=, $pop53, $pop29 +; CHECK-NEXT: local.set 3, $pop52 ; CHECK-NEXT: local.get $push55=, 5 -; CHECK-NEXT: i32.const $push23=, 2 -; CHECK-NEXT: i32.add $push22=, $pop55, $pop23 -; CHECK-NEXT: local.tee $push21=, 5, $pop22 -; CHECK-NEXT: i32.ne $push12=, $pop56, $pop21 +; CHECK-NEXT: i32.const $push28=, 8 +; CHECK-NEXT: i32.add $push54=, $pop55, $pop28 +; CHECK-NEXT: local.set 5, $pop54 +; CHECK-NEXT: local.get $push57=, 8 +; CHECK-NEXT: i32.const $push27=, 8 +; CHECK-NEXT: i32.add $push56=, $pop57, $pop27 +; CHECK-NEXT: local.set 8, $pop56 +; CHECK-NEXT: local.get $push59=, 7 +; CHECK-NEXT: local.get $push58=, 6 +; CHECK-NEXT: i32.const $push26=, -2 +; CHECK-NEXT: i32.add $push25=, $pop58, $pop26 +; CHECK-NEXT: local.tee $push24=, 6, $pop25 +; CHECK-NEXT: i32.ne $push12=, $pop59, $pop24 ; CHECK-NEXT: br_if 0, $pop12 # 0: up to label6 -; CHECK-NEXT: .LBB3_4: # %bb9 +; CHECK-NEXT: # %bb.4: # %bb9.loopexit ; CHECK-NEXT: end_loop +; CHECK-NEXT: i32.const $push13=, 0 +; CHECK-NEXT: local.get $push61=, 6 +; CHECK-NEXT: i32.sub $push60=, $pop13, $pop61 +; CHECK-NEXT: local.set 5, $pop60 +; CHECK-NEXT: .LBB3_5: # %bb9 ; CHECK-NEXT: end_block # label5: -; CHECK-NEXT: local.get $push57=, 4 -; CHECK-NEXT: i32.eqz $push65=, $pop57 -; CHECK-NEXT: br_if 0, $pop65 # 0: down to label4 -; CHECK-NEXT: # %bb.5: # %bb12 -; CHECK-NEXT: local.get $push59=, 2 -; CHECK-NEXT: local.get $push58=, 5 -; CHECK-NEXT: i32.const $push13=, 2 -; CHECK-NEXT: i32.shl $push31=, $pop58, $pop13 -; CHECK-NEXT: local.tee $push30=, 3, $pop31 -; CHECK-NEXT: i32.add $push19=, $pop59, $pop30 -; CHECK-NEXT: local.get $push61=, 0 -; CHECK-NEXT: local.get $push60=, 3 -; CHECK-NEXT: i32.add $push16=, $pop61, $pop60 -; CHECK-NEXT: f32.load $push17=, 0($pop16) -; CHECK-NEXT: local.get $push63=, 1 -; CHECK-NEXT: local.get $push62=, 3 -; CHECK-NEXT: i32.add $push14=, $pop63, $pop62 -; CHECK-NEXT: f32.load $push15=, 0($pop14) -; CHECK-NEXT: f32.add $push18=, $pop17, $pop15 -; CHECK-NEXT: f32.store 0($pop19), $pop18 -; CHECK-NEXT: .LBB3_6: # %bb19 +; CHECK-NEXT: local.get $push62=, 4 +; CHECK-NEXT: i32.eqz $push70=, $pop62 +; CHECK-NEXT: br_if 0, $pop70 # 0: down to label4 +; CHECK-NEXT: # %bb.6: # %bb12 +; CHECK-NEXT: local.get $push64=, 2 +; CHECK-NEXT: local.get $push63=, 5 +; CHECK-NEXT: i32.const $push14=, 2 +; CHECK-NEXT: i32.shl $push34=, $pop63, $pop14 +; CHECK-NEXT: local.tee $push33=, 3, $pop34 +; CHECK-NEXT: i32.add $push20=, $pop64, $pop33 +; CHECK-NEXT: local.get $push66=, 0 +; CHECK-NEXT: local.get $push65=, 3 +; CHECK-NEXT: i32.add $push17=, $pop66, $pop65 +; CHECK-NEXT: f32.load $push18=, 0($pop17) +; CHECK-NEXT: local.get $push68=, 1 +; CHECK-NEXT: local.get $push67=, 3 +; CHECK-NEXT: i32.add $push15=, $pop68, $pop67 +; CHECK-NEXT: f32.load $push16=, 0($pop15) +; CHECK-NEXT: f32.add $push19=, $pop18, $pop16 +; CHECK-NEXT: f32.store 0($pop20), $pop19 +; CHECK-NEXT: .LBB3_7: # %bb19 ; CHECK-NEXT: end_block # label4: ; CHECK-NEXT: # fallthrough-return bb: diff --git a/llvm/test/CodeGen/X86/apx/check-nf-in-suppress-reloc-pass.ll b/llvm/test/CodeGen/X86/apx/check-nf-in-suppress-reloc-pass.ll index 3a2c954e37077..a34f9deae47ec 100644 --- a/llvm/test/CodeGen/X86/apx/check-nf-in-suppress-reloc-pass.ll +++ b/llvm/test/CodeGen/X86/apx/check-nf-in-suppress-reloc-pass.ll @@ -30,8 +30,8 @@ define fastcc void @foo(i32 %0, i1 %or.cond) nounwind { ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: movq %r14, %rdx ; CHECK-NEXT: callq *%r13 -; CHECK-NEXT: incq %rbp ; CHECK-NEXT: addq $20, %r14 +; 
CHECK-NEXT: incq %rbp ; CHECK-NEXT: .LBB0_1: # %for.body30 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: testb $1, %bl diff --git a/llvm/test/CodeGen/X86/avx512vnni-combine.ll b/llvm/test/CodeGen/X86/avx512vnni-combine.ll index b7d950e994241..6551bcb898242 100644 --- a/llvm/test/CodeGen/X86/avx512vnni-combine.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-combine.ll @@ -40,22 +40,25 @@ define <8 x i64> @foo_512(i32 %0, <8 x i64> %1, <8 x i64> %2, ptr %3) { ; CHECK-LABEL: foo_512: ; CHECK: # %bb.0: ; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: jle .LBB1_6 +; CHECK-NEXT: jle .LBB1_7 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: movl %edi, %edx ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: andl $3, %eax ; CHECK-NEXT: cmpl $4, %edi -; CHECK-NEXT: jae .LBB1_7 +; CHECK-NEXT: jae .LBB1_8 ; CHECK-NEXT: # %bb.2: ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: jmp .LBB1_3 -; CHECK-NEXT: .LBB1_7: +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jne .LBB1_5 +; CHECK-NEXT: jmp .LBB1_7 +; CHECK-NEXT: .LBB1_8: ; CHECK-NEXT: andl $-4, %edx +; CHECK-NEXT: negq %rdx ; CHECK-NEXT: leaq 192(%rsi), %rdi ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: .LBB1_9: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vpdpwssd -192(%rdi), %zmm1, %zmm0 ; CHECK-NEXT: vpmaddwd -128(%rdi), %zmm1, %zmm2 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 @@ -63,25 +66,26 @@ define <8 x i64> @foo_512(i32 %0, <8 x i64> %1, <8 x i64> %2, ptr %3) { ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vpmaddwd (%rdi), %zmm1, %zmm2 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; CHECK-NEXT: addq $4, %rcx +; CHECK-NEXT: addq $-4, %rcx ; CHECK-NEXT: addq $256, %rdi # imm = 0x100 ; CHECK-NEXT: cmpq %rcx, %rdx -; CHECK-NEXT: jne .LBB1_8 -; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: jne .LBB1_9 +; CHECK-NEXT: # %bb.3: # %.loopexit1 +; CHECK-NEXT: negq %rcx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: je .LBB1_6 -; CHECK-NEXT: # %bb.4: # %.preheader +; CHECK-NEXT: je .LBB1_7 +; CHECK-NEXT: .LBB1_5: # %.preheader +; CHECK-NEXT: shll $6, %eax ; CHECK-NEXT: shlq $6, %rcx ; CHECK-NEXT: addq %rcx, %rsi -; CHECK-NEXT: shll $6, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: .LBB1_6: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vpdpwssd (%rsi,%rcx), %zmm1, %zmm0 ; CHECK-NEXT: addq $64, %rcx ; CHECK-NEXT: cmpq %rcx, %rax -; CHECK-NEXT: jne .LBB1_5 -; CHECK-NEXT: .LBB1_6: +; CHECK-NEXT: jne .LBB1_6 +; CHECK-NEXT: .LBB1_7: ; CHECK-NEXT: retq %5 = icmp sgt i32 %0, 0 br i1 %5, label %6, label %33 @@ -166,21 +170,24 @@ define void @bar_512(i32 %0, ptr %1, <8 x i64> %2, ptr %3) { ; CHECK-LABEL: bar_512: ; CHECK: # %bb.0: ; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: jle .LBB2_5 +; CHECK-NEXT: jle .LBB2_6 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cmpl $1, %edi -; CHECK-NEXT: jne .LBB2_6 +; CHECK-NEXT: jne .LBB2_7 ; CHECK-NEXT: # %bb.2: ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: jmp .LBB2_3 -; CHECK-NEXT: .LBB2_6: +; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: jne .LBB2_5 +; CHECK-NEXT: jmp .LBB2_6 +; CHECK-NEXT: .LBB2_7: ; CHECK-NEXT: movl %eax, %edi ; CHECK-NEXT: andl $-2, %edi +; CHECK-NEXT: negq %rdi ; CHECK-NEXT: movl $64, %r8d ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: .LBB2_8: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqa64 (%rsi,%r8), %zmm1 ; CHECK-NEXT: vpmaddwd 
-64(%rdx,%r8), %zmm0, %zmm2 ; CHECK-NEXT: vpaddd -64(%rsi,%r8), %zmm2, %zmm2 @@ -188,19 +195,20 @@ define void @bar_512(i32 %0, ptr %1, <8 x i64> %2, ptr %3) { ; CHECK-NEXT: vpmaddwd (%rdx,%r8), %zmm0, %zmm2 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vmovdqa64 %zmm1, (%rsi,%r8) -; CHECK-NEXT: addq $2, %rcx +; CHECK-NEXT: addq $-2, %rcx ; CHECK-NEXT: subq $-128, %r8 ; CHECK-NEXT: cmpq %rcx, %rdi -; CHECK-NEXT: jne .LBB2_7 -; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: jne .LBB2_8 +; CHECK-NEXT: # %bb.3: # %.loopexit +; CHECK-NEXT: negq %rcx ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB2_5 -; CHECK-NEXT: # %bb.4: +; CHECK-NEXT: je .LBB2_6 +; CHECK-NEXT: .LBB2_5: ; CHECK-NEXT: shlq $6, %rcx ; CHECK-NEXT: vpmaddwd (%rdx,%rcx), %zmm0, %zmm0 ; CHECK-NEXT: vpaddd (%rsi,%rcx), %zmm0, %zmm0 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi,%rcx) -; CHECK-NEXT: .LBB2_5: +; CHECK-NEXT: .LBB2_6: ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %5 = icmp sgt i32 %0, 0 diff --git a/llvm/test/CodeGen/X86/avxvnni-combine.ll b/llvm/test/CodeGen/X86/avxvnni-combine.ll index 45f9a6475244e..737a4710af69a 100644 --- a/llvm/test/CodeGen/X86/avxvnni-combine.ll +++ b/llvm/test/CodeGen/X86/avxvnni-combine.ll @@ -45,22 +45,25 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { ; AVX-LABEL: foo_128: ; AVX: # %bb.0: ; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB1_6 +; AVX-NEXT: jle .LBB1_7 ; AVX-NEXT: # %bb.1: ; AVX-NEXT: movl %edi, %edx ; AVX-NEXT: movl %edx, %eax ; AVX-NEXT: andl $3, %eax ; AVX-NEXT: cmpl $4, %edi -; AVX-NEXT: jae .LBB1_7 +; AVX-NEXT: jae .LBB1_8 ; AVX-NEXT: # %bb.2: ; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB1_3 -; AVX-NEXT: .LBB1_7: +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: jne .LBB1_5 +; AVX-NEXT: jmp .LBB1_7 +; AVX-NEXT: .LBB1_8: ; AVX-NEXT: andl $-4, %edx +; AVX-NEXT: negq %rdx ; AVX-NEXT: leaq 48(%rsi), %rdi ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4 -; AVX-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: .LBB1_9: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0 ; AVX-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 @@ -68,46 +71,50 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: addq $4, %rcx +; AVX-NEXT: addq $-4, %rcx ; AVX-NEXT: addq $64, %rdi ; AVX-NEXT: cmpq %rcx, %rdx -; AVX-NEXT: jne .LBB1_8 -; AVX-NEXT: .LBB1_3: +; AVX-NEXT: jne .LBB1_9 +; AVX-NEXT: # %bb.3: # %.loopexit1 +; AVX-NEXT: negq %rcx ; AVX-NEXT: testq %rax, %rax -; AVX-NEXT: je .LBB1_6 -; AVX-NEXT: # %bb.4: # %.preheader +; AVX-NEXT: je .LBB1_7 +; AVX-NEXT: .LBB1_5: # %.preheader +; AVX-NEXT: shll $4, %eax ; AVX-NEXT: shlq $4, %rcx ; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: shll $4, %eax ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4 -; AVX-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: .LBB1_6: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0 ; AVX-NEXT: addq $16, %rcx ; AVX-NEXT: cmpq %rcx, %rax -; AVX-NEXT: jne .LBB1_5 -; AVX-NEXT: .LBB1_6: +; AVX-NEXT: jne .LBB1_6 +; AVX-NEXT: .LBB1_7: ; AVX-NEXT: retq ; ; AVX512-LABEL: foo_128: ; AVX512: # %bb.0: ; AVX512-NEXT: testl %edi, %edi -; AVX512-NEXT: jle .LBB1_6 +; AVX512-NEXT: jle .LBB1_7 ; AVX512-NEXT: # %bb.1: ; AVX512-NEXT: movl %edi, %edx ; AVX512-NEXT: movl %edx, %eax ; AVX512-NEXT: andl $3, %eax ; AVX512-NEXT: cmpl $4, %edi -; AVX512-NEXT: 
jae .LBB1_7 +; AVX512-NEXT: jae .LBB1_8 ; AVX512-NEXT: # %bb.2: ; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: jmp .LBB1_3 -; AVX512-NEXT: .LBB1_7: +; AVX512-NEXT: testq %rax, %rax +; AVX512-NEXT: jne .LBB1_5 +; AVX512-NEXT: jmp .LBB1_7 +; AVX512-NEXT: .LBB1_8: ; AVX512-NEXT: andl $-4, %edx +; AVX512-NEXT: negq %rdx ; AVX512-NEXT: leaq 48(%rsi), %rdi ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4 -; AVX512-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: .LBB1_9: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vpdpwssd -48(%rdi), %xmm1, %xmm0 ; AVX512-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2 ; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 @@ -115,25 +122,26 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { ; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2 ; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: addq $4, %rcx +; AVX512-NEXT: addq $-4, %rcx ; AVX512-NEXT: addq $64, %rdi ; AVX512-NEXT: cmpq %rcx, %rdx -; AVX512-NEXT: jne .LBB1_8 -; AVX512-NEXT: .LBB1_3: +; AVX512-NEXT: jne .LBB1_9 +; AVX512-NEXT: # %bb.3: # %.loopexit1 +; AVX512-NEXT: negq %rcx ; AVX512-NEXT: testq %rax, %rax -; AVX512-NEXT: je .LBB1_6 -; AVX512-NEXT: # %bb.4: # %.preheader +; AVX512-NEXT: je .LBB1_7 +; AVX512-NEXT: .LBB1_5: # %.preheader +; AVX512-NEXT: shll $4, %eax ; AVX512-NEXT: shlq $4, %rcx ; AVX512-NEXT: addq %rcx, %rsi -; AVX512-NEXT: shll $4, %eax ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4 -; AVX512-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: .LBB1_6: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vpdpwssd (%rsi,%rcx), %xmm1, %xmm0 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax -; AVX512-NEXT: jne .LBB1_5 -; AVX512-NEXT: .LBB1_6: +; AVX512-NEXT: jne .LBB1_6 +; AVX512-NEXT: .LBB1_7: ; AVX512-NEXT: retq %5 = icmp sgt i32 %0, 0 br i1 %5, label %6, label %33 @@ -212,21 +220,24 @@ define void @bar_128(i32 %0, ptr %1, <2 x i64> %2, ptr %3) { ; AVX-LABEL: bar_128: ; AVX: # %bb.0: ; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB2_5 +; AVX-NEXT: jle .LBB2_6 ; AVX-NEXT: # %bb.1: ; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmpl $1, %edi -; AVX-NEXT: jne .LBB2_6 +; AVX-NEXT: jne .LBB2_7 ; AVX-NEXT: # %bb.2: ; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB2_3 -; AVX-NEXT: .LBB2_6: +; AVX-NEXT: testb $1, %al +; AVX-NEXT: jne .LBB2_5 +; AVX-NEXT: jmp .LBB2_6 +; AVX-NEXT: .LBB2_7: ; AVX-NEXT: movl %eax, %edi ; AVX-NEXT: andl $-2, %edi +; AVX-NEXT: negq %rdi ; AVX-NEXT: movl $16, %r8d ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4 -; AVX-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: .LBB2_8: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: vmovdqa (%rsi,%r8), %xmm1 ; AVX-NEXT: vpmaddwd -16(%rdx,%r8), %xmm0, %xmm2 ; AVX-NEXT: vpaddd -16(%rsi,%r8), %xmm2, %xmm2 @@ -234,39 +245,43 @@ define void @bar_128(i32 %0, ptr %1, <2 x i64> %2, ptr %3) { ; AVX-NEXT: vpmaddwd (%rdx,%r8), %xmm0, %xmm2 ; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rsi,%r8) -; AVX-NEXT: addq $2, %rcx +; AVX-NEXT: addq $-2, %rcx ; AVX-NEXT: addq $32, %r8 ; AVX-NEXT: cmpq %rcx, %rdi -; AVX-NEXT: jne .LBB2_7 -; AVX-NEXT: .LBB2_3: +; AVX-NEXT: jne .LBB2_8 +; AVX-NEXT: # %bb.3: # %.loopexit +; AVX-NEXT: negq %rcx ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB2_5 -; AVX-NEXT: # %bb.4: -; AVX-NEXT: shlq $4, %rcx -; AVX-NEXT: vmovdqa (%rsi,%rcx), %xmm1 -; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rsi,%rcx) +; AVX-NEXT: je .LBB2_6 ; AVX-NEXT: 
.LBB2_5: +; AVX-NEXT: shlq $4, %rcx +; AVX-NEXT: vpmaddwd (%rdx,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpaddd (%rsi,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi,%rcx) +; AVX-NEXT: .LBB2_6: ; AVX-NEXT: retq ; ; AVX512-LABEL: bar_128: ; AVX512: # %bb.0: ; AVX512-NEXT: testl %edi, %edi -; AVX512-NEXT: jle .LBB2_5 +; AVX512-NEXT: jle .LBB2_6 ; AVX512-NEXT: # %bb.1: ; AVX512-NEXT: movl %edi, %eax ; AVX512-NEXT: cmpl $1, %edi -; AVX512-NEXT: jne .LBB2_6 +; AVX512-NEXT: jne .LBB2_7 ; AVX512-NEXT: # %bb.2: ; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: jmp .LBB2_3 -; AVX512-NEXT: .LBB2_6: +; AVX512-NEXT: testb $1, %al +; AVX512-NEXT: jne .LBB2_5 +; AVX512-NEXT: jmp .LBB2_6 +; AVX512-NEXT: .LBB2_7: ; AVX512-NEXT: movl %eax, %edi ; AVX512-NEXT: andl $-2, %edi +; AVX512-NEXT: negq %rdi ; AVX512-NEXT: movl $16, %r8d ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4 -; AVX512-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: .LBB2_8: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vmovdqa (%rsi,%r8), %xmm1 ; AVX512-NEXT: vpmaddwd -16(%rdx,%r8), %xmm0, %xmm2 ; AVX512-NEXT: vpaddd -16(%rsi,%r8), %xmm2, %xmm2 @@ -274,19 +289,20 @@ define void @bar_128(i32 %0, ptr %1, <2 x i64> %2, ptr %3) { ; AVX512-NEXT: vpmaddwd (%rdx,%r8), %xmm0, %xmm2 ; AVX512-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, (%rsi,%r8) -; AVX512-NEXT: addq $2, %rcx +; AVX512-NEXT: addq $-2, %rcx ; AVX512-NEXT: addq $32, %r8 ; AVX512-NEXT: cmpq %rcx, %rdi -; AVX512-NEXT: jne .LBB2_7 -; AVX512-NEXT: .LBB2_3: +; AVX512-NEXT: jne .LBB2_8 +; AVX512-NEXT: # %bb.3: # %.loopexit +; AVX512-NEXT: negq %rcx ; AVX512-NEXT: testb $1, %al -; AVX512-NEXT: je .LBB2_5 -; AVX512-NEXT: # %bb.4: +; AVX512-NEXT: je .LBB2_6 +; AVX512-NEXT: .LBB2_5: ; AVX512-NEXT: shlq $4, %rcx ; AVX512-NEXT: vpmaddwd (%rdx,%rcx), %xmm0, %xmm0 ; AVX512-NEXT: vpaddd (%rsi,%rcx), %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsi,%rcx) -; AVX512-NEXT: .LBB2_5: +; AVX512-NEXT: .LBB2_6: ; AVX512-NEXT: retq %5 = icmp sgt i32 %0, 0 br i1 %5, label %6, label %22 @@ -392,22 +408,25 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { ; AVX-LABEL: foo_256: ; AVX: # %bb.0: ; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB4_6 +; AVX-NEXT: jle .LBB4_7 ; AVX-NEXT: # %bb.1: ; AVX-NEXT: movl %edi, %edx ; AVX-NEXT: movl %edx, %eax ; AVX-NEXT: andl $3, %eax ; AVX-NEXT: cmpl $4, %edi -; AVX-NEXT: jae .LBB4_7 +; AVX-NEXT: jae .LBB4_8 ; AVX-NEXT: # %bb.2: ; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB4_3 -; AVX-NEXT: .LBB4_7: +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: jne .LBB4_5 +; AVX-NEXT: jmp .LBB4_7 +; AVX-NEXT: .LBB4_8: ; AVX-NEXT: andl $-4, %edx +; AVX-NEXT: negq %rdx ; AVX-NEXT: leaq 96(%rsi), %rdi ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4 -; AVX-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: .LBB4_9: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0 ; AVX-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2 ; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 @@ -415,46 +434,50 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { ; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2 ; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: addq $4, %rcx +; AVX-NEXT: addq $-4, %rcx ; AVX-NEXT: subq $-128, %rdi ; AVX-NEXT: cmpq %rcx, %rdx -; AVX-NEXT: jne .LBB4_8 -; AVX-NEXT: .LBB4_3: +; AVX-NEXT: jne .LBB4_9 +; AVX-NEXT: # %bb.3: # %.loopexit1 +; AVX-NEXT: negq %rcx ; AVX-NEXT: testq %rax, %rax -; AVX-NEXT: je .LBB4_6 -; AVX-NEXT: # %bb.4: # 
%.preheader +; AVX-NEXT: je .LBB4_7 +; AVX-NEXT: .LBB4_5: # %.preheader +; AVX-NEXT: shll $5, %eax ; AVX-NEXT: shlq $5, %rcx ; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: shll $5, %eax ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4 -; AVX-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: .LBB4_6: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0 ; AVX-NEXT: addq $32, %rcx ; AVX-NEXT: cmpq %rcx, %rax -; AVX-NEXT: jne .LBB4_5 -; AVX-NEXT: .LBB4_6: +; AVX-NEXT: jne .LBB4_6 +; AVX-NEXT: .LBB4_7: ; AVX-NEXT: retq ; ; AVX512-LABEL: foo_256: ; AVX512: # %bb.0: ; AVX512-NEXT: testl %edi, %edi -; AVX512-NEXT: jle .LBB4_6 +; AVX512-NEXT: jle .LBB4_7 ; AVX512-NEXT: # %bb.1: ; AVX512-NEXT: movl %edi, %edx ; AVX512-NEXT: movl %edx, %eax ; AVX512-NEXT: andl $3, %eax ; AVX512-NEXT: cmpl $4, %edi -; AVX512-NEXT: jae .LBB4_7 +; AVX512-NEXT: jae .LBB4_8 ; AVX512-NEXT: # %bb.2: ; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: jmp .LBB4_3 -; AVX512-NEXT: .LBB4_7: +; AVX512-NEXT: testq %rax, %rax +; AVX512-NEXT: jne .LBB4_5 +; AVX512-NEXT: jmp .LBB4_7 +; AVX512-NEXT: .LBB4_8: ; AVX512-NEXT: andl $-4, %edx +; AVX512-NEXT: negq %rdx ; AVX512-NEXT: leaq 96(%rsi), %rdi ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4 -; AVX512-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: .LBB4_9: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vpdpwssd -96(%rdi), %ymm1, %ymm0 ; AVX512-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2 ; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 @@ -462,25 +485,26 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { ; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2 ; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: addq $4, %rcx +; AVX512-NEXT: addq $-4, %rcx ; AVX512-NEXT: subq $-128, %rdi ; AVX512-NEXT: cmpq %rcx, %rdx -; AVX512-NEXT: jne .LBB4_8 -; AVX512-NEXT: .LBB4_3: +; AVX512-NEXT: jne .LBB4_9 +; AVX512-NEXT: # %bb.3: # %.loopexit1 +; AVX512-NEXT: negq %rcx ; AVX512-NEXT: testq %rax, %rax -; AVX512-NEXT: je .LBB4_6 -; AVX512-NEXT: # %bb.4: # %.preheader +; AVX512-NEXT: je .LBB4_7 +; AVX512-NEXT: .LBB4_5: # %.preheader +; AVX512-NEXT: shll $5, %eax ; AVX512-NEXT: shlq $5, %rcx ; AVX512-NEXT: addq %rcx, %rsi -; AVX512-NEXT: shll $5, %eax ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4 -; AVX512-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: .LBB4_6: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vpdpwssd (%rsi,%rcx), %ymm1, %ymm0 ; AVX512-NEXT: addq $32, %rcx ; AVX512-NEXT: cmpq %rcx, %rax -; AVX512-NEXT: jne .LBB4_5 -; AVX512-NEXT: .LBB4_6: +; AVX512-NEXT: jne .LBB4_6 +; AVX512-NEXT: .LBB4_7: ; AVX512-NEXT: retq %5 = icmp sgt i32 %0, 0 br i1 %5, label %6, label %33 @@ -566,21 +590,24 @@ define void @bar_256(i32 %0, ptr %1, <4 x i64> %2, ptr %3) { ; AVX-LABEL: bar_256: ; AVX: # %bb.0: ; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB5_5 +; AVX-NEXT: jle .LBB5_6 ; AVX-NEXT: # %bb.1: ; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmpl $1, %edi -; AVX-NEXT: jne .LBB5_6 +; AVX-NEXT: jne .LBB5_7 ; AVX-NEXT: # %bb.2: ; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB5_3 -; AVX-NEXT: .LBB5_6: +; AVX-NEXT: testb $1, %al +; AVX-NEXT: jne .LBB5_5 +; AVX-NEXT: jmp .LBB5_6 +; AVX-NEXT: .LBB5_7: ; AVX-NEXT: movl %eax, %edi ; AVX-NEXT: andl $-2, %edi +; AVX-NEXT: negq %rdi ; AVX-NEXT: movl $32, %r8d ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4 -; AVX-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: .LBB5_8: # =>This Inner Loop 
Header: Depth=1 ; AVX-NEXT: vmovdqa (%rsi,%r8), %ymm1 ; AVX-NEXT: vpmaddwd -32(%rdx,%r8), %ymm0, %ymm2 ; AVX-NEXT: vpaddd -32(%rsi,%r8), %ymm2, %ymm2 @@ -588,40 +615,44 @@ define void @bar_256(i32 %0, ptr %1, <4 x i64> %2, ptr %3) { ; AVX-NEXT: vpmaddwd (%rdx,%r8), %ymm0, %ymm2 ; AVX-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vmovdqa %ymm1, (%rsi,%r8) -; AVX-NEXT: addq $2, %rcx +; AVX-NEXT: addq $-2, %rcx ; AVX-NEXT: addq $64, %r8 ; AVX-NEXT: cmpq %rcx, %rdi -; AVX-NEXT: jne .LBB5_7 -; AVX-NEXT: .LBB5_3: +; AVX-NEXT: jne .LBB5_8 +; AVX-NEXT: # %bb.3: # %.loopexit +; AVX-NEXT: negq %rcx ; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB5_5 -; AVX-NEXT: # %bb.4: -; AVX-NEXT: shlq $5, %rcx -; AVX-NEXT: vmovdqa (%rsi,%rcx), %ymm1 -; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %ymm0, %ymm1 -; AVX-NEXT: vmovdqa %ymm1, (%rsi,%rcx) +; AVX-NEXT: je .LBB5_6 ; AVX-NEXT: .LBB5_5: +; AVX-NEXT: shlq $5, %rcx +; AVX-NEXT: vpmaddwd (%rdx,%rcx), %ymm0, %ymm0 +; AVX-NEXT: vpaddd (%rsi,%rcx), %ymm0, %ymm0 +; AVX-NEXT: vmovdqa %ymm0, (%rsi,%rcx) +; AVX-NEXT: .LBB5_6: ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: bar_256: ; AVX512: # %bb.0: ; AVX512-NEXT: testl %edi, %edi -; AVX512-NEXT: jle .LBB5_5 +; AVX512-NEXT: jle .LBB5_6 ; AVX512-NEXT: # %bb.1: ; AVX512-NEXT: movl %edi, %eax ; AVX512-NEXT: cmpl $1, %edi -; AVX512-NEXT: jne .LBB5_6 +; AVX512-NEXT: jne .LBB5_7 ; AVX512-NEXT: # %bb.2: ; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: jmp .LBB5_3 -; AVX512-NEXT: .LBB5_6: +; AVX512-NEXT: testb $1, %al +; AVX512-NEXT: jne .LBB5_5 +; AVX512-NEXT: jmp .LBB5_6 +; AVX512-NEXT: .LBB5_7: ; AVX512-NEXT: movl %eax, %edi ; AVX512-NEXT: andl $-2, %edi +; AVX512-NEXT: negq %rdi ; AVX512-NEXT: movl $32, %r8d ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4 -; AVX512-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: .LBB5_8: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vmovdqa (%rsi,%r8), %ymm1 ; AVX512-NEXT: vpmaddwd -32(%rdx,%r8), %ymm0, %ymm2 ; AVX512-NEXT: vpaddd -32(%rsi,%r8), %ymm2, %ymm2 @@ -629,19 +660,20 @@ define void @bar_256(i32 %0, ptr %1, <4 x i64> %2, ptr %3) { ; AVX512-NEXT: vpmaddwd (%rdx,%r8), %ymm0, %ymm2 ; AVX512-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX512-NEXT: vmovdqa %ymm1, (%rsi,%r8) -; AVX512-NEXT: addq $2, %rcx +; AVX512-NEXT: addq $-2, %rcx ; AVX512-NEXT: addq $64, %r8 ; AVX512-NEXT: cmpq %rcx, %rdi -; AVX512-NEXT: jne .LBB5_7 -; AVX512-NEXT: .LBB5_3: +; AVX512-NEXT: jne .LBB5_8 +; AVX512-NEXT: # %bb.3: # %.loopexit +; AVX512-NEXT: negq %rcx ; AVX512-NEXT: testb $1, %al -; AVX512-NEXT: je .LBB5_5 -; AVX512-NEXT: # %bb.4: +; AVX512-NEXT: je .LBB5_6 +; AVX512-NEXT: .LBB5_5: ; AVX512-NEXT: shlq $5, %rcx ; AVX512-NEXT: vpmaddwd (%rdx,%rcx), %ymm0, %ymm0 ; AVX512-NEXT: vpaddd (%rsi,%rcx), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi,%rcx) -; AVX512-NEXT: .LBB5_5: +; AVX512-NEXT: .LBB5_6: ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %5 = icmp sgt i32 %0, 0 diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll index bf6b09674e187..4e7df799d527b 100644 --- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll +++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll @@ -96,6 +96,17 @@ entry: define void @_Z2x6v() local_unnamed_addr { ; CHECK-LABEL: _Z2x6v: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax +; CHECK-NEXT: movl (%rax), %eax +; CHECK-NEXT: andl $511, %eax # imm = 0x1FF +; CHECK-NEXT: leaq 1(%rax), %rdx +; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx +; CHECK-NEXT: movl %edx, (%rcx) +; 
CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx +; CHECK-NEXT: movl (%rcx), %ecx +; CHECK-NEXT: testl %ecx, %ecx +; CHECK-NEXT: je .LBB1_18 +; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r15 @@ -114,32 +125,21 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax -; CHECK-NEXT: movl (%rax), %ebx -; CHECK-NEXT: andl $511, %ebx # imm = 0x1FF -; CHECK-NEXT: leaq 1(%rbx), %rax -; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx -; CHECK-NEXT: movl %eax, (%rcx) -; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx -; CHECK-NEXT: movl (%rcx), %ecx -; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: je .LBB1_18 -; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph -; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rdx -; CHECK-NEXT: movq (%rdx), %rsi -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: notl %edx -; CHECK-NEXT: leaq 8(,%rdx,8), %rdi -; CHECK-NEXT: imulq %rax, %rdi +; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rsi +; CHECK-NEXT: movq (%rsi), %rsi +; CHECK-NEXT: movl %ecx, %edi +; CHECK-NEXT: notl %edi +; CHECK-NEXT: leaq 8(,%rdi,8), %rdi +; CHECK-NEXT: imulq %rdx, %rdi ; CHECK-NEXT: addq %rsi, %rdi ; CHECK-NEXT: movq x2@GOTPCREL(%rip), %r8 ; CHECK-NEXT: movl (%r8), %edx -; CHECK-NEXT: leal 8(,%rbx,8), %eax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: leaq 8(%rsi), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: leal 8(,%rax,8), %r9d +; CHECK-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: leaq 8(%rsi), %r9 +; CHECK-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: leaq 32(%rsi), %r11 -; CHECK-NEXT: leaq 8(,%rbx,8), %rbx +; CHECK-NEXT: leaq 8(,%rax,8), %rbx ; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: movq x0@GOTPCREL(%rip), %r15 ; CHECK-NEXT: movq %rsi, %r12 @@ -195,8 +195,8 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: ja .LBB1_14 ; CHECK-NEXT: .LBB1_7: # %vector.body.preheader ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: leaq -4(%rdx), %rax -; CHECK-NEXT: btl $2, %eax +; CHECK-NEXT: leaq -4(%rdx), %r10 +; CHECK-NEXT: btl $2, %r10d ; CHECK-NEXT: jb .LBB1_8 ; CHECK-NEXT: # %bb.9: # %vector.body.prol.preheader ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 @@ -204,22 +204,21 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; CHECK-NEXT: movdqu %xmm0, (%r12,%r13,8) ; CHECK-NEXT: movdqu %xmm0, 16(%r12,%r13,8) -; CHECK-NEXT: movl $4, %r10d -; CHECK-NEXT: shrq $2, %rax +; CHECK-NEXT: movl $4, %eax +; CHECK-NEXT: shrq $2, %r10 ; CHECK-NEXT: jne .LBB1_11 ; CHECK-NEXT: jmp .LBB1_13 ; CHECK-NEXT: .LBB1_8: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: xorl %r10d, %r10d -; CHECK-NEXT: shrq $2, %rax +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: shrq $2, %r10 ; CHECK-NEXT: je .LBB1_13 ; CHECK-NEXT: .LBB1_11: # %vector.body.preheader.new ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: leaq (%rax,%r13), %r9 +; CHECK-NEXT: leaq (%r11,%r9,8), %r10 ; CHECK-NEXT: subq %rdx, %rax -; CHECK-NEXT: addq %r13, %r10 -; CHECK-NEXT: leaq (%r11,%r10,8), %r10 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_12: # %vector.body ; CHECK-NEXT: # Parent Loop BB1_2 Depth=1 @@ -253,7 +252,6 @@ define 
void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: movq %rdi, (%rax) ; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rax ; CHECK-NEXT: movl $0, (%rax) -; CHECK-NEXT: .LBB1_18: # %for.end5 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 @@ -266,6 +264,13 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_restore %rbx +; CHECK-NEXT: .cfi_restore %r12 +; CHECK-NEXT: .cfi_restore %r13 +; CHECK-NEXT: .cfi_restore %r14 +; CHECK-NEXT: .cfi_restore %r15 +; CHECK-NEXT: .cfi_restore %rbp +; CHECK-NEXT: .LBB1_18: # %for.end5 ; CHECK-NEXT: retq entry: %0 = load i32, ptr @x1, align 4 diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce7.ll b/llvm/test/CodeGen/X86/loop-strength-reduce7.ll index 11473bacc56cf..c47f48d3bb929 100644 --- a/llvm/test/CodeGen/X86/loop-strength-reduce7.ll +++ b/llvm/test/CodeGen/X86/loop-strength-reduce7.ll @@ -11,14 +11,14 @@ define fastcc void @outer_loop(ptr nocapture %gfp, ptr nocapture %xr, i32 %targ_ ; CHECK-LABEL: outer_loop: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: movl $88, %eax -; CHECK-NEXT: movl $168, %ecx +; CHECK-NEXT: movl $168, %eax +; CHECK-NEXT: movl $88, %ecx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_2: ## %bb28.i37 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB0_3 Depth 2 ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: movl %ecx, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_3: ## %bb29.i38 ; CHECK-NEXT: ## Parent Loop BB0_2 Depth=1 @@ -29,8 +29,8 @@ define fastcc void @outer_loop(ptr nocapture %gfp, ptr nocapture %xr, i32 %targ_ ; CHECK-NEXT: jbe LBB0_3 ; CHECK-NEXT: ## %bb.1: ## %bb28.i37.loopexit ; CHECK-NEXT: ## in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: addl $4, %eax -; CHECK-NEXT: addl $168, %ecx +; CHECK-NEXT: addl $168, %eax +; CHECK-NEXT: addl $4, %ecx ; CHECK-NEXT: jmp LBB0_2 entry: br label %bb4 diff --git a/llvm/test/CodeGen/X86/masked-iv-safe.ll b/llvm/test/CodeGen/X86/masked-iv-safe.ll index a4f5e52a27d8a..a9f5c78de92c6 100644 --- a/llvm/test/CodeGen/X86/masked-iv-safe.ll +++ b/llvm/test/CodeGen/X86/masked-iv-safe.ll @@ -53,7 +53,7 @@ return: define void @count_down(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_down: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl $80, %eax +; CHECK-NEXT: movl $10, %eax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] @@ -64,8 +64,8 @@ define void @count_down(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: mulsd %xmm0, %xmm3 ; CHECK-NEXT: mulsd %xmm1, %xmm3 ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, (%rdi,%rax) -; CHECK-NEXT: addq $-8, %rax +; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) +; CHECK-NEXT: decq %rax ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq @@ -147,7 +147,7 @@ return: define void @count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_down_signed: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl $80, %eax +; CHECK-NEXT: movl $10, %eax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] @@ -158,8 +158,8 @@ define void @count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: mulsd %xmm0, %xmm3 ; CHECK-NEXT: mulsd %xmm1, %xmm3 ; CHECK-NEXT: mulsd 
%xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, (%rdi,%rax) -; CHECK-NEXT: addq $-8, %rax +; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) +; CHECK-NEXT: decq %rax ; CHECK-NEXT: jne .LBB3_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq @@ -195,7 +195,7 @@ return: define void @another_count_up(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_up: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq $-8, %rax +; CHECK-NEXT: movq $-1, %rax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] @@ -204,14 +204,14 @@ define void @another_count_up(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm0, %xmm3 -; CHECK-NEXT: movsd %xmm3, 2048(%rdi,%rax) +; CHECK-NEXT: movsd %xmm3, 2048(%rdi,%rax,8) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm1, %xmm3 -; CHECK-NEXT: movsd %xmm3, 134217728(%rdi,%rax) +; CHECK-NEXT: movsd %xmm3, 134217728(%rdi,%rax,8) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, (%rdi,%rax) -; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) +; CHECK-NEXT: incq %rax ; CHECK-NEXT: jne .LBB4_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq @@ -299,7 +299,7 @@ return: define void @another_count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_up_signed: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq $-8, %rax +; CHECK-NEXT: movq $-1, %rax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] @@ -310,8 +310,8 @@ define void @another_count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: mulsd %xmm0, %xmm3 ; CHECK-NEXT: divsd %xmm1, %xmm3 ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, (%rdi,%rax) -; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) +; CHECK-NEXT: incq %rax ; CHECK-NEXT: jne .LBB6_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll index 283c00e17f21a..c724b6a73c34f 100644 --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -84,11 +84,11 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload +; CHECK-NEXT: addl %esi, %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: addl $2, %esi ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4 @@ -118,9 +118,9 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; CHECK-NEXT: incl %edi ; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; CHECK-NEXT: jl LBB0_9 ; CHECK-NEXT: LBB0_13: ## %bb20 @@ 
-517,11 +517,11 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT: addl $2, %edx
 ; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: xorl %edx, %edx
 ; CHECK-NEXT: xorl %ebx, %ebx
 ; CHECK-NEXT: .p2align 4
@@ -551,9 +551,9 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
 ; CHECK-NEXT: incl %ebx
 ; CHECK-NEXT: addl %ebp, %ecx
+; CHECK-NEXT: addl %ebp, %eax
 ; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload
 ; CHECK-NEXT: addl $2, %edx
-; CHECK-NEXT: addl %ebp, %eax
 ; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; CHECK-NEXT: jb LBB1_9
 ; CHECK-NEXT: LBB1_13: ## %bb20
diff --git a/llvm/test/CodeGen/X86/pr42565.ll b/llvm/test/CodeGen/X86/pr42565.ll
index 10071064057b8..65c1c0b17ba7e 100644
--- a/llvm/test/CodeGen/X86/pr42565.ll
+++ b/llvm/test/CodeGen/X86/pr42565.ll
@@ -5,16 +5,16 @@ define void @HUF_writeCTable_wksp() {
 ; CHECK-LABEL: HUF_writeCTable_wksp:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl $2, %eax
-; CHECK-NEXT: movb $-2, %cl
+; CHECK-NEXT: movb $-2, %al
+; CHECK-NEXT: movl $2, %ecx
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: .LBB0_1: # %for.body
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: leal 1(%rcx), %edx
+; CHECK-NEXT: leal 1(%rax), %edx
 ; CHECK-NEXT: movb %dl, (%rax)
-; CHECK-NEXT: movb %cl, (%rax)
-; CHECK-NEXT: leaq 2(%rax), %rax
-; CHECK-NEXT: addb $-2, %cl
+; CHECK-NEXT: movb %al, (%rcx)
+; CHECK-NEXT: addb $-2, %al
+; CHECK-NEXT: leaq 2(%rcx), %rcx
 ; CHECK-NEXT: jmp .LBB0_1
 entry:
 br label %for.body
diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
index c24823538aa14..123585983cbc8 100644
--- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -195,7 +195,7 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT: movl $2, %ebp
 ; CHECK-NEXT: jmp LBB0_20
-; CHECK-NEXT: LBB0_32: ## %if.end517.loopexitsplit
+; CHECK-NEXT: LBB0_32: ## %do.body479.backedge.if.end517.loopexit_crit_edge
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT: incq %rbx
 ; CHECK-NEXT: LBB0_33: ## %if.end517
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index aa954aeb0ad07..b59b8a0d25f52 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -154,66 +154,26 @@ for.exit:
 }
 
 define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) #0 {
-; BASE-LABEL: mixed_offsets_scalable_then_fixed:
-; BASE: // %bb.0: // %entry
-; BASE-NEXT: incb x0, all, mul #4
-; BASE-NEXT: ptrue p0.s
-; BASE-NEXT: mov x8, #8 // =0x8
-; BASE-NEXT: .LBB3_1: // %for.body
-; BASE-NEXT: // =>This Inner Loop Header: Depth=1
-; BASE-NEXT: ldr z0, [x0, #-4, mul vl]
-; BASE-NEXT: ldr z1, [x0]
-; BASE-NEXT: decw x2
-; BASE-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2]
-; BASE-NEXT: incb x0
-; BASE-NEXT: add z0.s, z0.s, z1.s
-; BASE-NEXT: add z0.s, z0.s, z2.s
-; BASE-NEXT: str z0, [x1]
-; BASE-NEXT: incb x1
-; BASE-NEXT: cbnz x2, .LBB3_1
-; BASE-NEXT: // %bb.2: // %for.exit
-; BASE-NEXT: ret
-;
-; PREINDEX-LABEL: mixed_offsets_scalable_then_fixed:
-; PREINDEX: // %bb.0: // %entry
-; PREINDEX-NEXT: incb x0, all, mul #4
-; PREINDEX-NEXT: ptrue p0.s
-; PREINDEX-NEXT: mov x8, #8 // =0x8
-; PREINDEX-NEXT: .LBB3_1: // %for.body
-; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1
-; PREINDEX-NEXT: ldr z0, [x0, #-4, mul vl]
-; PREINDEX-NEXT: ldr z1, [x0]
-; PREINDEX-NEXT: decw x2
-; PREINDEX-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2]
-; PREINDEX-NEXT: incb x0
-; PREINDEX-NEXT: add z0.s, z0.s, z1.s
-; PREINDEX-NEXT: add z0.s, z0.s, z2.s
-; PREINDEX-NEXT: str z0, [x1]
-; PREINDEX-NEXT: incb x1
-; PREINDEX-NEXT: cbnz x2, .LBB3_1
-; PREINDEX-NEXT: // %bb.2: // %for.exit
-; PREINDEX-NEXT: ret
-;
-; POSTINDEX-LABEL: mixed_offsets_scalable_then_fixed:
-; POSTINDEX: // %bb.0: // %entry
-; POSTINDEX-NEXT: incb x0, all, mul #4
-; POSTINDEX-NEXT: ptrue p0.s
-; POSTINDEX-NEXT: mov x8, xzr
-; POSTINDEX-NEXT: mov x9, #8 // =0x8
-; POSTINDEX-NEXT: .LBB3_1: // %for.body
-; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1
-; POSTINDEX-NEXT: ldr z0, [x0, #-4, mul vl]
-; POSTINDEX-NEXT: ldr z1, [x0]
-; POSTINDEX-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
-; POSTINDEX-NEXT: incb x0
-; POSTINDEX-NEXT: add z0.s, z0.s, z1.s
-; POSTINDEX-NEXT: add z0.s, z0.s, z2.s
-; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; POSTINDEX-NEXT: incw x8
-; POSTINDEX-NEXT: cmp x2, x8
-; POSTINDEX-NEXT: b.ne .LBB3_1
-; POSTINDEX-NEXT: // %bb.2: // %for.exit
-; POSTINDEX-NEXT: ret
+; COMMON-LABEL: mixed_offsets_scalable_then_fixed:
+; COMMON: // %bb.0: // %entry
+; COMMON-NEXT: incb x0, all, mul #4
+; COMMON-NEXT: ptrue p0.s
+; COMMON-NEXT: mov x8, xzr
+; COMMON-NEXT: mov x9, #8 // =0x8
+; COMMON-NEXT: .LBB3_1: // %for.body
+; COMMON-NEXT: // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT: ldr z0, [x0, #-4, mul vl]
+; COMMON-NEXT: ldr z1, [x0]
+; COMMON-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
+; COMMON-NEXT: incb x0
+; COMMON-NEXT: add z0.s, z0.s, z1.s
+; COMMON-NEXT: add z0.s, z0.s, z2.s
+; COMMON-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; COMMON-NEXT: incw x8
+; COMMON-NEXT: cmp x2, x8
+; COMMON-NEXT: b.ne .LBB3_1
+; COMMON-NEXT: // %bb.2: // %for.exit
+; COMMON-NEXT: ret
 entry:
 %vscale = tail call i64 @llvm.vscale.i64()
 %mul = shl nuw nsw i64 %vscale, 4
@@ -297,58 +257,24 @@ for.exit:
 ;; base (but in range of the other).
 ;;
 define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 {
-; BASE-LABEL: three_access_wide_gap:
-; BASE: // %bb.0: // %entry
-; BASE-NEXT: .LBB5_1: // %for.body
-; BASE-NEXT: // =>This Inner Loop Header: Depth=1
-; BASE-NEXT: ldr z0, [x0]
-; BASE-NEXT: ldr z1, [x0, #4, mul vl]
-; BASE-NEXT: decw x2
-; BASE-NEXT: ldr z2, [x0, #8, mul vl]
-; BASE-NEXT: incb x0
-; BASE-NEXT: add z0.s, z0.s, z1.s
-; BASE-NEXT: add z0.s, z0.s, z2.s
-; BASE-NEXT: str z0, [x1]
-; BASE-NEXT: incb x1
-; BASE-NEXT: cbnz x2, .LBB5_1
-; BASE-NEXT: // %bb.2: // %for.exit
-; BASE-NEXT: ret
-;
-; PREINDEX-LABEL: three_access_wide_gap:
-; PREINDEX: // %bb.0: // %entry
-; PREINDEX-NEXT: .LBB5_1: // %for.body
-; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1
-; PREINDEX-NEXT: ldr z0, [x0]
-; PREINDEX-NEXT: ldr z1, [x0, #4, mul vl]
-; PREINDEX-NEXT: decw x2
-; PREINDEX-NEXT: ldr z2, [x0, #8, mul vl]
-; PREINDEX-NEXT: incb x0
-; PREINDEX-NEXT: add z0.s, z0.s, z1.s
-; PREINDEX-NEXT: add z0.s, z0.s, z2.s
-; PREINDEX-NEXT: str z0, [x1]
-; PREINDEX-NEXT: incb x1
-; PREINDEX-NEXT: cbnz x2, .LBB5_1
-; PREINDEX-NEXT: // %bb.2: // %for.exit
-; PREINDEX-NEXT: ret
-;
-; POSTINDEX-LABEL: three_access_wide_gap:
-; POSTINDEX: // %bb.0: // %entry
-; POSTINDEX-NEXT: ptrue p0.s
-; POSTINDEX-NEXT: mov x8, xzr
-; POSTINDEX-NEXT: .LBB5_1: // %for.body
-; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1
-; POSTINDEX-NEXT: ldr z0, [x0]
-; POSTINDEX-NEXT: ldr z1, [x0, #4, mul vl]
-; POSTINDEX-NEXT: ldr z2, [x0, #8, mul vl]
-; POSTINDEX-NEXT: incb x0
-; POSTINDEX-NEXT: add z0.s, z0.s, z1.s
-; POSTINDEX-NEXT: add z0.s, z0.s, z2.s
-; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
-; POSTINDEX-NEXT: incw x8
-; POSTINDEX-NEXT: cmp x2, x8
-; POSTINDEX-NEXT: b.ne .LBB5_1
-; POSTINDEX-NEXT: // %bb.2: // %for.exit
-; POSTINDEX-NEXT: ret
+; COMMON-LABEL: three_access_wide_gap:
+; COMMON: // %bb.0: // %entry
+; COMMON-NEXT: ptrue p0.s
+; COMMON-NEXT: mov x8, xzr
+; COMMON-NEXT: .LBB5_1: // %for.body
+; COMMON-NEXT: // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT: ldr z0, [x0]
+; COMMON-NEXT: ldr z1, [x0, #4, mul vl]
+; COMMON-NEXT: ldr z2, [x0, #8, mul vl]
+; COMMON-NEXT: incb x0
+; COMMON-NEXT: add z0.s, z0.s, z1.s
+; COMMON-NEXT: add z0.s, z0.s, z2.s
+; COMMON-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; COMMON-NEXT: incw x8
+; COMMON-NEXT: cmp x2, x8
+; COMMON-NEXT: b.ne .LBB5_1
+; COMMON-NEXT: // %bb.2: // %for.exit
+; COMMON-NEXT: ret
 entry:
 %vscale = tail call i64 @llvm.vscale.i64()
 %mul = mul nuw nsw i64 %vscale, 16
@@ -428,3 +354,7 @@ for.exit:
 }
 
 attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; BASE: {{.*}}
+; POSTINDEX: {{.*}}
+; PREINDEX: {{.*}}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
index f2c00b095495c..b406e7ad2f5a9 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
@@ -5,16 +5,16 @@
 ; OPT-LABEL: @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(
 ; OPT: .lr.ph.preheader:
-; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65532
+; OPT: %scevgep = getelementptr i8, ptr addrspace(3) %arg1, i32 65532
 ; OPT: br label %.lr.ph
 ; OPT: .lr.ph:
-; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
-; OPT: %lsr.iv1 = phi ptr addrspace(3) [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
-; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
-; OPT: %tmp4 = atomicrmw add ptr addrspace(3) %lsr.iv3, i32 undef seq_cst, align 4
-; OPT: %tmp7 = atomicrmw add ptr addrspace(3) %lsr.iv1, i32 undef seq_cst, align 4
-; OPT: %0 = atomicrmw add ptr addrspace(3) %lsr.iv1, i32 %tmp8 seq_cst, align 4
-; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 4
+; OPT: %lsr.iv4 = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %lsr.iv2 = phi ptr addrspace(3) [ %scevgep3, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi ptr addrspace(3) [ %scevgep1, %.lr.ph ], [ %scevgep, %.lr.ph.preheader ]
+; OPT: %tmp4 = atomicrmw add ptr addrspace(3) %lsr.iv, i32 undef seq_cst, align 4
+; OPT: %tmp7 = atomicrmw add ptr addrspace(3) %lsr.iv2, i32 undef seq_cst, align 4
+; OPT: %0 = atomicrmw add ptr addrspace(3) %lsr.iv2, i32 %tmp8 seq_cst, align 4
+; OPT: %scevgep3 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 4
 ; OPT: br i1 %exitcond
 
 define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(3) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
@@ -47,14 +47,14 @@ bb:
 ; OPT-LABEL: test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(
 ; OPT: .lr.ph.preheader:
-; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65532
+; OPT: %scevgep = getelementptr i8, ptr addrspace(3) %arg1, i32 65532
 ; OPT: br label %.lr.ph
 ; OPT: .lr.ph:
-; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
-; OPT: %lsr.iv1 = phi ptr addrspace(3) [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
-; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
-; OPT: %tmp4 = cmpxchg ptr addrspace(3) %lsr.iv3, i32 undef, i32 undef seq_cst monotonic, align 4
-; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 4
+; OPT: %lsr.iv4 = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %lsr.iv2 = phi ptr addrspace(3) [ %scevgep3, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi ptr addrspace(3) [ %scevgep1, %.lr.ph ], [ %scevgep, %.lr.ph.preheader ]
+; OPT: %tmp4 = cmpxchg ptr addrspace(3) %lsr.iv, i32 undef, i32 undef seq_cst monotonic, align 4
+; OPT: %scevgep3 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 4
 
 define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(3) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
 %tmp = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
index 54233686feb87..67fca52e4ef5d 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
@@ -5,12 +5,12 @@
 ; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_i32(
 ; OPT: .lr.ph.preheader:
-; OPT: %scevgep2 = getelementptr i8, ptr addrspace(1) %arg1, i64 4095
+; OPT: %scevgep = getelementptr i8, ptr addrspace(1) %arg1, i64 4095
 ; OPT: br label %.lr.ph
 ; OPT: {{^}}.lr.ph:
-; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
-; OPT: load i8, ptr addrspace(1) %lsr.iv3, align 1
-; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
+; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: load i8, ptr addrspace(1) %lsr.iv1, align 1
+; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 4
 
 define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
 %tmp = icmp sgt i32 %n, 0
@@ -43,12 +43,12 @@ bb:
 ; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_p1_i32(
 ; OPT: {{^}}.lr.ph.preheader:
-; OPT: %scevgep2 = getelementptr i8, ptr addrspace(1) %arg1, i64 4096
+; OPT: %scevgep = getelementptr i8, ptr addrspace(1) %arg1, i64 4096
 ; OPT: br label %.lr.ph
 ; OPT: {{^}}.lr.ph:
-; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
-; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
+; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 4
 
 define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
 %tmp = icmp sgt i32 %n, 0
@@ -81,12 +81,12 @@ bb:
 ; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_i32(
 ; OPT: .lr.ph.preheader:
-; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65535
+; OPT: %scevgep = getelementptr i8, ptr addrspace(3) %arg1, i32 65535
 ; OPT: br label %.lr.ph
 ; OPT: {{^}}.lr.ph
-; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
-; OPT: %tmp4 = load i8, ptr addrspace(3) %lsr.iv3, align 1
-; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 1
+; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %tmp4 = load i8, ptr addrspace(3) %lsr.iv1, align 1
+; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 4
 
 define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
 %tmp = icmp sgt i32 %n, 0
@@ -120,12 +120,12 @@ bb:
 ; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_p1_i32(
 ; OPT: {{^}}.lr.ph.preheader:
-; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65536
+; OPT: %scevgep = getelementptr i8, ptr addrspace(3) %arg1, i32 65536
 ; OPT: br label %.lr.ph
 ; OPT: {{^}}.lr.ph:
-; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
-; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 1
+; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 4
 
 define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
 %tmp = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
index 810f5f27be076..8e2bd4d038ad0 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
@@ -10,10 +10,10 @@ define amdgpu_kernel void @local_cmp_user(i32 %arg0) nounwind {
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[ARG0:%.*]], 1
 ; CHECK-NEXT: br label [[BB11:%.*]]
 ; CHECK: bb11:
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i32 [ [[LSR_IV_NEXT2:%.*]], [[BB:%.*]] ], [ -2, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[BB]] ], [ [[TMP0]], [[ENTRY]] ]
-; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[BB:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i32 [ [[LSR_IV_NEXT2:%.*]], [[BB]] ], [ -2, [[ENTRY]] ]
 ; CHECK-NEXT: [[LSR_IV_NEXT2]] = add i32 [[LSR_IV1]], 2
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1
 ; CHECK-NEXT: [[C0:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
 ; CHECK-NEXT: br i1 [[C0]], label [[BB13:%.*]], label [[BB]]
 ; CHECK: bb:
@@ -50,10 +50,10 @@ define amdgpu_kernel void @global_cmp_user(i64 %arg0) nounwind {
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[ARG0:%.*]], 1
 ; CHECK-NEXT: br label [[BB11:%.*]]
 ; CHECK: bb11:
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2:%.*]], [[BB:%.*]] ], [ -2, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[BB]] ], [ [[TMP0]], [[ENTRY]] ]
-; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[BB:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2:%.*]], [[BB]] ], [ -2, [[ENTRY]] ]
 ; CHECK-NEXT: [[LSR_IV_NEXT2]] = add i64 [[LSR_IV1]], 2
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
 ; CHECK-NEXT: [[C0:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
 ; CHECK-NEXT: br i1 [[C0]], label [[BB13:%.*]], label [[BB]]
 ; CHECK: bb:
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/complexity.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/complexity.ll
index 1b64ade50f219..0f9092765985a 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/ARM/complexity.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/complexity.ll
@@ -4,8 +4,8 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 ; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s
 
 ; CHECK-LABEL: for.body12.us.us:
-; CHECK: [[LSR_IV6:%[^ ]+]] = phi ptr [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ]
 ; CHECK: phi i32
+; CHECK: [[LSR_IV6:%[^ ]+]] = phi ptr [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ]
 ; CHECK: [[LSR_IV:%[^ ]+]] = phi ptr [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ]
 ; CHECK: phi i32
 ; CHECK: [[SCEVGEP1]] = getelementptr i8, ptr [[LSR_IV]], i32 8
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/illegal-addr-modes.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/illegal-addr-modes.ll
index 3844e00ae0a4f..1dfd2812f2eca 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/ARM/illegal-addr-modes.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/illegal-addr-modes.ll
@@ -113,8 +113,7 @@ define void @negativeFourCase(ptr %ptr1, ptr %ptr2) nounwind {
 ; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[C1_0104_US_I_I]], i32 [[LSR_IV]]
 ; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i32 4
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SCEVGEP5]], align 4
-; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[C1_0104_US_I_I]], i32 [[LSR_IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SCEVGEP3]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SCEVGEP4]], align 4
 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i32 [[LSR_IV]]
 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SCEVGEP2]], align 4
 ; CHECK-NEXT: br label [[IF_END37_US_I_I]]
diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/many-geps.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/many-geps.ll
index 4914bb72d8945..c52232c1e9c99 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/RISCV/many-geps.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/many-geps.ll
@@ -20,17 +20,17 @@ define i32 @main() {
 ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[CALL]], align 4
 ; CHECK-NEXT: ret i32 0
 ; CHECK: [[BB2]]:
-; CHECK-NEXT: [[LSR_IV30:%.*]] = phi i64 [ [[LSR_IV_NEXT31:%.*]], %[[BB2]] ], [ 8, [[BB:%.*]] ]
-; CHECK-NEXT: [[LSR_IV27:%.*]] = phi i64 [ [[LSR_IV_NEXT28:%.*]], %[[BB2]] ], [ 12, [[BB]] ]
-; CHECK-NEXT: [[LSR_IV24:%.*]] = phi i64 [ [[LSR_IV_NEXT25:%.*]], %[[BB2]] ], [ 16, [[BB]] ]
-; CHECK-NEXT: [[LSR_IV21:%.*]] = phi i64 [ [[LSR_IV_NEXT22:%.*]], %[[BB2]] ], [ 20, [[BB]] ]
-; CHECK-NEXT: [[LSR_IV18:%.*]] = phi i64 [ [[LSR_IV_NEXT19:%.*]], %[[BB2]] ], [ 24, [[BB]] ]
-; CHECK-NEXT: [[LSR_IV15:%.*]] = phi i64 [ [[LSR_IV_NEXT16:%.*]], %[[BB2]] ], [ 28, [[BB]] ]
-; CHECK-NEXT: [[LSR_IV12:%.*]] = phi i64 [ [[LSR_IV_NEXT13:%.*]], %[[BB2]] ], [ 32, [[BB]] ]
-; CHECK-NEXT: [[LSR_IV9:%.*]] = phi i64 [ [[LSR_IV_NEXT10:%.*]], %[[BB2]] ], [ 36, [[BB]] ]
-; CHECK-NEXT: [[LSR_IV4:%.*]] = phi i64 [ [[LSR_IV_NEXT5:%.*]], %[[BB2]] ], [ 40, [[BB]] ]
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2:%.*]], %[[BB2]] ], [ 48, [[BB]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[BB2]] ], [ 72, [[BB]] ]
+; CHECK-NEXT: [[LSR_IV30:%.*]] = phi i64 [ [[LSR_IV_NEXT31:%.*]], %[[BB2]] ], [ 8, [[TMP0:%.*]] ]
+; CHECK-NEXT: [[LSR_IV27:%.*]] = phi i64 [ [[LSR_IV_NEXT28:%.*]], %[[BB2]] ], [ 12, [[TMP0]] ]
+; CHECK-NEXT: [[LSR_IV24:%.*]] = phi i64 [ [[LSR_IV_NEXT25:%.*]], %[[BB2]] ], [ 16, [[TMP0]] ]
+; CHECK-NEXT: [[LSR_IV21:%.*]] = phi i64 [ [[LSR_IV_NEXT22:%.*]], %[[BB2]] ], [ 20, [[TMP0]] ]
+; CHECK-NEXT: [[LSR_IV18:%.*]] = phi i64 [ [[LSR_IV_NEXT19:%.*]], %[[BB2]] ], [ 24, [[TMP0]] ]
+; CHECK-NEXT: [[LSR_IV15:%.*]] = phi i64 [ [[LSR_IV_NEXT16:%.*]], %[[BB2]] ], [ 28, [[TMP0]] ]
+; CHECK-NEXT: [[LSR_IV12:%.*]] = phi i64 [ [[LSR_IV_NEXT13:%.*]], %[[BB2]] ], [ 32, [[TMP0]] ]
+; CHECK-NEXT: [[LSR_IV9:%.*]] = phi i64 [ [[LSR_IV_NEXT10:%.*]], %[[BB2]] ], [ 36, [[TMP0]] ]
+; CHECK-NEXT: [[LSR_IV4:%.*]] = phi i64 [ [[LSR_IV_NEXT5:%.*]], %[[BB2]] ], [ 48, [[TMP0]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2:%.*]], %[[BB2]] ], [ 72, [[TMP0]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[BB2]] ], [ 40, [[TMP0]] ]
 ; CHECK-NEXT: [[SCEVGEP32:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV30]]
 ; CHECK-NEXT: store i32 0, ptr [[SCEVGEP32]], align 8
 ; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV27]]
@@ -47,13 +47,13 @@ define i32 @main() {
 ; CHECK-NEXT: store i32 0, ptr [[SCEVGEP14]], align 8
 ; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV9]]
 ; CHECK-NEXT: store i32 0, ptr [[SCEVGEP11]], align 4
-; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV4]]
+; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV]]
 ; CHECK-NEXT: store i64 0, ptr [[SCEVGEP6]], align 8
-; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV1]]
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV4]]
 ; CHECK-NEXT: store i32 0, ptr [[SCEVGEP3]], align 8
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV1]]
 ; CHECK-NEXT: store i32 0, ptr [[SCEVGEP]], align 8
-; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV4]]
+; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV]]
 ; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[SCEVGEP7]], i64 40
 ; CHECK-NEXT: store i64 0, ptr [[SCEVGEP8]], align 8
 ; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], 88
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
index c12d8135e5eba..2d75d175e8169 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -92,10 +92,10 @@ define i32 @user(ptr %a, ptr %b, i32 %x) nounwind {
 ; X64-LABEL: user:
 ; X64: # %bb.0: # %entry
 ; X64-NEXT: movslq %edx, %rcx
-; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: shlq $4, %rdx
 ; X64-NEXT: leaq (,%rcx,4), %rax
-; X64-NEXT: leaq (%rax,%rax,2), %r8
+; X64-NEXT: leaq (%rax,%rax,2), %rdx
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: shlq $4, %r8
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: .p2align 4
 ; X64-NEXT: .LBB1_1: # %loop
@@ -103,9 +103,9 @@ define i32 @user(ptr %a, ptr %b, i32 %x) nounwind {
 ; X64-NEXT: addl (%rdi), %eax
 ; X64-NEXT: addl (%rdi,%rcx,4), %eax
 ; X64-NEXT: addl (%rdi,%rcx,8), %eax
-; X64-NEXT: addl (%rdi,%r8), %eax
+; X64-NEXT: addl (%rdi,%rdx), %eax
 ; X64-NEXT: movl %eax, (%rdi)
-; X64-NEXT: addq %rdx, %rdi
+; X64-NEXT: addq %r8, %rdi
 ; X64-NEXT: cmpq %rsi, %rdi
 ; X64-NEXT: jne .LBB1_1
 ; X64-NEXT: # %bb.2: # %exit
@@ -119,10 +119,10 @@ define i32 @user(ptr %a, ptr %b, i32 %x) nounwind {
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %ecx, %edi
-; X32-NEXT: shll $4, %edi
 ; X32-NEXT: leal (,%ecx,4), %eax
-; X32-NEXT: leal (%eax,%eax,2), %ebx
+; X32-NEXT: leal (%eax,%eax,2), %edi
+; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: shll $4, %ebx
 ; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: .p2align 4
 ; X32-NEXT: .LBB1_1: # %loop
@@ -130,9 +130,9 @@ define i32 @user(ptr %a, ptr %b, i32 %x) nounwind {
 ; X32-NEXT: addl (%esi), %eax
 ; X32-NEXT: addl (%esi,%ecx,4), %eax
 ; X32-NEXT: addl (%esi,%ecx,8), %eax
-; X32-NEXT: addl (%esi,%ebx), %eax
+; X32-NEXT: addl (%esi,%edi), %eax
 ; X32-NEXT: movl %eax, (%esi)
-; X32-NEXT: addl %edi, %esi
+; X32-NEXT: addl %ebx, %esi
 ; X32-NEXT: cmpl %edx, %esi
 ; X32-NEXT: jne .LBB1_1
 ; X32-NEXT: # %bb.2: # %exit
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/missing-phi-operand-update.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/missing-phi-operand-update.ll
index ae24da06415cc..a1afefd5ea5a7 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/missing-phi-operand-update.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/missing-phi-operand-update.ll
@@ -19,24 +19,24 @@ define i32 @foo(ptr %A, i32 %t) {
 ; CHECK-NEXT: br label [[LOOP_32:%.*]]
 ; CHECK: loop.exit.loopexitsplitsplitsplit:
 ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV1:%.*]], [[IFMERGE_34:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LSR_IV]], -1
 ; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXITSPLITSPLIT:%.*]]
-; CHECK: ifmerge.38.loop.exit.loopexitsplitsplit_crit_edge:
+; CHECK: then.34.loop.exit.loopexitsplitsplit_crit_edge:
 ; CHECK-NEXT: [[LSR_IV_LCSSA10:%.*]] = phi i64 [ [[LSR_IV1]], [[IFMERGE_38:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LSR_IV_LCSSA10]], -2
 ; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXITSPLITSPLIT]]
 ; CHECK: loop.exit.loopexitsplitsplit:
-; CHECK-NEXT: [[INDVARS_IV_LCSSA_PH_PH_PH:%.*]] = phi i64 [ [[LSR_IV_LCSSA10]], [[IFMERGE_38_LOOP_EXIT_LOOPEXITSPLITSPLIT_CRIT_EDGE:%.*]] ], [ [[TMP0]], [[LOOP_EXIT_LOOPEXITSPLITSPLITSPLIT:%.*]] ]
+; CHECK-NEXT: [[INDVARS_IV_LCSSA_PH_PH_PH:%.*]] = phi i64 [ [[TMP0]], [[THEN_34_LOOP_EXIT_LOOPEXITSPLITSPLIT_CRIT_EDGE:%.*]] ], [ [[LSR_IV]], [[LOOP_EXIT_LOOPEXITSPLITSPLITSPLIT:%.*]] ]
 ; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXITSPLIT:%.*]]
-; CHECK: ifmerge.42.loop.exit.loopexitsplit_crit_edge:
+; CHECK: ifmerge.34.loop.exit.loopexitsplit_crit_edge:
 ; CHECK-NEXT: [[LSR_IV_LCSSA11:%.*]] = phi i64 [ [[LSR_IV1]], [[IFMERGE_42:%.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[LSR_IV_LCSSA11]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[LSR_IV_LCSSA11]], -1
 ; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXITSPLIT]]
 ; CHECK: loop.exit.loopexitsplit:
 ; CHECK-NEXT: [[INDVARS_IV_LCSSA_PH_PH:%.*]] = phi i64 [ [[TMP1]], [[IFMERGE_42_LOOP_EXIT_LOOPEXITSPLIT_CRIT_EDGE:%.*]] ], [ [[INDVARS_IV_LCSSA_PH_PH_PH]], [[LOOP_EXIT_LOOPEXITSPLITSPLIT]] ]
 ; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT:%.*]]
-; CHECK: then.34.loop.exit.loopexit_crit_edge:
+; CHECK: ifmerge.42.loop.exit.loopexit_crit_edge:
 ; CHECK-NEXT: [[LSR_IV_LCSSA:%.*]] = phi i64 [ [[LSR_IV1]], [[THEN_34:%.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[LSR_IV_LCSSA]], -2
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[LSR_IV_LCSSA]], 1
 ; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT]]
 ; CHECK: loop.exit.loopexit:
 ; CHECK-NEXT: [[INDVARS_IV_LCSSA_PH:%.*]] = phi i64 [ [[TMP2]], [[THEN_34_LOOP_EXIT_LOOPEXIT_CRIT_EDGE:%.*]] ], [ [[INDVARS_IV_LCSSA_PH_PH]], [[LOOP_EXIT_LOOPEXITSPLIT]] ]
@@ -56,14 +56,14 @@ define i32 @foo(ptr %A, i32 %t) {
 ; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[SCEVGEP7]], i64 -4
 ; CHECK-NEXT: [[GEPLOAD:%.*]] = load i32, ptr [[SCEVGEP8]], align 4
 ; CHECK-NEXT: [[CMP_34:%.*]] = icmp sgt i32 [[GEPLOAD]], [[T]]
-; CHECK-NEXT: br i1 [[CMP_34]], label [[THEN_34]], label [[IFMERGE_34]]
+; CHECK-NEXT: br i1 [[CMP_34]], label [[IFMERGE_38]], label [[IFMERGE_42]]
 ; CHECK: then.34:
 ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[LSR_IV1]], 2
 ; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
 ; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[SCEVGEP5]], i64 -8
 ; CHECK-NEXT: [[GEPLOAD18:%.*]] = load i32, ptr [[SCEVGEP6]], align 4
 ; CHECK-NEXT: [[CMP_35:%.*]] = icmp slt i32 [[GEPLOAD18]], [[T]]
-; CHECK-NEXT: br i1 [[CMP_35]], label [[THEN_34_LOOP_EXIT_LOOPEXIT_CRIT_EDGE]], label [[IFMERGE_34]]
+; CHECK-NEXT: br i1 [[CMP_35]], label [[THEN_34_LOOP_EXIT_LOOPEXITSPLITSPLIT_CRIT_EDGE]], label [[IFMERGE_42]]
 ; CHECK: ifmerge.34:
 ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[LSR_IV1]], 2
 ; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
@@ -71,7 +71,7 @@ define i32 @foo(ptr %A, i32 %t) {
 ; CHECK-NEXT: [[CMP_38:%.*]] = icmp sgt i32 [[GEPLOAD20]], [[T]]
 ; CHECK-NEXT: [[CMP_39:%.*]] = icmp slt i32 [[GEPLOAD]], [[T]]
 ; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP_38]], [[CMP_39]]
-; CHECK-NEXT: br i1 [[OR_COND]], label [[LOOP_EXIT_LOOPEXITSPLITSPLITSPLIT]], label [[IFMERGE_38]]
+; CHECK-NEXT: br i1 [[OR_COND]], label [[IFMERGE_42_LOOP_EXIT_LOOPEXITSPLIT_CRIT_EDGE]], label [[IFMERGE_34]]
 ; CHECK: ifmerge.38:
 ; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[LSR_IV1]], 2
 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
@@ -80,7 +80,7 @@ define i32 @foo(ptr %A, i32 %t) {
 ; CHECK-NEXT: [[CMP_42:%.*]] = icmp sgt i32 [[GEPLOAD24]], [[T]]
 ; CHECK-NEXT: [[CMP_43:%.*]] = icmp slt i32 [[GEPLOAD20]], [[T]]
 ; CHECK-NEXT: [[OR_COND55:%.*]] = and i1 [[CMP_42]], [[CMP_43]]
-; CHECK-NEXT: br i1 [[OR_COND55]], label [[IFMERGE_38_LOOP_EXIT_LOOPEXITSPLITSPLIT_CRIT_EDGE]], label [[IFMERGE_42]]
+; CHECK-NEXT: br i1 [[OR_COND55]], label [[LOOP_EXIT_LOOPEXITSPLITSPLITSPLIT]], label [[THEN_34]]
 ; CHECK: ifmerge.42:
 ; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[LSR_IV1]], 2
 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP7]]
@@ -89,7 +89,7 @@ define i32 @foo(ptr %A, i32 %t) {
 ; CHECK-NEXT: [[CMP_46:%.*]] = icmp sgt i32 [[GEPLOAD28]], [[T]]
 ; CHECK-NEXT: [[CMP_47:%.*]] = icmp slt i32 [[GEPLOAD24]], [[T]]
 ; CHECK-NEXT: [[OR_COND56:%.*]] = and i1 [[CMP_46]], [[CMP_47]]
-; CHECK-NEXT: br i1 [[OR_COND56]], label [[IFMERGE_42_LOOP_EXIT_LOOPEXITSPLIT_CRIT_EDGE]], label [[IFMERGE_46]]
+; CHECK-NEXT: br i1 [[OR_COND56]], label [[THEN_34_LOOP_EXIT_LOOPEXIT_CRIT_EDGE]], label [[IFMERGE_46]]
 ; CHECK: ifmerge.46:
 ; CHECK-NEXT: [[NEXTIVLOOP_32]] = add nuw nsw i64 [[I1_I64_0]], 1
 ; CHECK-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i64 [[LSR_IV1]], 4
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
index ecc34e9dc9f5d..7da6633aaa7a9 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
@@ -14,33 +14,33 @@ define void @foo(i32 %size, i32 %nsteps, i32 %hsize, ptr %lined, ptr %maxarray)
 ; CHECK-NEXT: [[CMP215:%.*]] = icmp sgt i32 [[SIZE:%.*]], 1
 ; CHECK-NEXT: [[T0:%.*]] = zext i32 [[SIZE]] to i64
 ; CHECK-NEXT: [[T1:%.*]] = sext i32 [[NSTEPS:%.*]] to i64
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[MAXARRAY:%.*]], i64 1
 ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[T0]], -1
+; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[MAXARRAY]], i64 [[TMP0]]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2:%.*]], [[FOR_INC:%.*]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV3:%.*]] = phi ptr [ [[SCEVGEP1:%.*]], [[FOR_INC:%.*]] ], [ [[SCEVGEP]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT3:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT: br i1 [[CMP215]], label [[FOR_BODY2_PREHEADER:%.*]], label [[FOR_INC]]
 ; CHECK: for.body2.preheader:
 ; CHECK-NEXT: br label [[FOR_BODY2:%.*]]
 ; CHECK: for.body2:
-; CHECK-NEXT: [[LSR_IV3:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY2]] ], [ [[MAXARRAY:%.*]], [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY2]] ], [ [[TMP0]], [[FOR_BODY2_PREHEADER]] ]
-; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[LSR_IV3]], i64 1
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY2]] ], [ 0, [[FOR_BODY2_PREHEADER]] ]
+; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[LSR_IV1]]
 ; CHECK-NEXT: [[V1:%.*]] = load i8, ptr [[SCEVGEP6]], align 1
-; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[LSR_IV3]], i64 [[TMP0]]
+; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SCEVGEP7]], i64 [[LSR_IV1]]
 ; CHECK-NEXT: [[V2:%.*]] = load i8, ptr [[SCEVGEP5]], align 1
 ; CHECK-NEXT: [[TMPV:%.*]] = xor i8 [[V1]], [[V2]]
 ; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[LSR_IV3]], i64 [[LSR_IV1]]
 ; CHECK-NEXT: store i8 [[TMPV]], ptr [[SCEVGEP4]], align 1
-; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], -1
-; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV3]], i64 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i64 [[LSR_IV1]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[TMP0]], [[LSR_IV_NEXT]]
 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY2]], label [[FOR_INC_LOOPEXIT:%.*]]
 ; CHECK: for.inc.loopexit:
 ; CHECK-NEXT: br label [[FOR_INC]]
 ; CHECK: for.inc:
 ; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1
-; CHECK-NEXT: [[LSR_IV_NEXT2]] = add nuw nsw i64 [[LSR_IV1]], [[T0]]
+; CHECK-NEXT: [[SCEVGEP1]] = getelementptr i8, ptr [[LSR_IV3]], i64 [[T0]]
 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT3]], [[T1]]
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
 ; CHECK: for.end.loopexit:
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/normalization-during-scev-expansion.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/normalization-during-scev-expansion.ll
index 3ee0833bc1300..292fd5a6cf234 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/normalization-during-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/normalization-during-scev-expansion.ll
@@ -7,38 +7,38 @@ target triple = "x86_64-apple-macos"
 declare i1 @cond()
 
 define ptr @test(ptr %dst, i64 %v4, i64 %v5, i64 %v6, i64 %v7) {
-; CHECK-LABEL: define ptr @test
-; CHECK-SAME: (ptr [[DST:%.*]], i64 [[V4:%.*]], i64 [[V5:%.*]], i64 [[V6:%.*]], i64 [[V7:%.*]]) {
+; CHECK-LABEL: define ptr @test(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[V4:%.*]], i64 [[V5:%.*]], i64 [[V6:%.*]], i64 [[V7:%.*]]) {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[V5]], [[V4]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[V7]], [[V6]]
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], -8
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = shl nsw i64 [[V5]], 3
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[V4]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[V5]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[V7]], 3
-; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[V6]], 3
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[V4]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[V5]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[V7]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[V6]], 3
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], -8
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[V5]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[V5]], [[V4]]
+; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[V7]], [[V6]]
+; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 3
 ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], -8
 ; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[LSR_IV4:%.*]] = phi ptr [ [[SCEVGEP5:%.*]], [[LOOP]] ], [ [[SCEVGEP3]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP1:%.*]], [[LOOP]] ], [ [[SCEVGEP]], [[ENTRY]] ]
-; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[LSR_IV4]], i64 [[TMP9]]
-; CHECK-NEXT: store i64 0, ptr [[SCEVGEP6]], align 8
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP2]]
+; CHECK-NEXT: store i64 0, ptr [[SCEVGEP2]], align 8
 ; CHECK-NEXT: [[C:%.*]] = call i1 @cond()
-; CHECK-NEXT: [[SCEVGEP1]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP6]]
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[TMP1]]
-; CHECK-NEXT: [[SCEVGEP5]] = getelementptr i8, ptr [[LSR_IV4]], i64 [[TMP6]]
+; CHECK-NEXT: [[SCEVGEP1]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP8]]
+; CHECK-NEXT: [[SCEVGEP5]] = getelementptr i8, ptr [[LSR_IV4]], i64 [[TMP8]]
+; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[SCEVGEP5]], i64 [[TMP10]]
 ; CHECK-NEXT: br i1 [[C]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK: exit:
-; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[SCEVGEP2]], [[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[SCEVGEP6]], [[LOOP]] ]
 ; CHECK-NEXT: ret ptr [[RES]]
 ;
 entry:
@@ -64,8 +64,8 @@ exit:
 }
 
 define i32 @test_pr63678(i1 %c) {
-; CHECK-LABEL: define i32 @test_pr63678
-; CHECK-SAME: (i1 [[C:%.*]]) {
+; CHECK-LABEL: define i32 @test_pr63678(
+; CHECK-SAME: i1 [[C:%.*]]) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br label [[LOOP_1_PREHEADER:%.*]]
 ; CHECK: bb:
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/postinc-iv-used-by-urem-and-udiv.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/postinc-iv-used-by-urem-and-udiv.ll
index 838b48aa56906..502f2474fbe93 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/postinc-iv-used-by-urem-and-udiv.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/postinc-iv-used-by-urem-and-udiv.ll
@@ -13,8 +13,8 @@ define i32 @test_pr38847() {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i32 [ [[LSR_IV_NEXT2:%.*]], [[LOOP]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i32 [ [[LSR_IV_NEXT2:%.*]], [[LOOP]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT: [[LSR_IV_NEXT2]] = add nsw i32 [[LSR_IV1]], -1
 ; CHECK-NEXT: [[LSR:%.*]] = trunc i32 [[LSR_IV_NEXT2]] to i8
 ; CHECK-NEXT: call void @use(i64 [[LSR_IV]])
@@ -47,19 +47,20 @@ define i64 @test_pr58039() {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ -4294967213, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ -75, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32
 ; CHECK-NEXT: call void @use.i32(i32 [[TMP2]])
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 4294967295
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i32 [[LSR_IV]], 1
 ; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK: exit:
-; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 [[LSR_IV_NEXT]], 12
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 12
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 4294967221
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[IV_NEXT]]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = mul nsw i64 [[IV_NEXT]], -1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], 83
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[I2:%.*]] = udiv i32 [[TMP1]], 12
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw i32 [[I2]], 12
+; CHECK-NEXT: [[TMP:%.*]] = add i32 [[LSR_IV_NEXT]], [[TMP3]]
 ; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 [[TMP]], 32
 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i64 0, i64 [[IV_NEXT]]
 ; CHECK-NEXT: ret i64 [[SPEC_SELECT]]
@@ -138,7 +139,6 @@ define i64 @test_normalization_failure_in_any_extend(ptr %i, i64 %i1, i8 %i25) {
 ; CHECK: loop.1.header:
 ; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_1_LATCH:%.*]] ]
 ; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[I1]], [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP_1_LATCH]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[IV_2]], 2
 ; CHECK-NEXT: br label [[LOOP_2:%.*]]
 ; CHECK: loop.2:
 ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[LOOP_2]] ], [ 2, [[LOOP_1_HEADER]] ]
@@ -146,34 +146,33 @@ define i64 @test_normalization_failure_in_any_extend(ptr %i, i64 %i1, i8 %i25) {
 ; CHECK-NEXT: [[C_1:%.*]] = icmp sgt i32 [[LSR_IV_NEXT]], 0
 ; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_2]], label [[LOOP_3_PREHEADER:%.*]]
 ; CHECK: loop.3.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[IV_2]], 1
 ; CHECK-NEXT: br label [[LOOP_3:%.*]]
 ; CHECK: loop.3:
-; CHECK-NEXT: [[LSR_IV5:%.*]] = phi i64 [ 0, [[LOOP_3_PREHEADER]] ], [ [[LSR_IV_NEXT6:%.*]], [[LOOP_3]] ]
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ 2, [[LOOP_3_PREHEADER]] ], [ [[LSR_IV_NEXT2:%.*]], [[LOOP_3]] ]
+; CHECK-NEXT: [[LSR_IV3:%.*]] = phi i64 [ 2, [[LOOP_3_PREHEADER]] ], [ [[LSR_IV_NEXT4:%.*]], [[LOOP_3]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[TMP0]], [[LOOP_3_PREHEADER]] ], [ [[LSR_IV_NEXT2:%.*]], [[LOOP_3]] ]
 ; CHECK-NEXT: [[IV_5:%.*]] = phi i32 [ [[IV_5_NEXT:%.*]], [[LOOP_3]] ], [ 1, [[LOOP_3_PREHEADER]] ]
 ; CHECK-NEXT: [[IV_5_NEXT]] = add nsw i32 [[IV_5]], -1
 ; CHECK-NEXT: [[LSR:%.*]] = trunc i32 [[IV_5_NEXT]] to i8
-; CHECK-NEXT: [[LSR_IV_NEXT2]] = add nsw i64 [[LSR_IV1]], -1
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[LSR_IV_NEXT2]] to i32
-; CHECK-NEXT: [[LSR_IV_NEXT6]] = add nsw i64 [[LSR_IV5]], -1
+; CHECK-NEXT: [[LSR_IV_NEXT2]] = add i64 [[LSR_IV1]], 1
+; CHECK-NEXT: [[LSR_IV_NEXT4]] = add nsw i64 [[LSR_IV3]], -1
+; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[LSR_IV_NEXT4]] to i32
 ; CHECK-NEXT: [[C_2:%.*]] = icmp sgt i32 [[TMP]], 0
 ; CHECK-NEXT: br i1 [[C_2]], label [[LOOP_3]], label [[LOOP_1_LATCH]]
 ; CHECK: loop.1.latch:
 ; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i32 [[IV_1]], 1
-; CHECK-NEXT: [[TMP1]] = sub i64 [[TMP0]], [[LSR_IV_NEXT6]]
+; CHECK-NEXT: [[TMP1]] = add i64 [[LSR_IV_NEXT2]], 1
 ; CHECK-NEXT: [[C_3:%.*]] = icmp eq i32 [[IV_1_NEXT]], 8
 ; CHECK-NEXT: br i1 [[C_3]], label [[EXIT:%.*]], label [[LOOP_1_HEADER]]
 ; CHECK: exit:
 ; CHECK-NEXT: call void @use.i32(i32 [[IV_5_NEXT]])
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[IV_2]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], [[LSR_IV_NEXT6]]
-; CHECK-NEXT: call void @use(i64 [[TMP3]])
 ; CHECK-NEXT: call void @use(i64 [[LSR_IV_NEXT2]])
-; CHECK-NEXT: [[TMP4:%.*]] = udiv i32 [[IV_5_NEXT]], 53
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
-; CHECK-NEXT: [[TMP6:%.*]] = mul i8 [[TMP5]], 53
-; CHECK-NEXT: [[TMP7:%.*]] = sub i8 [[LSR]], [[TMP6]]
-; CHECK-NEXT: call void @use.i8(i8 [[TMP7]])
+; CHECK-NEXT: call void @use(i64 [[LSR_IV_NEXT4]])
+; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[IV_5_NEXT]], 53
+; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = mul i8 [[TMP3]], 53
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[LSR]], [[TMP4]]
+; CHECK-NEXT: call void @use.i8(i8 [[TMP5]])
 ; CHECK-NEXT: [[I26:%.*]] = xor i8 [[I25]], 5
 ; CHECK-NEXT: [[I27:%.*]] = zext i8 [[I26]] to i64
 ; CHECK-NEXT: ret i64 [[I27]]
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/pr62660-normalization-failure.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/pr62660-normalization-failure.ll
index 2d9478e476cb1..e6ee9b467c5e0 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/pr62660-normalization-failure.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/pr62660-normalization-failure.ll
@@ -9,18 +9,17 @@ define i64 @test_pr62660() {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ -1, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LSR_IV]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ [[LSR_IV_NEXT1:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[TMP0]] to i32
-; CHECK-NEXT: [[CONV1:%.*]] = and i32 [[TMP]], 65535
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[IV]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT: [[CONV1:%.*]] = and i32 [[TMP1]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], -1
 ; CHECK-NEXT: [[SUB:%.*]] = add i32 [[ADD]], [[CONV1]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 1
+; CHECK-NEXT: [[LSR_IV_NEXT1]] = add nuw nsw i64 [[TMP0]], 1
 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[SUB]], 8
 ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK: exit:
+; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add i64 [[LSR_IV_NEXT1]], -1
 ; CHECK-NEXT: ret i64 [[LSR_IV_NEXT]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopStrengthReduce/depth-limit-overrun.ll b/llvm/test/Transforms/LoopStrengthReduce/depth-limit-overrun.ll
index 9c3698a740992..023813b4d6432 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/depth-limit-overrun.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/depth-limit-overrun.ll
@@ -30,8 +30,8 @@ define void @test(i32 %A, i32 %B, i32 %C) {
 ; DEFAULT-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], [[PHI2]]
 ; DEFAULT-NEXT: br label [[INNER_LOOP:%.*]]
 ; DEFAULT: inner_loop:
-; DEFAULT-NEXT: [[LSR_IV3:%.*]] = phi i32 [ [[LSR_IV_NEXT4:%.*]], [[INNER_LOOP]] ], [ [[TMP6]], [[PREHEADER]] ]
 ; DEFAULT-NEXT: [[LSR_IV1:%.*]] = phi i32 [ [[LSR_IV_NEXT2:%.*]], [[INNER_LOOP]] ], [ [[TMP5]], [[PREHEADER]] ]
+; DEFAULT-NEXT: [[LSR_IV3:%.*]] = phi i32 [ [[LSR_IV_NEXT4:%.*]], [[INNER_LOOP]] ], [ [[TMP6]], [[PREHEADER]] ]
 ; DEFAULT-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ], [ [[TMP3]], [[PREHEADER]] ]
 ; DEFAULT-NEXT: [[PHI5:%.*]] = phi i32 [ [[PHI3]], [[PREHEADER]] ], [ [[I30:%.*]], [[INNER_LOOP]] ]
 ; DEFAULT-NEXT: [[PHI6:%.*]] = phi i32 [ [[PHI2]], [[PREHEADER]] ], [ [[I33:%.*]], [[INNER_LOOP]] ]
@@ -58,8 +58,8 @@ define void @test(i32 %A, i32 %B, i32 %C) {
 ; DEFAULT-NEXT: [[I33]] = add i32 [[PHI6]], -3
 ; DEFAULT-NEXT: [[ITER_SUB]] = add i32 [[ITER]], -1
 ; DEFAULT-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 3
-; DEFAULT-NEXT: [[LSR_IV_NEXT2]] = add i32 [[LSR_IV1]], 3
 ; DEFAULT-NEXT: [[LSR_IV_NEXT4]] = add i32 [[LSR_IV3]], -3
+; DEFAULT-NEXT: [[LSR_IV_NEXT2]] = add i32 [[LSR_IV1]], 3
 ; DEFAULT-NEXT: [[ITER_CMP:%.*]] = icmp eq i32 [[ITER_SUB]], 0
 ; DEFAULT-NEXT: br i1 [[ITER_CMP]], label [[OUTER_TAIL_LOOPEXIT:%.*]], label [[INNER_LOOP]]
 ; DEFAULT: outer_tail.loopexit:
@@ -95,8 +95,8 @@ define void @test(i32 %A, i32 %B, i32 %C) {
 ; LIMIT-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], [[PHI2]]
 ; LIMIT-NEXT: br label [[INNER_LOOP:%.*]]
 ; LIMIT: inner_loop:
-; LIMIT-NEXT: [[LSR_IV3:%.*]] = phi i32 [ [[LSR_IV_NEXT4:%.*]], [[INNER_LOOP]] ], [ [[TMP6]], [[PREHEADER]] ]
 ; LIMIT-NEXT: [[LSR_IV1:%.*]] = phi i32 [ [[LSR_IV_NEXT2:%.*]], [[INNER_LOOP]] ], [ [[TMP5]], [[PREHEADER]] ]
+; LIMIT-NEXT: [[LSR_IV3:%.*]] = phi i32 [ [[LSR_IV_NEXT4:%.*]], [[INNER_LOOP]] ], [ [[TMP6]], [[PREHEADER]] ]
 ; LIMIT-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ], [ [[TMP3]], [[PREHEADER]] ]
 ; LIMIT-NEXT: [[PHI5:%.*]] = phi i32 [ [[PHI3]], [[PREHEADER]] ], [ [[I30:%.*]], [[INNER_LOOP]] ]
 ; LIMIT-NEXT: [[PHI6:%.*]] = phi i32 [ [[PHI2]], [[PREHEADER]] ], [ [[I33:%.*]], [[INNER_LOOP]] ]
@@ -123,8 +123,8 @@ define void @test(i32 %A, i32 %B, i32 %C) {
 ; LIMIT-NEXT: [[I33]] = add i32 [[PHI6]], -3
 ; LIMIT-NEXT: [[ITER_SUB]] = add i32 [[ITER]], -1
 ; LIMIT-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 3
-; LIMIT-NEXT: [[LSR_IV_NEXT2]] = add i32 [[LSR_IV1]], 3
 ; LIMIT-NEXT: [[LSR_IV_NEXT4]] = add i32 [[LSR_IV3]], -3
+; LIMIT-NEXT: [[LSR_IV_NEXT2]] = add i32 [[LSR_IV1]], 3
 ; LIMIT-NEXT: [[ITER_CMP:%.*]] = icmp eq i32 [[ITER_SUB]], 0
 ; LIMIT-NEXT: br i1 [[ITER_CMP]], label [[OUTER_TAIL_LOOPEXIT:%.*]], label [[INNER_LOOP]]
 ; LIMIT: outer_tail.loopexit:
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ivincs-hoist.ll b/llvm/test/Transforms/LoopStrengthReduce/ivincs-hoist.ll
index 2a52388d267ac..2dc8116c378e2 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/ivincs-hoist.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ivincs-hoist.ll
@@ -12,13 +12,13 @@ define i32 @test(i32 %c, ptr %a, ptr %b) {
 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[C]] to i64
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond:
-; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[LSR_IV:%.*]], i64 4
 ; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = add nsw i64 [[LSR_IV1:%.*]], -1
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[LSR_IV:%.*]], i64 4
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[RETURN_LOOPEXIT:%.*]], label [[FOR_BODY]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[LSR_IV1]] = phi i64 [ [[LSR_IV_NEXT]], [[FOR_COND:%.*]] ], [ [[WIDE_TRIP_COUNT]], [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[LSR_IV]] = phi ptr [ [[UGLYGEP]], [[FOR_COND]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[LSR_IV]] = phi ptr [ [[UGLYGEP]], [[FOR_COND:%.*]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[LSR_IV1]] = phi i64 [ [[LSR_IV_NEXT]], [[FOR_COND]] ], [ [[WIDE_TRIP_COUNT]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LSR_IV]], align 4
 ; CHECK-NEXT: [[TOBOOL3_NOT:%.*]] = icmp eq i32 [[VAL]], 0
 ; CHECK-NEXT: br i1 [[TOBOOL3_NOT]], label [[FOR_COND]], label [[RETURN_LOOPEXIT]]
diff --git a/llvm/test/Transforms/LoopStrengthReduce/lsr-overflow.ll b/llvm/test/Transforms/LoopStrengthReduce/lsr-overflow.ll
index 0bfc62e6d7711..475e80c783448 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/lsr-overflow.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/lsr-overflow.ll
@@ -6,16 +6,16 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @overflow1(i64 %a) {
 ; CHECK-LABEL: @overflow1(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A:%.*]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[A]], -9223372036854775808
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[A:%.*]], -9223372036854775808
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A]], -1
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2:%.*]], [[BB1]] ], [ [[TMP1]], [[BB:%.*]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[BB1]] ], [ [[TMP0]], [[BB]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[BB1]] ], [ [[TMP0]], [[BB:%.*]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2:%.*]], [[BB1]] ], [ [[TMP1]], [[BB]] ]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[LSR_IV1]], 0
 ; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP4]], true
-; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], 1
 ; CHECK-NEXT: [[LSR_IV_NEXT2]] = add i64 [[LSR_IV1]], 1
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], 1
 ; CHECK-NEXT: br i1 [[TMP5]], label [[BB1]], label [[BB7:%.*]]
 ; CHECK: bb7:
 ; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[LSR_IV_NEXT]], 1
diff --git a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold-negative-testcase.ll b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold-negative-testcase.ll
index 89ddba3343ffa..e79c6464e1f49 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold-negative-testcase.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold-negative-testcase.ll
@@ -143,15 +143,15 @@ define void @NonSCEVableIV(float %init, ptr %A, i32 %N) {
 ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @fp_inc, align 4
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[A]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[A]], [[ENTRY]] ]
 ; CHECK-NEXT: [[X_05:%.*]] = phi float [ [[INIT]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: store float [[X_05]], ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: store float [[X_05]], ptr [[LSR_IV]], align 4
 ; CHECK-NEXT: [[ADD]] = fsub float [[X_05]], [[TMP0]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[LSR_IV]] to i32
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[LSR_IV1]] to i32
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i64 [[LSR_IV]], 1
-; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nuw nsw i64 [[LSR_IV1]], 1
 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/LoopStrengthReduce/uglygep.ll b/llvm/test/Transforms/LoopStrengthReduce/uglygep.ll
index 752ea1c698afb..f38438c4bc2bf 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/uglygep.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/uglygep.ll
@@ -9,9 +9,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; Check that LSR hoists %t2 computation outside the loop,
 ; folds %t3's add within the address
 ; and uses the induction variable (%t4) to access the right element.
-define void @test(ptr %ptr.i8, ptr %ptr.float) {
+define void @test(ptr %ptr.i8, ptr %ptr.float, i8 %param) {
 ; CHECK-LABEL: define void @test
-; CHECK-SAME: (ptr [[PTR_I8:%.*]], ptr [[PTR_FLOAT:%.*]]) {
+; CHECK-SAME: (ptr [[PTR_I8:%.*]], ptr [[PTR_FLOAT:%.*]], i8 [[PARAM:%.*]]) {
 ; CHECK-NEXT: bb:
 ; CHECK-NEXT: br label [[BB3:%.*]]
 ; CHECK: bb1:
@@ -24,14 +24,14 @@ define void @test(ptr %ptr.i8, ptr %ptr.float) {
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb10:
 ; CHECK-NEXT: [[T7:%.*]] = icmp eq i64 [[T4]], 0
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[PTR_I8]], i64 [[T4]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[PTR_I8]], i64 [[T4]]
 ; CHECK-NEXT: br label [[BB14:%.*]]
 ; CHECK: bb14:
-; CHECK-NEXT: store i8 undef, ptr [[SCEVGEP]], align 1
+; CHECK-NEXT: store i8 [[PARAM]], ptr [[SCEVGEP2]], align 1
 ; CHECK-NEXT: [[T6:%.*]] = load ptr, ptr [[PTR_FLOAT]], align 8
-; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[T6]], i64 16
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[T4]]
-; CHECK-NEXT: store i8 undef, ptr [[SCEVGEP2]], align 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[T6]], i64 16
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[T4]]
+; CHECK-NEXT: store i8 [[PARAM]], ptr [[SCEVGEP1]], align 1
 ; CHECK-NEXT: br label [[BB14]]
 ;
 bb:
@@ -55,19 +55,19 @@ bb10: ; preds = %bb9
 bb14: ; preds = %bb14, %bb10
 %t2 = getelementptr inbounds i8, ptr %ptr.i8, i64 %t4 ; [#uses=1]
- store i8 undef, ptr %t2
+ store i8 %param, ptr %t2
 %t6 = load ptr, ptr %ptr.float
 %t9 = getelementptr inbounds i8, ptr %t6, i64 %t3 ; [#uses=1]
- store i8 undef, ptr %t9
+ store i8 %param, ptr %t9
 br label %bb14
 }
 
 ; Check that induction variable is initialized to -2.
 ; IVNEXT covers the uses of %i0 and %t0.
 ; Therefore, %t0 should be removed and the critical edge must be split.
-define fastcc void @TransformLine() nounwind {
+define fastcc void @TransformLine(i32 %param) nounwind {
 ; CHECK-LABEL: define fastcc void @TransformLine
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: (i32 [[PARAM:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: bb:
 ; CHECK-NEXT: br label [[LOOP0:%.*]]
 ; CHECK: loop0:
@@ -78,27 +78,21 @@ define fastcc void @TransformLine(i32 %param) nounwind {
 ; CHECK-NEXT: br label [[LOOP1:%.*]]
 ; CHECK: loop1:
 ; CHECK-NEXT: [[I1:%.*]] = phi i32 [ 0, [[BB0]] ], [ [[I1_NEXT:%.*]], [[BB5:%.*]] ]
-; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[LOOP1_BB6_CRIT_EDGE:%.*]]
+; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB6SPLIT:%.*]]
 ; CHECK: bb2:
-; CHECK-NEXT: br i1 true, label [[BB6SPLITSPLIT:%.*]], label [[BB5]]
+; CHECK-NEXT: br i1 true, label [[BB6SPLIT]], label [[BB5]]
 ; CHECK: bb5:
 ; CHECK-NEXT: [[I1_NEXT]] = add i32 [[I1]], 1
-; CHECK-NEXT: br i1 true, label [[BB5_BB6SPLIT_CRIT_EDGE:%.*]], label [[LOOP1]]
-; CHECK: bb5.bb6split_crit_edge:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LSR_IV_NEXT]], [[I1_NEXT]]
-; CHECK-NEXT: br label [[BB6SPLIT:%.*]]
-; CHECK: bb6splitsplit:
-; CHECK-NEXT: br label [[BB6SPLIT]]
+; CHECK-NEXT: br i1 true, label [[BB5_BB6_CRIT_EDGE:%.*]], label [[LOOP1]]
 ; CHECK: bb6split:
-; CHECK-NEXT: [[P8_PH:%.*]] = phi i32 [ [[TMP0]], [[BB5_BB6SPLIT_CRIT_EDGE]] ], [ undef, [[BB6SPLITSPLIT]] ]
-; CHECK-NEXT: [[P9_PH:%.*]] = phi i32 [ undef, [[BB5_BB6SPLIT_CRIT_EDGE]] ], [ [[I1]], [[BB6SPLITSPLIT]] ]
+; CHECK-NEXT: [[I1_LCSSA:%.*]] = phi i32 [ [[I1]], [[LOOP1]] ], [ [[I1]], [[BB2]] ]
 ; CHECK-NEXT: br label [[BB6:%.*]]
-; CHECK: loop1.bb6_crit_edge:
-; CHECK-NEXT: [[I1_LCSSA:%.*]] = phi i32 [ [[I1]], [[LOOP1]] ]
+; CHECK: bb5.bb6_crit_edge:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LSR_IV_NEXT]], [[I1_NEXT]]
 ; CHECK-NEXT: br label [[BB6]]
 ; CHECK: bb6:
-; CHECK-NEXT: [[P8:%.*]] = phi i32 [ undef, [[LOOP1_BB6_CRIT_EDGE]] ], [ [[P8_PH]], [[BB6SPLIT]] ]
-; CHECK-NEXT: [[P9:%.*]] = phi i32 [ [[I1_LCSSA]], [[LOOP1_BB6_CRIT_EDGE]] ], [ [[P9_PH]], [[BB6SPLIT]] ]
+; CHECK-NEXT: [[P8:%.*]] = phi i32 [ [[TMP0]], [[BB5_BB6_CRIT_EDGE]] ], [ [[PARAM]], [[BB6SPLIT]] ]
+; CHECK-NEXT: [[P9:%.*]] = phi i32 [ [[PARAM]], [[BB5_BB6_CRIT_EDGE]] ], [ [[I1_LCSSA]], [[BB6SPLIT]] ]
 ; CHECK-NEXT: unreachable
 ;
 bb:
@@ -125,7 +119,7 @@ bb5: ; preds = %bb2
 br i1 true, label %bb6, label %loop1
 
 bb6: ; preds = %bb5, %bb2, %loop1
-%p8 = phi i32 [ %t0, %bb5 ], [ undef, %loop1 ], [ undef, %bb2 ] ; [#uses=0]
-%p9 = phi i32 [ undef, %bb5 ], [ %i1, %loop1 ], [ %i1, %bb2 ] ; [#uses=0]
+%p8 = phi i32 [ %t0, %bb5 ], [ %param, %loop1 ], [ %param, %bb2 ] ; [#uses=0]
+%p9 = phi i32 [ %param, %bb5 ], [ %i1, %loop1 ], [ %i1, %bb2 ] ; [#uses=0]
 unreachable
 }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/wrong-hoisting-iv.ll b/llvm/test/Transforms/LoopStrengthReduce/wrong-hoisting-iv.ll
index 502042eaf9b9c..f7d4f8938a8ac 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/wrong-hoisting-iv.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/wrong-hoisting-iv.ll
@@ -24,10 +24,10 @@ define void @test1() {
 ; CHECK-NEXT: [[TMP5:%.*]] = shl i32 [[VAL6]], 3
 ; CHECK-NEXT: br label [[BB7:%.*]]
 ; CHECK: bb7:
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i32 [ [[LSR_IV_NEXT2:%.*]], [[BB32:%.*]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[BB32]] ], [ -8, [[BB]] ]
-; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 8
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[BB32:%.*]] ], [ -8, [[BB:%.*]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i32 [ [[LSR_IV_NEXT2:%.*]], [[BB32]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT: [[LSR_IV_NEXT2]] = add nuw nsw i32 [[LSR_IV1]], [[TMP5]]
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 8
 ; CHECK-NEXT: [[VAL10:%.*]] = icmp ult i64 [[LSR_IV_NEXT]], 65536
 ; CHECK-NEXT: br i1 [[VAL10]], label [[BB12:%.*]], label [[BB11:%.*]]
 ; CHECK: bb11:
@@ -36,14 +36,8 @@ define void @test1() {
 ; CHECK-NEXT: [[VAL14:%.*]] = icmp slt i32 undef, undef
 ; CHECK-NEXT: br i1 [[VAL14]], label [[BB17:%.*]], label [[BB12_BB15SPLITSPLITSPLITSPLITSPLIT_CRIT_EDGE:%.*]]
 ; CHECK: bb15splitsplitsplitsplitsplitsplit:
-; CHECK-NEXT: br label [[BB15SPLITSPLITSPLITSPLITSPLIT:%.*]]
-; CHECK: bb12.bb15splitsplitsplitsplitsplit_crit_edge:
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[VAL6]], [[LSR_IV1]]
-; CHECK-NEXT: br label [[BB15SPLITSPLITSPLITSPLITSPLIT]]
-; CHECK: bb15splitsplitsplitsplitsplit:
-; CHECK-NEXT: [[VAL16_PH_PH_PH_PH_PH:%.*]] = phi i32 [ [[TMP6]], [[BB12_BB15SPLITSPLITSPLITSPLITSPLIT_CRIT_EDGE]] ], [ [[VAL35:%.*]], [[BB15SPLITSPLITSPLITSPLITSPLITSPLIT:%.*]] ]
 ; CHECK-NEXT: br label [[BB15SPLITSPLITSPLITSPLIT:%.*]]
-; CHECK: bb17.bb15splitsplitsplitsplit_crit_edge:
+; CHECK: bb17.bb15splitsplitsplitsplitsplit_crit_edge:
 ; CHECK-NEXT: [[TMP7:%.*]] = shl i32 [[VAL]], 1
 ; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[VAL1]], [[VAL2]]
 ; CHECK-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 1
@@ -52,10 +46,10 @@ define void @test1() {
 ; CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[TMP10]], [[TMP11]]
 ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[LSR_IV1]]
 ; CHECK-NEXT: br label [[BB15SPLITSPLITSPLITSPLIT]]
-; CHECK: bb15splitsplitsplitsplit:
-; CHECK-NEXT: [[VAL16_PH_PH_PH_PH:%.*]] = phi i32 [ [[TMP13]], [[BB17_BB15SPLITSPLITSPLITSPLIT_CRIT_EDGE:%.*]] ], [ [[VAL16_PH_PH_PH_PH_PH]], [[BB15SPLITSPLITSPLITSPLITSPLIT]] ]
+; CHECK: bb15splitsplitsplitsplitsplit:
+; CHECK-NEXT: [[VAL16_PH_PH_PH_PH:%.*]] = phi i32 [ [[TMP13]], [[BB17_BB15SPLITSPLITSPLITSPLIT_CRIT_EDGE:%.*]] ], [ [[VAL16_PH_PH_PH_PH_PH:%.*]], [[BB15SPLITSPLITSPLITSPLITSPLIT:%.*]] ]
 ; CHECK-NEXT: br label [[BB15SPLITSPLITSPLIT:%.*]]
-; CHECK: bb20.bb15splitsplitsplit_crit_edge:
+; CHECK: bb20.bb15splitsplitsplitsplit_crit_edge:
 ; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[VAL]], 3
 ; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[VAL1]], [[VAL2]]
 ; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 3
@@ -64,10 +58,10 @@ define void @test1() {
 ; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP17]], [[TMP18]]
 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], [[LSR_IV1]]
 ; CHECK-NEXT: br label [[BB15SPLITSPLITSPLIT]]
-; CHECK: bb15splitsplitsplit:
+; CHECK: bb15splitsplitsplitsplit:
 ; CHECK-NEXT: [[VAL16_PH_PH_PH:%.*]] = phi i32 [ [[TMP20]], [[BB20_BB15SPLITSPLITSPLIT_CRIT_EDGE:%.*]] ], [ [[VAL16_PH_PH_PH_PH]], [[BB15SPLITSPLITSPLITSPLIT]] ]
 ; CHECK-NEXT: br label [[BB15SPLITSPLIT:%.*]]
-; CHECK: bb23.bb15splitsplit_crit_edge:
+; CHECK: bb23.bb15splitsplitsplit_crit_edge:
 ; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[VAL]], 2
 ; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[VAL1]], [[VAL2]]
 ; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 2
@@ -76,10 +70,10 @@ define void @test1() {
 ; CHECK-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]]
 ; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP26]], [[LSR_IV1]]
 ; CHECK-NEXT: br label [[BB15SPLITSPLIT]]
-; CHECK: bb15splitsplit:
+; CHECK: bb15splitsplitsplit:
 ; CHECK-NEXT: [[VAL16_PH_PH:%.*]] = phi i32 [ [[TMP27]], [[BB23_BB15SPLITSPLIT_CRIT_EDGE:%.*]] ], [ [[VAL16_PH_PH_PH]], [[BB15SPLITSPLITSPLIT]] ]
 ; CHECK-NEXT: br label [[BB15SPLIT:%.*]]
-; CHECK: bb26.bb15split_crit_edge:
+; CHECK: bb26.bb15splitsplit_crit_edge:
 ; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[VAL]], 5
 ; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[VAL1]], [[VAL2]]
 ; CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 5
@@ -88,10 +82,10 @@ define void @test1() {
 ; CHECK-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP32]]
 ; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP33]], [[LSR_IV1]]
 ; CHECK-NEXT: br label [[BB15SPLIT]]
-; CHECK: bb15split:
+; CHECK: bb15splitsplit:
 ; CHECK-NEXT: [[VAL16_PH:%.*]] = phi i32 [ [[TMP34]], [[BB26_BB15SPLIT_CRIT_EDGE:%.*]] ], [ [[VAL16_PH_PH]], [[BB15SPLITSPLIT]] ]
 ; CHECK-NEXT: br label [[BB15:%.*]]
-; CHECK: bb29.bb15_crit_edge:
+; CHECK: bb29.bb15split_crit_edge:
 ; CHECK-NEXT: [[TMP35:%.*]] = mul i32 [[VAL]], 6
 ; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[VAL1]], [[VAL2]]
 ; CHECK-NEXT: [[TMP37:%.*]] = mul i32 [[TMP36]], 6
@@ -100,8 +94,14 @@ define void @test1() {
 ; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP38]], [[TMP39]]
 ; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP40]], [[LSR_IV1]]
 ; CHECK-NEXT: br label [[BB15]]
+; CHECK: bb15split:
+; CHECK-NEXT: [[VAL16_PH1:%.*]] = phi i32 [ [[TMP41]], [[BB29_BB15_CRIT_EDGE:%.*]] ], [ [[VAL16_PH]], [[BB15SPLIT]] ]
+; CHECK-NEXT: br label [[BB16:%.*]]
+; CHECK: bb12.bb15_crit_edge:
+; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[VAL6]], [[LSR_IV1]]
+; CHECK-NEXT: br label [[BB16]]
 ; CHECK: bb15:
-; CHECK-NEXT: [[VAL16:%.*]] = phi i32 [ [[TMP41]], [[BB29_BB15_CRIT_EDGE:%.*]] ], [ [[VAL16_PH]], [[BB15SPLIT]] ]
+; CHECK-NEXT: [[VAL16:%.*]] = phi i32 [ [[TMP43]], [[BB12_BB15SPLITSPLITSPLITSPLITSPLIT_CRIT_EDGE]] ], [ [[VAL16_PH1]], [[BB15]] ]
 ; CHECK-NEXT: call void @widget() [ "deopt"(i32 [[VAL16]], i32 3, i32 [[VAL]]) ]
 ; CHECK-NEXT: unreachable
 ; CHECK: bb17:
@@ -121,8 +121,8 @@ define void @test1() {
 ; CHECK-NEXT: br i1 [[VAL31]], label [[BB32]], label [[BB29_BB15_CRIT_EDGE]]
 ; CHECK: bb32:
 ; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP4]], [[LSR_IV1]]
-; CHECK-NEXT: [[VAL35]] = add i32 [[TMP42]], [[VAL6]]
-; CHECK-NEXT: br i1 false, label [[BB7]], label [[BB15SPLITSPLITSPLITSPLITSPLITSPLIT]]
+; CHECK-NEXT: [[VAL16_PH_PH_PH_PH_PH]] = add i32 [[TMP42]], [[VAL6]]
+; CHECK-NEXT: br i1 false, label [[BB7]], label [[BB15SPLITSPLITSPLITSPLITSPLIT]]
 ;
 bb:
 %val = load i32, ptr addrspace(3) undef, align 4