From 5851544d4484f373045dac52f7bf45e32ba5cdb9 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 8 May 2025 17:37:45 +0000 Subject: [PATCH 1/5] [AArch64][SME] Support split ZPR and PPR area allocation For a while we have supported the `-aarch64-stack-hazard-size=` option, which adds "hazard padding" between GPRs and FPR/ZPRs. However, there is currently a hole in this mitigation as PPR and FPR/ZPR accesses to the same area also cause streaming memory hazards (this is noted by `-pass-remarks-analysis=sme -aarch64-stack-hazard-remark-size=`), and the current stack layout places PPRs and ZPRs within the same area, which looks like: ------------------------------------ Higher address | callee-saved gpr registers | |---------------------------------- | | lr,fp (a.k.a. "frame record") | |-----------------------------------| <- fp(=x29) | | |-----------------------------------| | callee-saved fp/simd/SVE regs | |-----------------------------------| | SVE stack objects | |-----------------------------------| | local variables of fixed size | | | | | | | ------------------------------------| <- sp | Lower address With this patch the stack (and hazard padding) is rearranged so that hazard padding is placed between the PPRs and ZPRs rather than within the (fixed size) callee-save region, which looks something like this: ------------------------------------ Higher address | callee-saved gpr registers | |---------------------------------- | | lr,fp (a.k.a. 
"frame record") | |-----------------------------------| <- fp(=x29) | callee-saved PPRs | | PPR stack objects | (These are SVE predicates) |-----------------------------------| | | |-----------------------------------| | callee-saved ZPR regs | (These are SVE vectors) | ZPR stack objects | Note: FPRs are promoted to ZPRs |-----------------------------------| | local variables of fixed size | | | | | | | ------------------------------------| <- sp | Lower address This layout is only enabled if: * SplitSVEObjects is enabled (`-aarch64-split-sve-objects`) - (This may be enabled by default in a later patch) * Streaming memory hazards are present - (`-aarch64-stack-hazard-size=` != 0) * PPRs and FPRs/ZPRs are on the stack * There's no stack realignment or variable-sized objects - This is left as a TODO for now Additionally, any FPR callee-saves that are present will be promoted to ZPRs. This is to prevent stack hazards between FPRs and GPRs in the fixed size callee-save area (which would otherwise require more hazard padding, or moving the FPR callee-saves). This layout should resolve the hole in the hazard padding mitigation, and is not intended to change codegen for non-SME code. 
--- .../Target/AArch64/AArch64FrameLowering.cpp | 633 +++++++++---- .../lib/Target/AArch64/AArch64FrameLowering.h | 3 +- .../AArch64/AArch64MachineFunctionInfo.h | 7 + .../CodeGen/AArch64/framelayout-split-sve.mir | 526 +++++++++++ .../AArch64/spill-fill-zpr-predicates.mir | 17 +- .../AArch64/split-sve-stack-frame-layout.ll | 751 +++++++++++++++ llvm/test/CodeGen/AArch64/stack-hazard.ll | 868 +++++++++++------- .../CodeGen/AArch64/sve-stack-frame-layout.ll | 2 - 8 files changed, 2279 insertions(+), 528 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/framelayout-split-sve.mir create mode 100644 llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 4da635676fba4..87d065b8d07e7 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -274,6 +274,11 @@ static cl::opt OrderFrameObjects("aarch64-order-frame-objects", cl::desc("sort stack allocations"), cl::init(true), cl::Hidden); +static cl::opt + SplitSVEObjects("aarch64-split-sve-objects", + cl::desc("Split allocation of ZPR & PPR objects"), + cl::init(false), cl::Hidden); + cl::opt EnableHomogeneousPrologEpilog( "homogeneous-prolog-epilog", cl::Hidden, cl::desc("Emit homogeneous prologue and epilogue for the size " @@ -339,8 +344,7 @@ enum class AssignObjectOffsets { No, Yes }; /// each object. If AssignOffsets is "Yes", the offsets get assigned (and SVE /// stack sizes set). Returns the size of the SVE stack. 
static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, - AssignObjectOffsets AssignOffsets, - bool SplitSVEObjects = false); + AssignObjectOffsets AssignOffsets); static unsigned getStackHazardSize(const MachineFunction &MF) { return MF.getSubtarget().getStreamingHazardSize(); @@ -352,10 +356,13 @@ static StackOffset getZPRStackSize(const MachineFunction &MF) { return StackOffset::getScalable(AFI->getStackSizeZPR()); } -/// Returns the size of the entire PPR stackframe (calleesaves + spills). +/// Returns the size of the entire PPR stackframe (calleesaves + spills + hazard +/// padding). static StackOffset getPPRStackSize(const MachineFunction &MF) { const AArch64FunctionInfo *AFI = MF.getInfo(); - return StackOffset::getScalable(AFI->getStackSizePPR()); + return StackOffset::get(AFI->hasSplitSVEObjects() ? getStackHazardSize(MF) + : 0, + AFI->getStackSizePPR()); } /// Returns the size of the entire SVE stackframe (PPRs + ZPRs). @@ -363,6 +370,11 @@ static StackOffset getSVEStackSize(const MachineFunction &MF) { return getZPRStackSize(MF) + getPPRStackSize(MF); } +static bool hasSVEStackSize(const MachineFunction &MF) { + const AArch64FunctionInfo *AFI = MF.getInfo(); + return AFI->getStackSizeZPR() > 0 || AFI->getStackSizePPR() > 0; +} + /// Returns true if PPRs are spilled as ZPRs. static bool arePPRsSpilledAsZPR(const MachineFunction &MF) { return MF.getSubtarget().getRegisterInfo()->getSpillSize( @@ -531,6 +543,10 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; + const AArch64FunctionInfo *AFI = MF.getInfo(); + if (AFI->hasSplitSVEObjects()) + return false; + // Don't use the red zone if the function explicitly asks us not to. // This is typically used for kernel code. 
const AArch64Subtarget &Subtarget = MF.getSubtarget(); @@ -540,7 +556,6 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { return false; const MachineFrameInfo &MFI = MF.getFrameInfo(); - const AArch64FunctionInfo *AFI = MF.getInfo(); uint64_t NumBytes = AFI->getLocalStackSize(); // If neither NEON or SVE are available, a COPY from one Q-reg to @@ -734,9 +749,11 @@ void AArch64FrameLowering::emitCalleeSavedSVELocations( const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); AArch64FunctionInfo &AFI = *MF.getInfo(); CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); + StackOffset PPRStackSize = getPPRStackSize(MF); for (const auto &Info : CSI) { - if (!MFI.isScalableStackID(Info.getFrameIdx())) + int FI = Info.getFrameIdx(); + if (!MFI.isScalableStackID(FI)) continue; // Not all unwinders may know about SVE registers, so assume the lowest @@ -747,9 +764,13 @@ void AArch64FrameLowering::emitCalleeSavedSVELocations( continue; StackOffset Offset = - StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - + StackOffset::getScalable(MFI.getObjectOffset(FI)) - StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI)); + if (AFI.hasSplitSVEObjects() && + MFI.getStackID(FI) == TargetStackID::ScalableVector) + Offset -= PPRStackSize; + CFIBuilder.insertCFIInst(createCFAOffset(TRI, Reg, Offset)); } } @@ -2324,73 +2345,96 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, StackOffset SVECalleeSavesSize = PPRCalleeSavesSize + ZPRCalleeSavesSize; StackOffset PPRLocalsSize = getPPRStackSize(MF) - PPRCalleeSavesSize; StackOffset ZPRLocalsSize = getZPRStackSize(MF) - ZPRCalleeSavesSize; + std::optional ZPRCalleeSavesBegin, + ZPRCalleeSavesEnd, PPRCalleeSavesBegin, PPRCalleeSavesEnd; StackOffset CFAOffset = StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); if (!FPAfterSVECalleeSaves) { - MachineBasicBlock::iterator ZPRCalleeSavesBegin = MBBI, - ZPRCalleeSavesEnd = MBBI; - MachineBasicBlock::iterator 
PPRCalleeSavesBegin = MBBI, - PPRCalleeSavesEnd = MBBI; - - // Process the SVE callee-saves to determine what space needs to be - // allocated. - + // Process the SVE callee-saves to find the starts/ends of the ZPR and PPR + // areas. + PPRCalleeSavesBegin = MBBI; if (PPRCalleeSavesSize) { LLVM_DEBUG(dbgs() << "PPRCalleeSavedStackSize = " << PPRCalleeSavesSize.getScalable() << "\n"); - PPRCalleeSavesBegin = MBBI; - assert(isPartOfPPRCalleeSaves(PPRCalleeSavesBegin) && + assert(isPartOfPPRCalleeSaves(*PPRCalleeSavesBegin) && "Unexpected instruction"); while (isPartOfPPRCalleeSaves(MBBI) && MBBI != MBB.getFirstTerminator()) ++MBBI; - PPRCalleeSavesEnd = MBBI; } - + PPRCalleeSavesEnd = ZPRCalleeSavesBegin = MBBI; if (ZPRCalleeSavesSize) { LLVM_DEBUG(dbgs() << "ZPRCalleeSavedStackSize = " << ZPRCalleeSavesSize.getScalable() << "\n"); - ZPRCalleeSavesBegin = MBBI; - assert(isPartOfZPRCalleeSaves(ZPRCalleeSavesBegin) && + assert(isPartOfZPRCalleeSaves(*ZPRCalleeSavesBegin) && "Unexpected instruction"); while (isPartOfZPRCalleeSaves(MBBI) && MBBI != MBB.getFirstTerminator()) ++MBBI; - ZPRCalleeSavesEnd = MBBI; } - - // Allocate space for the callee saves (if any). - StackOffset LocalsSize = - PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes); - MachineBasicBlock::iterator CalleeSavesBegin = - AFI->getPPRCalleeSavedStackSize() ? PPRCalleeSavesBegin - : ZPRCalleeSavesBegin; - allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false, - nullptr, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || LocalsSize); - - CalleeSavesEnd = AFI->getZPRCalleeSavedStackSize() ? ZPRCalleeSavesEnd - : PPRCalleeSavesEnd; + ZPRCalleeSavesEnd = CalleeSavesEnd = MBBI; } - CFAOffset += SVECalleeSavesSize; - if (EmitAsyncCFI) emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); - // Allocate space for the rest of the frame including SVE locals. Align the - // stack as necessary. 
- assert(!(canUseRedZone(MF) && NeedsRealignment) && - "Cannot use redzone with stack realignment"); - if (!canUseRedZone(MF)) { - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize; - allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding, - SVELocalsSize + StackOffset::getFixed(NumBytes), - NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, - CFAOffset, MFI.hasVarSizedObjects()); + if (AFI->hasSplitSVEObjects()) { + assert(!FPAfterSVECalleeSaves && + "Cannot use FPAfterSVECalleeSaves with aarch64-split-sve-objects"); + assert(!canUseRedZone(MF) && + "Cannot use redzone with aarch64-split-sve-objects"); + // TODO: Handle HasWinCFI/NeedsWinCFI? + assert(!NeedsWinCFI && + "WinCFI with aarch64-split-sve-objects is not supported"); + + // Split ZPR and PPR allocation. + // Allocate PPR callee saves + allocateStackSpace(MBB, *PPRCalleeSavesBegin, 0, PPRCalleeSavesSize, false, + nullptr, EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || ZPRCalleeSavesSize || + ZPRLocalsSize || PPRLocalsSize); + CFAOffset += PPRCalleeSavesSize; + + // Allocate PPR locals + ZPR callee saves + assert(PPRCalleeSavesEnd == ZPRCalleeSavesBegin && + "Expected ZPR callee saves after PPR locals"); + allocateStackSpace(MBB, *PPRCalleeSavesEnd, RealignmentPadding, + PPRLocalsSize + ZPRCalleeSavesSize, false, nullptr, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || ZPRLocalsSize); + CFAOffset += PPRLocalsSize + ZPRCalleeSavesSize; + + // Allocate ZPR locals + allocateStackSpace(MBB, *ZPRCalleeSavesEnd, RealignmentPadding, + ZPRLocalsSize + StackOffset::getFixed(NumBytes), false, + nullptr, EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects()); + } else { + // Allocate space for the callee saves (if any). 
+ StackOffset LocalsSize = + PPRLocalsSize + ZPRLocalsSize + StackOffset::getFixed(NumBytes); + MachineBasicBlock::iterator CalleeSavesBegin = + PPRCalleeSavesBegin.value_or(MBBI); + if (!FPAfterSVECalleeSaves) + allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false, + nullptr, EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || LocalsSize); + CFAOffset += SVECalleeSavesSize; + + // Allocate space for the rest of the frame including SVE locals. Align the + // stack as necessary. + assert(!(canUseRedZone(MF) && NeedsRealignment) && + "Cannot use redzone with stack realignment"); + if (!canUseRedZone(MF)) { + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + StackOffset SVELocalsSize = PPRLocalsSize + ZPRLocalsSize; + allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding, + SVELocalsSize + StackOffset::getFixed(NumBytes), + NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, + CFAOffset, MFI.hasVarSizedObjects()); + } } // If we need a base pointer, set it up here. It's whatever the value of the @@ -2638,7 +2682,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, } } - StackOffset SVEStackSize = getSVEStackSize(MF); + StackOffset ZPRStackSize = getZPRStackSize(MF); + StackOffset PPRStackSize = getPPRStackSize(MF); + StackOffset SVEStackSize = ZPRStackSize + PPRStackSize; // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { @@ -2659,110 +2705,185 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); - // Process the SVE callee-saves to determine what space needs to be - // deallocated. 
- StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; - MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI; - int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize(); - int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize(); - int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize; + if (!AFI->hasSplitSVEObjects()) { + // Process the SVE callee-saves to determine what space needs to be + // deallocated. + StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; + MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI; + int64_t ZPRCalleeSavedSize = AFI->getZPRCalleeSavedStackSize(); + int64_t PPRCalleeSavedSize = AFI->getPPRCalleeSavedStackSize(); + int64_t SVECalleeSavedSize = ZPRCalleeSavedSize + PPRCalleeSavedSize; + + if (SVECalleeSavedSize) { + if (FPAfterSVECalleeSaves) + RestoreEnd = MBB.getFirstTerminator(); + + RestoreBegin = std::prev(RestoreEnd); + while (RestoreBegin != MBB.begin() && + isPartOfSVECalleeSaves(std::prev(RestoreBegin))) + --RestoreBegin; + + assert(isPartOfSVECalleeSaves(RestoreBegin) && + isPartOfSVECalleeSaves(std::prev(RestoreEnd)) && + "Unexpected instruction"); - if (SVECalleeSavedSize) { - if (FPAfterSVECalleeSaves) - RestoreEnd = MBB.getFirstTerminator(); + StackOffset CalleeSavedSizeAsOffset = + StackOffset::getScalable(SVECalleeSavedSize); + DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; + DeallocateAfter = CalleeSavedSizeAsOffset; + } - RestoreBegin = std::prev(RestoreEnd); - while (RestoreBegin != MBB.begin() && - isPartOfSVECalleeSaves(std::prev(RestoreBegin))) - --RestoreBegin; + // Deallocate the SVE area. + if (FPAfterSVECalleeSaves) { + // If the callee-save area is before FP, restoring the FP implicitly + // deallocates non-callee-save SVE allocations. Otherwise, deallocate + // them explicitly. 
+ if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + DeallocateBefore, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); + } - assert(isPartOfSVECalleeSaves(RestoreBegin) && - isPartOfSVECalleeSaves(std::prev(RestoreEnd)) && - "Unexpected instruction"); + // Deallocate callee-save non-SVE registers. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(AFI->getCalleeSavedStackSize()), + TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI); - StackOffset CalleeSavedSizeAsOffset = - StackOffset::getScalable(SVECalleeSavedSize); - DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; - DeallocateAfter = CalleeSavedSizeAsOffset; - } + // Deallocate fixed objects. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(FixedObject), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI); - // Deallocate the SVE area. - if (FPAfterSVECalleeSaves) { - // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate - // them explicitly. - if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, false, + // Deallocate callee-save SVE registers. + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + DeallocateAfter, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - } - - // Deallocate callee-save non-SVE registers. 
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + } else if (SVEStackSize) { + int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); + // If we have stack realignment or variable-sized objects we must use the + // FP to restore SVE callee saves (as there is an unknown amount of + // data/padding between the SP and SVE CS area). + Register BaseForSVEDealloc = + (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP + : AArch64::SP; + if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { + Register CalleeSaveBase = AArch64::FP; + if (int64_t CalleeSaveBaseOffset = + AFI->getCalleeSaveBaseToFrameRecordOffset()) { + // If we have have an non-zero offset to the non-SVE CS base we need + // to compute the base address by subtracting the offest in a + // temporary register first (to avoid briefly deallocating the SVE + // CS). + CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, + StackOffset::getFixed(-CalleeSaveBaseOffset), TII, + MachineInstr::FrameDestroy); + } + // The code below will deallocate the stack space space by moving the + // SP to the start of the SVE callee-save area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, + StackOffset::getScalable(-SVECalleeSavedSize), TII, + MachineInstr::FrameDestroy); + } else if (BaseForSVEDealloc == AArch64::SP) { + if (SVECalleeSavedSize) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. 
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(NumBytes), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI, EmitCFI && !hasFP(MF), + SVEStackSize + StackOffset::getFixed( + NumBytes + PrologueSaveSize)); + NumBytes = 0; + } - // Deallocate fixed objects. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(FixedObject), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + DeallocateBefore, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF), + SVEStackSize + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); + + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + DeallocateAfter, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF), + DeallocateAfter + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); + } - // Deallocate callee-save SVE registers. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + if (EmitCFI) + emitCalleeSavedSVERestores(MBB, RestoreEnd); + } } else if (SVEStackSize) { - int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize(); - // If we have stack realignment or variable-sized objects we must use the - // FP to restore SVE callee saves (as there is an unknown amount of - // data/padding between the SP and SVE CS area). - Register BaseForSVEDealloc = - (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? 
AArch64::FP - : AArch64::SP; - if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) { - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS). - CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); - } - // The code below will deallocate the stack space space by moving the - // SP to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - StackOffset::getScalable(-SVECalleeSavedSize), TII, - MachineInstr::FrameDestroy); - } else if (BaseForSVEDealloc == AArch64::SP) { - if (SVECalleeSavedSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - emitFrameOffset( - MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF), - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize)); - NumBytes = 0; - } + assert(!AFI->isStackRealigned() && !MFI.hasVarSizedObjects() && + "TODO: Support stack realigment / variable-sized objects"); + // SplitSVEObjects. Determine the sizes and starts/ends of the ZPR and PPR + // areas. 
+ auto ZPRCalleeSavedSize = + StackOffset::getScalable(AFI->getZPRCalleeSavedStackSize()); + auto PPRCalleeSavedSize = + StackOffset::getScalable(AFI->getPPRCalleeSavedStackSize()); + StackOffset PPRLocalsSize = PPRStackSize - PPRCalleeSavedSize; + StackOffset ZPRLocalsSize = ZPRStackSize - ZPRCalleeSavedSize; + + MachineBasicBlock::iterator PPRRestoreBegin = LastPopI, + PPRRestoreEnd = LastPopI; + if (PPRCalleeSavedSize) { + PPRRestoreBegin = std::prev(PPRRestoreEnd); + while (PPRRestoreBegin != MBB.begin() && + isPartOfPPRCalleeSaves(std::prev(PPRRestoreBegin))) + --PPRRestoreBegin; + } - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF), - SVEStackSize + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); + MachineBasicBlock::iterator ZPRRestoreBegin = PPRRestoreBegin, + ZPRRestoreEnd = PPRRestoreBegin; + if (ZPRCalleeSavedSize) { + ZPRRestoreBegin = std::prev(ZPRRestoreEnd); + while (ZPRRestoreBegin != MBB.begin() && + isPartOfZPRCalleeSaves(std::prev(ZPRRestoreBegin))) + --ZPRRestoreBegin; + } - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF), - DeallocateAfter + - StackOffset::getFixed(NumBytes + PrologueSaveSize)); + auto CFAOffset = + SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); + if (PPRCalleeSavedSize || ZPRCalleeSavedSize) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. 
+ auto NonSVELocals = StackOffset::getFixed(NumBytes); + emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, + NonSVELocals, TII, MachineInstr::FrameDestroy, false, + false, nullptr, EmitCFI && !hasFP(MF), CFAOffset); + NumBytes = 0; + CFAOffset -= NonSVELocals; } + + if (ZPRLocalsSize) { + emitFrameOffset(MBB, ZPRRestoreBegin, DL, AArch64::SP, AArch64::SP, + ZPRLocalsSize, TII, MachineInstr::FrameDestroy, false, + false, nullptr, EmitCFI && !hasFP(MF), CFAOffset); + CFAOffset -= ZPRLocalsSize; + } + + if (PPRLocalsSize || ZPRCalleeSavedSize) { + assert(PPRRestoreBegin == ZPRRestoreEnd && + "Expected PPR restores after ZPR"); + emitFrameOffset(MBB, PPRRestoreBegin, DL, AArch64::SP, AArch64::SP, + PPRLocalsSize + ZPRCalleeSavedSize, TII, + MachineInstr::FrameDestroy, false, false, nullptr, + EmitCFI && !hasFP(MF), CFAOffset); + CFAOffset -= PPRLocalsSize + ZPRCalleeSavedSize; + } + if (PPRCalleeSavedSize) { + emitFrameOffset(MBB, PPRRestoreEnd, DL, AArch64::SP, AArch64::SP, + PPRCalleeSavedSize, TII, MachineInstr::FrameDestroy, + false, false, nullptr, EmitCFI && !hasFP(MF), CFAOffset); + } + + // We only emit CFI information for ZPRs so emit CFI after the ZPR restores. 
if (EmitCFI) - emitCalleeSavedSVERestores(MBB, RestoreEnd); + emitCalleeSavedSVERestores(MBB, ZPRRestoreEnd); } if (!hasFP(MF)) { @@ -2887,9 +3008,17 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize(); if (MFI.isScalableStackID(FI)) { if (FPAfterSVECalleeSaves && - -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) + -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) { + assert(!AFI->hasSplitSVEObjects() && + "split-sve-objects not supported with FPAfterSVECalleeSaves"); return StackOffset::getScalable(ObjectOffset); - return StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()), + } + StackOffset AccessOffset{}; + if (AFI->hasSplitSVEObjects() && + MFI.getStackID(FI) == TargetStackID::ScalableVector) + AccessOffset = -PPRStackSize; + return AccessOffset + + StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()), ObjectOffset); } @@ -2953,12 +3082,12 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference( bool isFixed = MFI.isFixedObjectIndex(FI); bool isSVE = MFI.isScalableStackID(FI); return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, - PreferFP, ForSimm); + PreferFP, ForSimm, FI); } StackOffset AArch64FrameLowering::resolveFrameOffsetReference( const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE, - Register &FrameReg, bool PreferFP, bool ForSimm) const { + Register &FrameReg, bool PreferFP, bool ForSimm, int64_t FI) const { const auto &MFI = MF.getFrameInfo(); const auto *RegInfo = static_cast( MF.getSubtarget().getRegisterInfo()); @@ -2970,7 +3099,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI)); - const StackOffset SVEStackSize = getSVEStackSize(MF); + StackOffset ZPRStackSize = getZPRStackSize(MF); + StackOffset PPRStackSize = getPPRStackSize(MF); + StackOffset SVEStackSize = 
ZPRStackSize + PPRStackSize; // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't @@ -3045,17 +3176,26 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize(); if (isSVE) { + StackOffset AccessOffset{}; + if (AFI->hasSplitSVEObjects() && + MFI.getStackID(FI) == TargetStackID::ScalableVector) + AccessOffset = -PPRStackSize; + StackOffset FPOffset = - StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); - StackOffset SPOffset = - SVEStackSize + - StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), + AccessOffset + + StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); if (FPAfterSVECalleeSaves) { assert(-ObjectOffset > (int64_t)AFI->getSVECalleeSavedStackSize() && "Math isn't correct for CSRs with FPAfterSVECalleeSaves"); FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); } + + StackOffset SPOffset = + SVEStackSize + AccessOffset + + StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), + ObjectOffset); + // Always use the FP for SVE spills if available and beneficial. if (hasFP(MF) && (SPOffset.getFixed() || FPOffset.getScalable() < SPOffset.getScalable() || @@ -3063,13 +3203,13 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; } - FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : (unsigned)AArch64::SP; + return SPOffset; } - StackOffset ScalableOffset = {}; + StackOffset SVEAreaOffset = {}; if (FPAfterSVECalleeSaves) { // In this stack layout, the FP is in between the callee saves and other // SVE allocations. 
@@ -3077,25 +3217,25 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); if (UseFP) { if (isFixed) - ScalableOffset = SVECalleeSavedStack; + SVEAreaOffset = SVECalleeSavedStack; else if (!isCSR) - ScalableOffset = SVECalleeSavedStack - SVEStackSize; + SVEAreaOffset = SVECalleeSavedStack - SVEStackSize; } else { if (isFixed) - ScalableOffset = SVEStackSize; + SVEAreaOffset = SVEStackSize; else if (isCSR) - ScalableOffset = SVEStackSize - SVECalleeSavedStack; + SVEAreaOffset = SVEStackSize - SVECalleeSavedStack; } } else { if (UseFP && !(isFixed || isCSR)) - ScalableOffset = -SVEStackSize; + SVEAreaOffset = -SVEStackSize; if (!UseFP && (isFixed || isCSR)) - ScalableOffset = SVEStackSize; + SVEAreaOffset = SVEStackSize; } if (UseFP) { FrameReg = RegInfo->getFrameRegister(MF); - return StackOffset::getFixed(FPOffset) + ScalableOffset; + return StackOffset::getFixed(FPOffset) + SVEAreaOffset; } // Use the base pointer if we have one. @@ -3112,7 +3252,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( Offset -= AFI->getLocalStackSize(); } - return StackOffset::getFixed(Offset) + ScalableOffset; + return StackOffset::getFixed(Offset) + SVEAreaOffset; } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { @@ -3265,14 +3405,25 @@ static void computeCalleeSaveRegisterPairs( RegInc = -1; FirstReg = Count - 1; } + bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize(); - int ScalableByteOffset = FPAfterSVECalleeSaves - ? 
0 - : AFI->getZPRCalleeSavedStackSize() + - AFI->getPPRCalleeSavedStackSize(); + + int ZPRByteOffset = 0; + int PPRByteOffset = 0; + bool SplitPPRs = AFI->hasSplitSVEObjects(); + if (SplitPPRs) { + ZPRByteOffset = AFI->getZPRCalleeSavedStackSize(); + PPRByteOffset = AFI->getPPRCalleeSavedStackSize(); + } else if (!FPAfterSVECalleeSaves) { + ZPRByteOffset = + AFI->getZPRCalleeSavedStackSize() + AFI->getPPRCalleeSavedStackSize(); + // Unused: Everything goes in ZPR space. + PPRByteOffset = 0; + } + bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace(); Register LastReg = 0; - bool HasCSHazardPadding = AFI->hasStackHazardSlotIndex(); + bool HasCSHazardPadding = AFI->hasStackHazardSlotIndex() && !SplitPPRs; // When iterating backwards, the loop condition relies on unsigned wraparound. for (unsigned i = FirstReg; i < Count; i += RegInc) { @@ -3301,6 +3452,10 @@ static void computeCalleeSaveRegisterPairs( llvm_unreachable("Unsupported register class."); } + int &ScalableByteOffset = RPI.Type == RegPairInfo::PPR && SplitPPRs + ? PPRByteOffset + : ZPRByteOffset; + // Add the stack hazard size as we transition from GPR->FPR CSRs. if (HasCSHazardPadding && (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && @@ -3879,6 +4034,13 @@ static std::optional getLdStFrameID(const MachineInstr &MI, return getMMOFrameID(*MI.memoperands_begin(), MFI); } +// Returns true if the LDST MachineInstr \p MI is a PPR access. +static bool isPPRAccess(const MachineInstr &MI) { + return MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO && + MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO && + AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()); +} + // Check if a Hazard slot is needed for the current function, and if so create // one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex, // which can be used to determine if any hazard padding is needed. 
@@ -3902,25 +4064,49 @@ void AArch64FrameLowering::determineStackHazardSlot( bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { return AArch64::FPR64RegClass.contains(Reg) || AArch64::FPR128RegClass.contains(Reg) || - AArch64::ZPRRegClass.contains(Reg) || - AArch64::PPRRegClass.contains(Reg); + AArch64::ZPRRegClass.contains(Reg); + }); + bool HasPPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) { + return AArch64::PPRRegClass.contains(Reg); }); bool HasFPRStackObjects = false; - if (!HasFPRCSRs) { - std::vector FrameObjects(MFI.getObjectIndexEnd()); + bool HasPPRStackObjects = false; + if (!HasFPRCSRs || SplitSVEObjects) { + enum SlotType : uint8_t { + Unknown = 0, + ZPRorFPR = 1 << 0, + PPR = 1 << 1, + GPR = 1 << 2, + LLVM_MARK_AS_BITMASK_ENUM(GPR) + }; + + // Find stack slots solely used for one kind of register (ZPR, PPR, etc.), + // based on the kinds of accesses used in the function. + SmallVector<SlotType> SlotTypes(MFI.getObjectIndexEnd(), SlotType::Unknown); for (auto &MBB : MF) { for (auto &MI : MBB) { std::optional FI = getLdStFrameID(MI, MFI); - if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { - if (MFI.isScalableStackID(*FI) || AArch64InstrInfo::isFpOrNEON(MI)) - FrameObjects[*FI] |= 2; - else - FrameObjects[*FI] |= 1; + if (!FI || *FI < 0 || *FI >= int(SlotTypes.size())) + continue; + bool IsScalable = MFI.isScalableStackID(*FI); + bool IsPPR = IsScalable && isPPRAccess(MI); + if (IsScalable || AArch64InstrInfo::isFpOrNEON(MI)) { + SlotTypes[*FI] |= IsPPR ? SlotType::PPR : SlotType::ZPRorFPR; + } else { + SlotTypes[*FI] |= SlotType::GPR; } } } - HasFPRStackObjects = - any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; }); + + for (int FI = 0; FI < int(SlotTypes.size()); ++FI) { + HasFPRStackObjects |= SlotTypes[FI] == SlotType::ZPRorFPR; + // For SplitSVEObjects remember that this stack slot is a predicate, this + // will be needed later when determining the frame layout. 
+ if (SlotTypes[FI] == SlotType::PPR) { + MFI.setStackID(FI, TargetStackID::ScalablePredVector); + HasPPRStackObjects = true; + } + } } if (HasFPRCSRs || HasFPRStackObjects) { @@ -3929,6 +4115,70 @@ void AArch64FrameLowering::determineStackHazardSlot( << StackHazardSize << "\n"); AFI->setStackHazardSlotIndex(ID); } + + // Determine if we should use SplitSVEObjects. This should only be used if + // there's a possibility of a stack hazard between PPRs and ZPRs or FPRs. + if (SplitSVEObjects) { + if (!HasPPRCSRs && !HasPPRStackObjects) { + LLVM_DEBUG( + dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n"); + return; + } + + if (!HasFPRCSRs && !HasFPRStackObjects) { + LLVM_DEBUG( + dbgs() + << "Not using SplitSVEObjects as no FPRs or ZPRs are on the stack\n"); + return; + } + + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) { + LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable " + "sized objects or realignment\n"); + return; + } + + if (arePPRsSpilledAsZPR(MF)) { + LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with " + "-aarch64-enable-zpr-predicate-spills\n"); + return; + } + + [[maybe_unused]] const AArch64Subtarget &Subtarget = + MF.getSubtarget<AArch64Subtarget>(); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Expected SVE to be available for PPRs"); + + // With SplitSVEObjects the CS hazard padding is placed between the + // PPRs and ZPRs. If there are any FPR CS there would be a hazard between + // them and the CS GPRs. Avoid this by promoting all FPR CS to ZPRs. 
+ BitVector FPRZRegs(SavedRegs.size()); + for (size_t Reg = 0, E = SavedRegs.size(); HasFPRCSRs && Reg < E; ++Reg) { + BitVector::reference RegBit = SavedRegs[Reg]; + if (!RegBit) + continue; + unsigned SubRegIdx = 0; + if (AArch64::FPR64RegClass.contains(Reg)) + SubRegIdx = AArch64::dsub; + else if (AArch64::FPR128RegClass.contains(Reg)) + SubRegIdx = AArch64::zsub; // TODO: Is this the right sub-register? + else + continue; + // Clear the bit for the FPR save. + RegBit = false; + // Mark that we should save the corresponding ZPR. + Register ZReg = + TRI->getMatchingSuperReg(Reg, SubRegIdx, &AArch64::ZPRRegClass); + FPRZRegs.set(ZReg); + } + SavedRegs |= FPRZRegs; + + // FIXME: Avoid setting the CC. Since we've replaced FPRs with ZPRs + // we need to set this or later in PEI the ZPR CS will be masked out. + AFI->setIsSVECC(true); + AFI->setSplitSVEObjects(true); + } } void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, @@ -4062,6 +4312,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::X18); } + // Determine if a Hazard slot should be used and where it should go. + // If SplitSVEObjects is used, the hazard padding is placed between the PPRs + // and ZPRs. Otherwise, it goes in the callee save area. + determineStackHazardSlot(MF, SavedRegs); + // Calculates the callee saved stack size. unsigned CSStackSize = 0; unsigned ZPRCSStackSize = 0; @@ -4086,10 +4341,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // only 64-bit GPRs can be added to SavedRegs. unsigned NumSavedRegs = SavedRegs.count(); - // Determine if a Hazard slot should be used, and increase the CSStackSize by - // StackHazardSize if so. - determineStackHazardSlot(MF, SavedRegs); - if (AFI->hasStackHazardSlotIndex()) + // If we have hazard padding in the CS area add that to the size. 
+ if (AFI->hasStackHazardSlotIndex() && !AFI->hasSplitSVEObjects()) CSStackSize += getStackHazardSize(MF); // Increase the callee-saved stack size if the function has streaming mode @@ -4282,7 +4535,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); // Create a hazard slot as we switch between GPR and FPR CSRs. - if (AFI->hasStackHazardSlotIndex() && + if (AFI->hasStackHazardSlotIndex() && !AFI->hasSplitSVEObjects() && (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) && AArch64InstrInfo::isFpOrNEON(Reg)) { assert(HazardSlotIndex == std::numeric_limits::max() && @@ -4321,7 +4574,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( } // Add hazard slot in the case where no FPR CSRs are present. - if (AFI->hasStackHazardSlotIndex() && + if (AFI->hasStackHazardSlotIndex() && !AFI->hasSplitSVEObjects() && HazardSlotIndex == std::numeric_limits::max()) { HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true); LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex @@ -4376,8 +4629,7 @@ static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, } static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, - AssignObjectOffsets AssignOffsets, - bool SplitSVEObjects) { + AssignObjectOffsets AssignOffsets) { MachineFrameInfo &MFI = MF.getFrameInfo(); auto *AFI = MF.getInfo(); @@ -4388,7 +4640,7 @@ static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, // are included in the SVE vector area. uint64_t &ZPRStackTop = SVEStack.ZPRStackSize; uint64_t &PPRStackTop = - SplitSVEObjects ? SVEStack.PPRStackSize : SVEStack.ZPRStackSize; + AFI->hasSplitSVEObjects() ? SVEStack.PPRStackSize : SVEStack.ZPRStackSize; #ifndef NDEBUG // First process all fixed stack objects. 
@@ -5438,10 +5690,12 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) { void AArch64FrameLowering::orderFrameObjects( const MachineFunction &MF, SmallVectorImpl &ObjectsToAllocate) const { - if (!OrderFrameObjects || ObjectsToAllocate.empty()) + const AArch64FunctionInfo &AFI = *MF.getInfo(); + + if ((!OrderFrameObjects && !AFI.hasSplitSVEObjects()) || + ObjectsToAllocate.empty()) return; - const AArch64FunctionInfo &AFI = *MF.getInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); std::vector FrameObjects(MFI.getObjectIndexEnd()); for (auto &Obj : ObjectsToAllocate) { @@ -5460,7 +5714,8 @@ void AArch64FrameLowering::orderFrameObjects( if (AFI.hasStackHazardSlotIndex()) { std::optional FI = getLdStFrameID(MI, MFI); if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) { - if (MFI.isScalableStackID(*FI) || AArch64InstrInfo::isFpOrNEON(MI)) + if (MFI.getStackID(*FI) == TargetStackID::ScalableVector || + AArch64InstrInfo::isFpOrNEON(MI)) FrameObjects[*FI].Accesses |= FrameObject::AccessFPR; else FrameObjects[*FI].Accesses |= FrameObject::AccessGPR; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 971a3c116027d..6ebef621795ea 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -57,7 +57,8 @@ class AArch64FrameLowering : public TargetFrameLowering { StackOffset resolveFrameOffsetReference(const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE, Register &FrameReg, - bool PreferFP, bool ForSimm) const; + bool PreferFP, bool ForSimm, + int64_t FI = -1) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef CSI, diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 4b0f5cdf4d4ff..f2a0a2f80887a 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ 
b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -137,6 +137,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { uint64_t StackSizeZPR = 0; uint64_t StackSizePPR = 0; + /// Are SVE objects (vectors and predicates) split into separate regions on + /// the stack. + bool SplitSVEObjects = false; + /// HasCalculatedStackSizeSVE indicates whether StackSizeZPR/PPR is valid. bool HasCalculatedStackSizeSVE = false; @@ -313,6 +317,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { return StackSizePPR; } + bool hasSplitSVEObjects() const { return SplitSVEObjects; } + void setSplitSVEObjects(bool s) { SplitSVEObjects = s; } + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } bool hasSVEStackSize() const { diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir new file mode 100644 index 0000000000000..e91aa1b6eae1d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir @@ -0,0 +1,526 @@ +# RUN: llc -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -o - | FileCheck %s --check-prefix=ASM +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -filetype=obj -o %t +# RUN: llvm-objdump --dwarf=frames %t | FileCheck %s --check-prefix=UNWINDINFO +# RUN: rm -rf %t +# +# Test allocation and deallocation of SVE objects on the stack with +# split-sve-objects (and hazard padding) enabled. 
This also tests using a +# combination of scalable and non-scalable offsets to access the SVE on the +# stack. +# +# With split-sve-objects (which implies hazard padding) the SVE area is split +# into PPR and ZPR areas with (fixed-size) hazard padding between them. The PPR +# area holds all scalable predicate callee saves and locals, and the ZPR area +# holds all scalable vector callee saves and locals. Additionally, any FPR +# callee save is promoted to a ZPR callee save (to avoid needing additional +# hazard padding in the callee save area). +# +# +-------------+ +# | stack arg | +# +-------------+ <- SP before call +# | Callee Saves| +# | Frame record| (if available) +# |-------------| <- FP (if available) +# | PPR area | +# |-------------| +# |/////////////| hazard padding +# |-------------| +# | ZPR area | +# +-------------+ +# | : | +# | Stack objs | +# | : | +# +-------------+ <- SP after call and frame-setup +# +--- | + + define void @test_allocate_split_sve() uwtable { entry: unreachable } + define void @test_allocate_split_sve_realigned() uwtable { entry: unreachable } + define void @test_address_split_sve() uwtable { entry: unreachable } + define void @test_address_split_sve_fp() uwtable { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_ppr_zpr() uwtable { entry: unreachable } + +... +# +----------+ +# |scratchreg| // x29 is used as scratch reg. 
+# |----------| +# | %stack.1 | // scalable predicate of n * 12 bytes, aligned to 16 bytes +# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes) +# |----------| +# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area +# |//////////| // Note: This is currently not included in the "stackSize" +# +----------+ +# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes) +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.2 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_allocate_split_sve +# CHECK: stackSize: 1056 + +# CHECK: bb.0.entry: +# CHECK: liveins: $z0, $p0, $fp +# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.4) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x08, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# +# CHECK-NEXT: $x8 = ADDXri $sp, 1040, 0 +# CHECK-NEXT: $x8 = ADDPL_XXI $x8, 7, implicit $vg +# CHECK-NEXT: STR_ZXI $z0, killed $x8, 0 :: (store () into %stack.0) +# CHECK-NEXT: $x8 = ADDXri $sp, 2064, 0 +# CHECK-NEXT: STR_PXI $p0, killed $x8, 18 :: (store () into %stack.1) +# +# CHECK-NEXT: $sp = frame-destroy 
ADDVL_XXI $sp, 2, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x08, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.4) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_allocate_split_sve: +# ASM: .cfi_def_cfa_offset 16 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM: sub sp, sp, #1024 +# ASM: .cfi_def_cfa_offset 1040 +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x08, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1040 + 8 * VG +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 24 * VG +# +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x08, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1056 + 8 * VG +# ASM: .cfi_def_cfa wsp, 1056 +# ASM: .cfi_def_cfa_offset 16 +# ASM: .cfi_def_cfa_offset 0 +# ASM: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, 
DW_OP_consts +1040, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +2080, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +2080, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +2080, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +1056, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: test_allocate_split_sve +stack: + - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 } + - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 } + - { id: 2, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + liveins: $z0, $p0 + STR_ZXI $z0, %stack.0, 0 :: (store () into %stack.0) + STR_PXI $p0, %stack.1, 0 :: (store () into %stack.1) + RET_ReallyLR +--- +... + +# Stack realignment is not supported with split-sve-objects, so we fallback to +# the default hazard padding implementation. This does not prevent hazards +# between ZPRs and PPRs (TODO: support this case). 
+# +# +----------+ +# | lr, fp | // frame record +# |----------| +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.1 | // scalable predicate of n * 12 bytes, aligned to 16 bytes +# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes) +# +----------+ +# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes) +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.2 | // not scalable +# +----------+ <- SP + +name: test_allocate_split_sve_realigned +stack: + - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 } + - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 } + - { id: 2, stack-id: default, size: 16, alignment: 32 } +body: | + bb.0.entry: + liveins: $z0, $p0 + STR_ZXI $z0, %stack.0, 0 :: (store () into %stack.0) + STR_PXI $p0, %stack.1, 0 :: (store () into %stack.1) + RET_ReallyLR + +# CHECK-LABEL: name: test_allocate_split_sve_realigned +# CHECK: stackSize: 2080 + +# CHECK: bb.0.entry: +# CHECK: liveins: $z0, $p0, $lr +# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 +# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5) +# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4) +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930 +# +# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 +# CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg +# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store () into %stack.0) +# CHECK-NEXT: 
$x8 = SUBXri $fp, 1024, 0 +# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store () into %stack.1) +# +# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040 +# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4) +# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5) +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_allocate_split_sve_realigned +# ASM: .cfi_def_cfa_offset 1040 +# ASM: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# +# ASM: .cfi_def_cfa wsp, 1040 +# ASM: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w30 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg30 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 +--- +... + +# +----------+ +# |scratchreg| // x29 is used as scratch reg. 
+# +----------+ +# | %stack.2 | // scalable predicate @ SP + 2064b + 46 scalable bytes +# |----------| +# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area +# |//////////| // Note: This is currently not included in the "stackSize" +# |----------| +# | %stack.0 | // scalable vector @ SP + 1040b + 16 scalable bytes +# | %stack.1 | // scalable vector @ SP + 1040b +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.3 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_address_split_sve +# CHECK: stackSize: 1056 + +# CHECK: bb.0.entry: +# CHECK-NEXT: liveins: +# CHECK-NEXT: {{ $}} +# CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x08, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# +# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0 +# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], 1 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0 +# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], 0 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 2064, 0 +# CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 23 +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg +# CHECK-NEXT: 
frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x08, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_address_split_sve +# ASM: .cfi_def_cfa_offset 16 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM: .cfi_def_cfa_offset 1040 +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x08, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1040 + 8 * VG +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 24 * VG +# +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x08, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1056 + 8 * VG +# ASM: .cfi_def_cfa wsp, 1056 +# ASM: .cfi_def_cfa_offset 16 +# ASM: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +1040, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, 
DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +2080, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +2080, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +2080, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +1056, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: test_address_split_sve +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + liveins: $z0, $z1, $p0 + + STR_ZXI $z0, %stack.0, 0 :: (store () into %stack.0) + STR_ZXI $z1, %stack.1, 0 :: (store () into %stack.1) + STR_PXI $p0, %stack.2, 0 :: (store () into %stack.2) + + RET_ReallyLR +--- +... 
+# +----------+ +# | lr, fp | // frame record +# +----------+ <- FP +# | %stack.2 | // scalable predicate @ FP - 2 scalable bytes +# |----------| +# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area +# |//////////| // Note: This is currently not included in the "stackSize" +# |----------| +# | %stack.0 | // scalable vector @ FP - 1024b - 32 scalable bytes +# | %stack.1 | // scalable vector @ FP - 1024b - 48 scalable bytes +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.3 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_address_split_sve_fp +# CHECK: stackSize: 1056 +# +# CHECK: bb.0.entry: +# CHECK-NEXT: liveins: +# CHECK-NEXT: {{ $}} +# CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.6), (store (s64) into %stack.5) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# +# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0 +# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], -2 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0 +# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], -3 +# CHECK-NEXT: STR_PXI $p0, $fp, -1 +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp, $lr = 
frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.6), (load (s64) from %stack.5) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_address_split_sve_fp +# ASM: .cfi_def_cfa_offset 16 +# ASM: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# +# ASM: .cfi_def_cfa wsp, 16 +# ASM: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w30 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg30 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: test_address_split_sve_fp +frameInfo: + maxAlignment: 16 + isFrameAddressTaken: true +stack: + - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + liveins: $z0, $z1, $p0 + + STR_ZXI $z0, %stack.0, 0 :: (store () into %stack.0) + STR_ZXI $z1, %stack.1, 0 :: (store () into %stack.1) + STR_PXI $p0, %stack.2, 0 :: (store () into %stack.2) + + RET_ReallyLR +--- +... 
+# CHECK-LABEL: name: save_restore_ppr_zpr +# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.8) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: frame-setup STR_PXI killed $p6, $sp, 5 :: (store (s16) into %stack.7) +# CHECK-NEXT: frame-setup STR_PXI killed $p5, $sp, 6 :: (store (s16) into %stack.6) +# CHECK-NEXT: frame-setup STR_PXI killed $p4, $sp, 7 :: (store (s16) into %stack.5) +# +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x08, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x08, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: frame-setup STR_ZXI killed $z10, $sp, 0 :: (store (s128) into %stack.4) +# CHECK-NEXT: frame-setup STR_ZXI killed $z9, $sp, 1 :: (store (s128) into %stack.3) +# CHECK-NEXT: frame-setup STR_ZXI killed $z8, $sp, 2 :: (store (s128) into %stack.2) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x49, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4a, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1056, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# +# +# CHECK: $sp = 
frame-destroy ADDXri $sp, 1056, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x08, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4) +# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3) +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.2) +# +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10 +# CHECK-NEXT: $p6 = frame-destroy LDR_PXI $sp, 5 :: (load (s16) from %stack.7) +# CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 6 :: (load (s16) from %stack.6) +# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 :: (load (s16) from %stack.5) +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.8) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: save_restore_ppr_zpr: +# ASM: .cfi_def_cfa_offset 16 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x08, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1040 + 8 * VG +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 
0x00, 0x11, 0x90, 0x08, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1040 + 32 * VG +# ASM: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 16 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1040 - 24 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1040 - 32 * VG +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2096 + 32 * VG +# +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x08, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1040 + 32 * VG +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +# ASM-NEXT: .cfi_restore z8 +# ASM-NEXT: .cfi_restore z9 +# ASM-NEXT: .cfi_restore z10 +# ASM: .cfi_def_cfa wsp, 16 +# ASM: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +1040, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +1040, DW_OP_plus, DW_OP_consts +32, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_expression: reg72 DW_OP_consts -1040, DW_OP_plus, DW_OP_consts -16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_expression: reg73 DW_OP_consts -1040, DW_OP_plus, DW_OP_consts -24, DW_OP_bregx 0x2e +0, DW_OP_mul, 
DW_OP_plus +# UNWINDINFO: DW_CFA_expression: reg74 DW_OP_consts -1040, DW_OP_plus, DW_OP_consts -32, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +2096, DW_OP_plus, DW_OP_consts +32, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +1040, DW_OP_plus, DW_OP_consts +32, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +32, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg104 +# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg105 +# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg106 +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: save_restore_ppr_zpr +stack: + - { id: 0, stack-id: default, size: 32, alignment: 16 } +body: | + bb.0.entry: + + $p4 = IMPLICIT_DEF + $p5 = IMPLICIT_DEF + $p6 = IMPLICIT_DEF + $z8 = IMPLICIT_DEF + $z9 = IMPLICIT_DEF + $z10 = IMPLICIT_DEF + + RET_ReallyLR \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir index bff0cacfd5190..65b7793597c5d 100644 --- a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir +++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-zpr-predicate-spills -run-pass=greedy %s -o - | FileCheck %s # RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-zpr-predicate-spills -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs 
%s -o - | FileCheck %s --check-prefix=EXPAND + --- | source_filename = "" target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -983,26 +984,22 @@ body: | ; EXPAND-LABEL: name: zpr_predicate_spill_p4_saved ; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p8, $p4 ; EXPAND-NEXT: {{ $}} - ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 - ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.3) + ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2) ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0 - ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.2) + ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.1) ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0 - ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.1) - ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 + ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0) ; ; EXPAND-NEXT: $p8 = IMPLICIT_DEF ; - ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 - ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.2) + ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.1) ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv - ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.1) + ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.0) ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg - ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.3) - ; EXPAND-NEXT: $sp = frame-destroy 
ADDXri $sp, 1040, 0 + ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2) ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3 ; If we spill a register above p8, p4 must also be saved, so we can guarantee diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll new file mode 100644 index 0000000000000..c0b233c55a5c2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll @@ -0,0 +1,751 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT + +; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 + +; +; %ppr_local sp+2048+30*vscale (= #15, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding sp+2048+16*vscale +; sp+1024+16*vscale +; %zpr_local sp+1024 +; +; -> sp +define void @zpr_and_ppr_local( %pred, %vector) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: zpr_and_ppr_local: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x90, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x8, sp, #2048 +; CHECK-NEXT: str p0, [x8, #15, mul vl] +; CHECK-NEXT: add x8, sp, #1024 +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca + %zpr_local = alloca + store volatile %pred, ptr %ppr_local + store volatile %vector, ptr %zpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 + +; +; -> fp +; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding fp-16*vscale +; fp-1024-16*vscale +; %zpr_local fp-1024-32*vscale (= #-2, mul vl for str/ldr ZPR) +; +; -> sp +define void @zpr_and_ppr_local_fp( %pred, %vector) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK-LABEL: zpr_and_ppr_local_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-2, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca + %zpr_local = alloca + store volatile %pred, ptr %ppr_local + store volatile %vector, ptr %zpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024 + +; +; %ppr_local sp+2064+14*vscale (= #7, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding sp+2064 +; sp+1040 +; %fpr_local sp+1032 +; 8 bytes of padding sp+1024 +; +; -> sp +define void @fpr_and_ppr_local( %pred, double %double) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: fpr_and_ppr_local: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x8, sp, #2064 +; CHECK-NEXT: str p0, [x8, #7, mul vl] +; CHECK-NEXT: str d0, [sp, #1032] +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca + %fpr_local = alloca double + store volatile %pred, ptr %ppr_local + store volatile double %double, ptr %fpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024 + +; +; -> fp +; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding +; +; %fpr_local sp+1032 +; 8 bytes of padding sp+1024 +; +; -> sp +define void @fpr_and_ppr_local_fp( %pred, double %double) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK-LABEL: fpr_and_ppr_local_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str d0, [sp, #1032] +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca + %fpr_local = alloca double + store volatile %pred, ptr %ppr_local + store volatile double %double, ptr %fpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8 + +; +; %ppr_local sp+2064+30*vscale (= #15, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding +; sp+1040+16*vscale +; sp+1040 +; sp+16 +; %gpr_local sp+8 +; 8 bytes of padding +; -> sp +define void @gpr_and_ppr_local( %pred, i64 %int) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: gpr_and_ppr_local: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xa0, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2080 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 16 * VG +; CHECK-NEXT: add x8, sp, #2064 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x8, #15, mul vl] +; CHECK-NEXT: str x0, [sp, #8] +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed + %ppr_local = alloca + %gpr_local = alloca i64 + store volatile %pred, ptr %ppr_local + store volatile i64 %int, ptr %gpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8 + +; +; -> fp +; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding +; +; +; +; %gpr_local sp+8 +; 8 bytes of padding +; -> sp +define void @gpr_and_ppr_local_fp( %pred, i64 %int) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK-LABEL: 
gpr_and_ppr_local_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 16 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str x0, [sp, #8] +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed + %ppr_local = alloca + %gpr_local = alloca i64 + store volatile %pred, ptr %ppr_local + store volatile i64 %int, ptr %gpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; 
CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; 
CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-320 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-320 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2088-320 x vscale], Type: Variable, Align: 8, Size: 8 + +; +; +; %ppr_local sp+2080+286*vscale (addvl #17, addpl #7) +; 14 * vscale bytes of padding sp+2080+272*vscale +; sp+1056+272*vscale +; sp+1056+16*vscale +; %zpr_local sp+1056 +; %fpr_local sp+1048 +; 8 bytes of padding sp+1040 +; sp+16 +; %gpr_local sp+8 +; 8 bytes of padding sp +; -> sp +define void @all_stack_areas( %pred, double %fp) { +; CHECK-LABEL: all_stack_areas: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #-1 +; 
CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2096 + 160 * VG +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1040 - 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1040 - 40 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1040 - 48 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1040 - 56 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xf0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1040 - 64 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x11, 0xf0, 0x77, 0x22, 0x11, 0xb8, 0x7f, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1040 - 72 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x11, 0xf0, 0x77, 0x22, 0x11, 0xb0, 0x7f, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1040 - 80 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x11, 0xf0, 0x77, 0x22, 0x11, 0xa8, 0x7f, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1040 - 88 * VG +; CHECK-NEXT: add x0, sp, #2080 +; CHECK-NEXT: add x8, sp, #2080 +; CHECK-NEXT: add x1, sp, #1056 +; CHECK-NEXT: addvl x0, x0, #17 +; CHECK-NEXT: add x2, sp, #1048 +; CHECK-NEXT: add x3, sp, #8 +; CHECK-NEXT: addpl x0, x0, #7 +; CHECK-NEXT: str d0, [sp, #1048] +; CHECK-NEXT: str p0, [x8, #143, mul vl] +; CHECK-NEXT: bl foo +; CHECK-NEXT: add sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded 
Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca + %zpr_local = alloca + %fpr_local = alloca double + ; // Needed to sort %fpr_local into the FPR region + store double %fp, ptr %fpr_local + ; // Needed to sort %ppr_local into the PPR region + store %pred, ptr 
%ppr_local + %gpr_local = alloca i64 + call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local) + ret void +} +declare void @foo(ptr, ptr, ptr, ptr) + +; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-96 x vscale], Type: Spill, Align: 16, Size: vscale x 
16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1064-320 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2096-320 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2104-320 x vscale], Type: Variable, Align: 8, Size: 8 + +; +; -> fp +; fp-32*vscale +; %ppr_local fp-34*vscale (addpl #-17) +; 14 * vscale bytes of padding fp-48*vscale +; fp-1024-48*vscale +; fp-1024-304*vscale +; %zpr_local fp-1024-320*vscale (addvl #-20) +; %fpr_local sp+1048 +; 8 bytes of padding sp+1040 +; sp+16 +; %gpr_local sp+8 +; 8 bytes
of padding sp +; -> sp +define void @all_stack_areas_fp( %pred, double %fp) "frame-pointer"="all" { +; CHECK-LABEL: all_stack_areas_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte 
Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w28, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xe0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1056 - 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xe0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1056 - 40 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xe0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1056 - 48 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xe0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1056 - 56 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xe0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1056 - 64 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x11, 0xe0, 0x77, 0x22, 0x11, 0xb8, 0x7f, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1056 - 72 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x11, 0xe0, 0x77, 0x22, 0x11, 0xb0, 0x7f, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1056 - 80 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x11, 0xe0, 0x77, 0x22, 0x11, 0xa8, 0x7f, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1056 - 88 * VG +; CHECK-NEXT: sub x1, x29, #1024 +; CHECK-NEXT: addpl x0, x29, #-17 +; CHECK-NEXT: add x2, sp, #1048 +; CHECK-NEXT: addvl x1, x1, #-20 +; CHECK-NEXT: add x3, sp, #8 +; CHECK-NEXT: str d0, [sp, #1048] +; CHECK-NEXT: str p0, [x29, #-17, mul vl] +; CHECK-NEXT: bl foo +; CHECK-NEXT: add sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte 
Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca + %zpr_local = 
alloca + %fpr_local = alloca double + ; // Needed to sort %fpr_local into the FPR region + store double %fp, ptr %fpr_local + ; // Needed to sort %ppr_local into the PPR region + store %pred, ptr %ppr_local + %gpr_local = alloca i64 + call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local) + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: svecc_call +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; 
CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-48 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1072-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2096-288 x vscale], Type: Variable, Align: 16, Size: 1024 + +define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: svecc_call: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w27, -16 +; CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 16 * VG +; CHECK-NEXT: addvl sp, sp, #-16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; 
CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1072 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1072 - 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1072 - 40 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1072 - 48 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1072 - 56 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1072 - 64 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x11, 0xd0, 0x77, 0x22, 0x11, 0xb8, 0x7f, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1072 - 72 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x11, 0xd0, 0x77, 0x22, 0x11, 0xb0, 0x7f, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 80 * VG +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 
0x1e, 0x22 // sp + 2096 + 144 * VG +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: tbz w19, #0, .LBB8_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB8_2: // %entry +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov w1, #45 // =0x2d +; CHECK-NEXT: mov w2, #37 // =0x25 +; CHECK-NEXT: bl memset +; CHECK-NEXT: tbz w19, #0, .LBB8_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB8_4: // %entry +; CHECK-NEXT: mov w0, #22647 // =0x5877 +; CHECK-NEXT: movk w0, #59491, lsl #16 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_escape 0x0f, 
0x0d, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 144 * VG +; CHECK-NEXT: addvl sp, sp, #16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: .cfi_restore z12 +; CHECK-NEXT: .cfi_restore z13 +; CHECK-NEXT: .cfi_restore z14 +; CHECK-NEXT: .cfi_restore z15 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: .cfi_def_cfa wsp, 48 +; CHECK-NEXT: ldp x27, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w27 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} +declare ptr @memset(ptr, i32, i32) diff 
--git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index 3a33405200132..6421f94c38d51 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK,CHECK0 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-NOSPLITSVE +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-split-sve-objects -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-SPLITSVE define i32 @basic(i32 noundef %num) { ; CHECK-LABEL: basic: @@ -1497,72 +1498,24 @@ define [2 x ] @sve_signature_pred_2xv4i1([2 x } define [2 x ] @sve_signature_pred_2xv4i1_caller([2 x ] %arg1, [2 x ] %arg2) nounwind "aarch64_pstate_sm_compatible" { -; CHECK0-LABEL: sve_signature_pred_2xv4i1_caller: -; CHECK0: // %bb.0: -; CHECK0-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK0-NEXT: addvl sp, sp, #-1 -; CHECK0-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: mov p5.b, p0.b -; CHECK0-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: mov p4.b, p1.b -; CHECK0-NEXT: mov p0.b, p2.b -; CHECK0-NEXT: mov p1.b, p3.b -; CHECK0-NEXT: mov p2.b, p5.b -; CHECK0-NEXT: mov p3.b, p4.b -; CHECK0-NEXT: bl sve_signature_pred_2xv4i1 -; CHECK0-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: addvl sp, sp, #1 -; CHECK0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK0-NEXT: ret -; -; CHECK64-LABEL: sve_signature_pred_2xv4i1_caller: -; CHECK64: // %bb.0: -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: addvl sp, sp, #-1 -; CHECK64-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: sub sp, sp, #64 -; CHECK64-NEXT: mov p4.b, p1.b -; CHECK64-NEXT: mov p5.b, p0.b -; CHECK64-NEXT: mov p0.b, p2.b -; CHECK64-NEXT: mov p1.b, p3.b -; CHECK64-NEXT: mov p2.b, p5.b -; CHECK64-NEXT: mov p3.b, p4.b -; CHECK64-NEXT: bl sve_signature_pred_2xv4i1 -; CHECK64-NEXT: add sp, sp, #64 -; CHECK64-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 -; CHECK64-NEXT: ret -; -; CHECK1024-LABEL: sve_signature_pred_2xv4i1_caller: -; CHECK1024: // %bb.0: -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #7, mul vl] // 2-byte 
Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov p4.b, p1.b -; CHECK1024-NEXT: mov p5.b, p0.b -; CHECK1024-NEXT: mov p0.b, p2.b -; CHECK1024-NEXT: mov p1.b, p3.b -; CHECK1024-NEXT: mov p2.b, p5.b -; CHECK1024-NEXT: mov p3.b, p4.b -; CHECK1024-NEXT: bl sve_signature_pred_2xv4i1 -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK-LABEL: sve_signature_pred_2xv4i1_caller: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p5.b, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p4.b, p1.b +; CHECK-NEXT: mov p0.b, p2.b +; CHECK-NEXT: mov p1.b, p3.b +; CHECK-NEXT: mov p2.b, p5.b +; CHECK-NEXT: mov p3.b, p4.b +; CHECK-NEXT: bl sve_signature_pred_2xv4i1 +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret %res = call [2 x ] @sve_signature_pred_2xv4i1([2 x ] %arg2, [2 x ] %arg1) ret [2 x ] %res } @@ -2102,136 +2055,268 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1072 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1072 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; 
CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w27, -16 -; CHECK1024-NEXT: .cfi_offset w28, -24 -; CHECK1024-NEXT: .cfi_offset w30, -40 -; CHECK1024-NEXT: .cfi_offset w29, -48 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded 
Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1072 - 8 * VG -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1072 - 16 * VG -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1072 - 24 * VG -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1072 - 32 * VG -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1072 - 40 * VG -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1072 - 48 * VG -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1072 - 56 * VG -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 64 * VG -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2096 + 144 * VG -; CHECK1024-NEXT: mov x8, x0 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; 
CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: and x19, x0, #0x1 -; CHECK1024-NEXT: .cfi_offset vg, -32 -; CHECK1024-NEXT: tbz w19, #0, .LBB28_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB28_2: // %entry -; CHECK1024-NEXT: mov x0, x8 -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: mov w2, #37 // =0x25 -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w19, #0, .LBB28_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB28_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; 
CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #18 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1072 -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: ldr x19, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1072 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1072 +; 
CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -48 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: 
str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1072 - 8 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1072 - 16 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1072 - 24 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1072 - 32 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1072 - 40 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 
0x10, 0x4d, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1072 - 48 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1072 - 56 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 64 * VG +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2096 + 144 * VG +; CHECK1024-NOSPLITSVE-NEXT: mov x8, x0 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: and x19, x0, #0x1 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -32 +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB28_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25 +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB28_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 
2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1072 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 48 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x27, x19, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -40 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -48 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 16 * VG +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 
// sp + 1072 + 144 * VG +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1072 - 24 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1072 - 32 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1072 - 40 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1072 - 48 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 
0x22 // $d12 @ cfa - 1072 - 56 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1072 - 64 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x11, 0xd0, 0x77, 0x22, 0x11, 0xb8, 0x7f, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1072 - 72 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x11, 0xd0, 0x77, 0x22, 0x11, 0xb0, 0x7f, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 80 * VG +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2096 + 144 * VG +; CHECK1024-SPLITSVE-NEXT: mov x8, x0 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: and x19, x0, #0x1 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -32 +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB28_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, x8 +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25 +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB28_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x08, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 1072 + 144 * VG +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte 
Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 144 * VG +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul 
vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 48 +; CHECK1024-SPLITSVE-NEXT: ldp x27, x19, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret entry: tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) @@ -2487,135 +2572,266 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, Date: Wed, 4 Jun 2025 10:11:01 +0000 Subject: [PATCH 2/5] Remove default --- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 3 ++- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 2 +- llvm/lib/Target/AArch64/AArch64FrameLowering.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git 
a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 36f3a670808d4..3f9df81605d56 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1556,7 +1556,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference( MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg, /*PreferFP=*/false, - /*ForSimm=*/true); + /*ForSimm=*/true, + /*FI=*/-1); Register SrcReg = FrameReg; if (FrameRegOffset) { // Use output register as temporary. diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 87d065b8d07e7..b0c0f0fab5655 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -5323,7 +5323,7 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, Register Reg; FrameRegOffset = TFI->resolveFrameOffsetReference( *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg, - /*PreferFP=*/false, /*ForSimm=*/true); + /*PreferFP=*/false, /*ForSimm=*/true, /*FI=*/-1); FrameReg = Reg; FrameRegUpdate = std::nullopt; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 6ebef621795ea..02288f0bbdac2 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -58,7 +58,7 @@ class AArch64FrameLowering : public TargetFrameLowering { int64_t ObjectOffset, bool isFixed, bool isSVE, Register &FrameReg, bool PreferFP, bool ForSimm, - int64_t FI = -1) const; + int64_t FI) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef CSI, From ca74899e2786f8801558ff1931d458a6751abcd9 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 5 Jun 2025 15:37:07 +0000 Subject: [PATCH 3/5] Tidy 
PPR/ZPR check Change-Id: I30e2cf5ea7a1df932f145e685a3fbd39cd974d4d --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index b0c0f0fab5655..2b2082d44f827 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -4088,12 +4088,13 @@ void AArch64FrameLowering::determineStackHazardSlot( std::optional FI = getLdStFrameID(MI, MFI); if (!FI || FI < 0 || FI > int(SlotTypes.size())) continue; - bool IsScalable = MFI.isScalableStackID(*FI); - bool IsPPR = IsScalable && isPPRAccess(MI); - if (IsScalable || AArch64InstrInfo::isFpOrNEON(MI)) { - SlotTypes[*FI] |= IsPPR ? SlotType::PPR : SlotType::ZPRorFPR; + if (MFI.isScalableStackID(*FI)) { + SlotTypes[*FI] |= + isPPRAccess(MI) ? SlotType::PPR : SlotType::ZPRorFPR; } else { - SlotTypes[*FI] |= SlotType::GPR; + SlotTypes[*FI] |= AArch64InstrInfo::isFpOrNEON(MI) + ? SlotType::ZPRorFPR + : SlotType::GPR; } } } From 7ac674c4f1f0b65934d9894e06abcb810d10aed7 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 26 Jun 2025 09:18:49 +0000 Subject: [PATCH 4/5] Rename StackID Change-Id: I5c7b5c731ef030fbd8580ae8e55e896f9927741c --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 2b2082d44f827..4694e96430167 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -4104,7 +4104,7 @@ void AArch64FrameLowering::determineStackHazardSlot( // For SplitSVEObjects remember that this stack slot is a predicate, this // will be needed later when determining the frame layout. 
if (SlotTypes[FI] == SlotType::PPR) { - MFI.setStackID(FI, TargetStackID::ScalablePredVector); + MFI.setStackID(FI, TargetStackID::ScalablePredicateVector); HasPPRStackObjects = true; } } From 18e44c24ef010ca5c59f69410d9ff292685d054e Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 9 Jul 2025 12:05:52 +0000 Subject: [PATCH 5/5] Rebase Change-Id: I1208aebeabbc4df1c81f1c8f5a222688c28ef985 --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 4694e96430167..97765c08522fd 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -370,11 +370,6 @@ static StackOffset getSVEStackSize(const MachineFunction &MF) { return getZPRStackSize(MF) + getPPRStackSize(MF); } -static bool hasSVEStackSize(const MachineFunction &MF) { - const AArch64FunctionInfo *AFI = MF.getInfo(); - return AFI->getStackSizeZPR() > 0 || AFI->getStackSizePPR() > 0; -} - /// Returns true if PPRs are spilled as ZPRs. static bool arePPRsSpilledAsZPR(const MachineFunction &MF) { return MF.getSubtarget().getRegisterInfo()->getSpillSize(