From b67a6dddb7cc6d1a1bb08df8e71891e11643ddbd Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Tue, 17 Jun 2025 22:17:23 -0700 Subject: [PATCH] [AMDGPU] Examine instructions in pending queues during scheduling Examine instructions in the pending queue when scheduling. This makes instructions visible to scheduling heuristics even when they aren't immediately issuable due to hardware resource constraints. The scheduler has two hardware resource modeling modes: an in-order mode where instructions must be ready to issue before scheduling, and out-of-order models where instructions are always visible to heuristics. Special handling exists for unbuffered processor resources in out-of-order models. These resources can cause pipeline stalls when used back-to-back, so they're typically avoided. However, for AMDGPU targets, managing register pressure and reducing spilling is critical enough to justify exceptions to this approach. This change enables examination of instructions that can't be immediately issued because they use an already occupied unbuffered resource. By making these instructions visible to scheduling heuristics anyway, we gain more flexibility in scheduling decisions, potentially allowing better register pressure and hardware resource management. 
--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 170 +- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 21 +- .../AMDGPU/gfx-callable-return-types.ll | 4 +- .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 1858 +++++----- .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 780 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 328 +- .../AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir | 8 +- .../AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir | 4 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 116 +- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 832 ++--- ...ine-scheduler-sink-trivial-remats-attr.mir | 6 +- .../CodeGen/AMDGPU/memintrinsic-unroll.ll | 3214 ++++++++--------- .../AMDGPU/mfma-no-register-aliasing.ll | 102 +- .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 101 +- .../CodeGen/AMDGPU/sched-barrier-pre-RA.mir | 30 +- .../sched-group-barrier-pipeline-solver.mir | 16 +- .../AMDGPU/sched-group-barrier-pre-RA.mir | 44 +- 17 files changed, 3888 insertions(+), 3746 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fce8f36d45969..35886eb04c711 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -68,6 +68,14 @@ static cl::opt GCNTrackers( cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false)); +static cl::opt ExaminePendingQueue( + "amdgpu-examine-pending-queue", cl::Hidden, + cl::desc( + "Examine instructions in the pending the pending queue when " + "scheduling. 
This makes instructions visible to heuristics that cannot " + "immediately be issued due to hardware resource constraints."), + cl::init(true)); + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) @@ -319,17 +327,45 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, } } +static bool shouldCheckPending(SchedBoundary &Zone, + const TargetSchedModel *SchedModel) { + const unsigned ReadyListLimit = 256; + bool HasBufferedModel = + SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize(); + return ExaminePendingQueue && + Zone.Available.size() + Zone.Pending.size() <= ReadyListLimit && + HasBufferedModel; +} + +static SUnit *pickOnlyChoice(SchedBoundary &Zone, + const TargetSchedModel *SchedModel) { + if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty()) + return Zone.pickOnlyChoice(); + return nullptr; +} + +#ifndef NDEBUG +void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current, + const SchedCandidate &Preferred) { + LLVM_DEBUG(dbgs() << "Prefer:\t\t"; DAG->dumpNode(*Preferred.SU)); + if (Current.SU) + LLVM_DEBUG(dbgs() << "Not:\t"; DAG->dumpNode(*Current.SU)); + LLVM_DEBUG(dbgs() << "Reason:\t\t"; traceCandidate(Preferred)); +} +#endif + // This function is mostly cut and pasted from // GenericScheduler::pickNodeFromQueue() void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand, + SchedCandidate &Cand, bool &IsPending, bool IsBottomUp) { const SIRegisterInfo *SRI = static_cast(TRI); ArrayRef Pressure = RPTracker.getRegSetPressureAtPos(); unsigned SGPRPressure = 0; unsigned VGPRPressure = 0; + IsPending = false; if (DAG->isTrackingPressure()) { if (!GCNTrackers) { SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; @@ -342,8 +378,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, VGPRPressure = 
T->getPressure().getArchVGPRNum(); } } - ReadyQueue &Q = Zone.Available; - for (SUnit *SU : Q) { + LLVM_DEBUG(dbgs() << "Available Q:\n"); + ReadyQueue &AQ = Zone.Available; + for (SUnit *SU : AQ) { SchedCandidate TryCand(ZonePolicy); initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, @@ -355,27 +392,59 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, // Initialize resource delta if needed in case future heuristics query it. if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(Zone.DAG, SchedModel); + LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); Cand.setBest(TryCand); - LLVM_DEBUG(traceCandidate(Cand)); } +#ifndef NDEBUG + else + printCandidateDecision(TryCand, Cand); +#endif + } + + if (!shouldCheckPending(Zone, SchedModel)) + return; + + LLVM_DEBUG(dbgs() << "Pending Q:\n"); + ReadyQueue &PQ = Zone.Pending; + for (SUnit *SU : PQ) { + + SchedCandidate TryCand(ZonePolicy); + initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, + VGPRPressure, IsBottomUp); + // Pass SchedBoundary only when comparing nodes from the same boundary. + SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; + tryPendingCandidate(Cand, TryCand, ZoneArg); + if (TryCand.Reason != NoCand) { + // Initialize resource delta if needed in case future heuristics query it. + if (TryCand.ResDelta == SchedResourceDelta()) + TryCand.initResourceDelta(Zone.DAG, SchedModel); + LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); + IsPending = true; + Cand.setBest(TryCand); + } +#ifndef NDEBUG + else + printCandidateDecision(TryCand, Cand); +#endif } } // This function is mostly cut and pasted from // GenericScheduler::pickNodeBidirectional() -SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { +SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode, + bool &PickedPending) { // Schedule as far as possible in the direction of no choice. 
This is most // efficient, but also provides the best heuristics for CriticalPSets. - if (SUnit *SU = Bot.pickOnlyChoice()) { + if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) { IsTopNode = false; return SU; } - if (SUnit *SU = Top.pickOnlyChoice()) { + if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) { IsTopNode = true; return SU; } - // Set the bottom-up policy based on the state of the current bottom zone and - // the instructions outside the zone, including the top zone. + // Set the bottom-up policy based on the state of the current bottom zone + // and the instructions outside the zone, including the top zone. CandPolicy BotPolicy; setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); // Set the top-down policy based on the state of the current top zone and @@ -383,12 +452,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { CandPolicy TopPolicy; setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); + bool BotPending = false; // See if BotCand is still valid (because we previously scheduled from Top). LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); if (!BotCand.isValid() || BotCand.SU->isScheduled || BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand, + BotPending, /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find the first candidate"); } else { @@ -398,6 +469,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { SchedCandidate TCand; TCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, + BotPending, /*IsBottomUp=*/true); assert(TCand.SU == BotCand.SU && "Last pick result should correspond to re-picking right now"); @@ -405,12 +477,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { #endif } + bool TopPending = false; // Check if the top Q has a better candidate. 
LLVM_DEBUG(dbgs() << "Picking from Top:\n"); if (!TopCand.isValid() || TopCand.SU->isScheduled || TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand, + TopPending, /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find the first candidate"); } else { @@ -420,6 +494,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { SchedCandidate TCand; TCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, + TopPending, /*IsBottomUp=*/false); assert(TCand.SU == TopCand.SU && "Last pick result should correspond to re-picking right now"); @@ -430,12 +505,21 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { // Pick best from BotCand and TopCand. LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); dbgs() << "Bot Cand: "; traceCandidate(BotCand);); - SchedCandidate Cand = BotCand; - TopCand.Reason = NoCand; - tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { - Cand.setBest(TopCand); + SchedCandidate Cand = BotPending ? TopCand : BotCand; + SchedCandidate TryCand = BotPending ? 
BotCand : TopCand; + PickedPending = BotPending && TopPending; + + TryCand.Reason = NoCand; + if (BotPending || TopPending) { + PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr); + } else { + tryCandidate(Cand, TryCand, nullptr); } + + if (TryCand.Reason != NoCand) { + Cand.setBest(TryCand); + } + LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); IsTopNode = Cand.AtTop; @@ -450,35 +534,46 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); return nullptr; } + bool PickedPending; SUnit *SU; do { + PickedPending = false; if (RegionPolicy.OnlyTopDown) { - SU = Top.pickOnlyChoice(); + SU = pickOnlyChoice(Top, SchedModel); if (!SU) { CandPolicy NoPolicy; TopCand.reset(NoPolicy); pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand, + PickedPending, /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find a candidate"); SU = TopCand.SU; } IsTopNode = true; } else if (RegionPolicy.OnlyBottomUp) { - SU = Bot.pickOnlyChoice(); + SU = pickOnlyChoice(Bot, SchedModel); if (!SU) { CandPolicy NoPolicy; BotCand.reset(NoPolicy); pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand, + PickedPending, /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find a candidate"); SU = BotCand.SU; } IsTopNode = false; } else { - SU = pickNodeBidirectional(IsTopNode); + SU = pickNodeBidirectional(IsTopNode, PickedPending); } } while (SU->isScheduled); + if (PickedPending) { + unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle; + SchedBoundary &Zone = IsTopNode ? 
Top : Bot; + Zone.bumpCycle(ReadyCycle); + Zone.releasePending(); + } + if (SU->isTopReady()) Top.removeReady(SU); if (SU->isBottomReady()) @@ -524,6 +619,47 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const { return *std::next(CurrentStage); } +bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) const { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Bias PhysReg Defs and copies to their uses and defined respectively. + if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) + return TryCand.Reason != NoCand; + + // Avoid exceeding the target's limit. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, + RegExcess, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + // Avoid increasing the max critical pressure in the scheduled region. 
+ if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, RegCritical, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + bool SameBoundary = Zone != nullptr; + if (SameBoundary) { + TryCand.initResourceDelta(DAG, SchedModel); + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return TryCand.Reason != NoCand; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, TryCand, Cand, + ResourceDemand)) + return TryCand.Reason != NoCand; + } + + return false; +} + GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C, bool IsLegacyScheduler) : GCNSchedStrategy(C) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 94cd795bbc8f6..c78835c8d5a77 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -44,17 +44,34 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID); /// heuristics to determine excess/critical pressure sets. class GCNSchedStrategy : public GenericScheduler { protected: - SUnit *pickNodeBidirectional(bool &IsTopNode); + SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending); void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand, bool IsBottomUp); + SchedCandidate &Cand, bool &IsPending, + bool IsBottomUp); void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp); + /// Evaluates instructions in the pending queue using a subset of scheduling + /// heuristics. 
+ /// + /// Instructions that cannot be issued due to hardware constraints are placed + /// in the pending queue rather than the available queue, making them normally + /// invisible to scheduling heuristics. However, in certain scenarios (such as + /// avoiding register spilling), it may be beneficial to consider scheduling + /// these not-yet-ready instructions. + bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const; + +#ifndef NDEBUG + void printCandidateDecision(const SchedCandidate &Current, + const SchedCandidate &Preferred); +#endif + std::vector Pressure; std::vector MaxPressure; diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 668219875db72..86505107587f1 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -947,6 +947,7 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1020 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1028 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2044 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2040 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2036 @@ -1201,7 +1202,6 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1040 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1036 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1032 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1028 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1024 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1016 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1012 @@ -1466,6 +1466,7 
@@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1020 +; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1028 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2044 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2040 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2036 @@ -1720,7 +1721,6 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1040 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1036 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1032 -; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1028 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1024 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1016 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1012 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index aad6e031aa9ed..ac91dadc07995 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -6,1145 +6,1149 @@ define amdgpu_kernel void @largeInterleave() #0 { ret void } ; GCN-LABEL: largeInterleave: ; GCN: ; %bb.0: + ; GCN-NEXT: ; implicit-def: $sgpr17 + ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: ; implicit-def: $vgpr66 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $vgpr0 - ; GCN-NEXT: ; implicit-def: $vgpr2 - ; GCN-NEXT: ; implicit-def: $vgpr1 - ; GCN-NEXT: ; implicit-def: $vgpr8 + ; GCN-NEXT: ; implicit-def: $vgpr65 + ; GCN-NEXT: ; implicit-def: $vgpr72 + ; GCN-NEXT: ; implicit-def: $vgpr238 + ; 
GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 + ; GCN-NEXT: ; implicit-def: $vgpr80 + ; GCN-NEXT: ; implicit-def: $vgpr81 + ; GCN-NEXT: ; implicit-def: $vgpr82 + ; GCN-NEXT: ; implicit-def: $vgpr83 + ; GCN-NEXT: ; implicit-def: $vgpr84 + ; GCN-NEXT: ; implicit-def: $vgpr85 + ; GCN-NEXT: ; implicit-def: $vgpr86 + ; GCN-NEXT: ; implicit-def: $vgpr87 + ; GCN-NEXT: ; implicit-def: $vgpr88 + ; GCN-NEXT: ; implicit-def: $vgpr89 + ; GCN-NEXT: ; implicit-def: $vgpr90 + ; GCN-NEXT: ; implicit-def: $vgpr91 + ; GCN-NEXT: ; implicit-def: $vgpr92 + ; GCN-NEXT: ; implicit-def: $vgpr93 ; GCN-NEXT: ; implicit-def: $vgpr94 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $vgpr106 - ; GCN-NEXT: ; implicit-def: $vgpr132 - ; GCN-NEXT: ; implicit-def: $vgpr133 - ; GCN-NEXT: ; implicit-def: $vgpr139 - ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 - ; GCN-NEXT: ; iglp_opt mask(0x00000002) - ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr73 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - ; GCN-NEXT: v_readfirstlane_b32 s7, v0 + ; GCN-NEXT: v_add_u32_e32 v232, v73, v80 + ; GCN-NEXT: v_readfirstlane_b32 s17, v64 + ; GCN-NEXT: ; implicit-def: $sgpr15 ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2 - ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 - ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1 - ; GCN-NEXT: v_add_u32_e32 v93, s0, v92 - ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v234, v73, v81 + ; GCN-NEXT: v_add_u32_e32 v235, v73, v82 + ; GCN-NEXT: v_lshl_add_u32 v64, s17, 4, v66 + ; GCN-NEXT: v_mul_lo_u32 v64, v64, s6 + ; GCN-NEXT: v_add_lshl_u32 v222, v64, v65, 1 + ; GCN-NEXT: v_add_u32_e32 v95, s15, 
v222 + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v222, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v95, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: s_lshl_b32 s0, s7, 7 - ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1 - ; GCN-NEXT: v_add_u32_e32 v8, 64, v93 - ; GCN-NEXT: ; kill: killed $vgpr8 + ; GCN-NEXT: s_lshl_b32 s6, s17, 7 + ; GCN-NEXT: v_add_lshl_u32 v240, v72, s6, 1 + ; GCN-NEXT: v_add_u32_e32 v72, 64, v95 + ; GCN-NEXT: v_add_u32_e32 v206, 0x80, v95 + ; GCN-NEXT: v_add_u32_e32 v226, 0xc0, v95 + ; GCN-NEXT: v_add_u32_e32 v241, v73, v89 + ; GCN-NEXT: v_add_u32_e32 v242, v73, v83 + ; GCN-NEXT: v_add_u32_e32 v243, v73, v84 + ; GCN-NEXT: v_add_u32_e32 v244, v73, v85 + ; GCN-NEXT: v_add_u32_e32 v188, v73, v94 + ; GCN-NEXT: v_add_u32_e32 v189, v73, v86 + ; GCN-NEXT: v_add_u32_e32 v190, v73, v87 + ; GCN-NEXT: v_add_u32_e32 v191, v73, v88 + ; GCN-NEXT: v_add_u32_e32 v184, v73, v93 + ; GCN-NEXT: v_add_u32_e32 v185, v73, v90 + ; GCN-NEXT: v_add_u32_e32 v186, v73, v91 + ; GCN-NEXT: v_add_u32_e32 v187, v73, v92 + ; GCN-NEXT: ; implicit-def: $vgpr74 + ; GCN-NEXT: ; implicit-def: $sgpr16 + ; GCN-NEXT: ; implicit-def: $vgpr75 + ; GCN-NEXT: ; implicit-def: $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr77 + ; GCN-NEXT: ; implicit-def: $vgpr78 + ; GCN-NEXT: ; implicit-def: $vgpr79 + ; GCN-NEXT: v_add_u32_e32 v230, v73, v79 + ; GCN-NEXT: v_add_u32_e32 v74, s17, v74 + ; GCN-NEXT: v_and_b32_e32 v74, 0x1fffffff, v74 + ; GCN-NEXT: v_mul_lo_u32 v74, v74, s16 + ; GCN-NEXT: v_add_lshl_u32 v183, v75, v74, 1 + ; GCN-NEXT: v_lshl_add_u32 v180, v76, 1, v183 + ; GCN-NEXT: v_lshl_add_u32 v181, v77, 1, v180 + ; GCN-NEXT: v_lshl_add_u32 v182, v78, 1, v181 + ; GCN-NEXT: ; implicit-def: $vgpr239 + ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159 + ; GCN-NEXT: ; implicit-def: 
$vgpr148_vgpr149_vgpr150_vgpr151 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; kill: killed $vgpr92 - ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147 + ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 + ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $sgpr7 + ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135 + ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131 + ; GCN-NEXT: ; implicit-def: $sgpr14 + ; GCN-NEXT: ; implicit-def: $vgpr176 + ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13 + ; GCN-NEXT: ; implicit-def: $vgpr192 + ; GCN-NEXT: v_max_f32_e32 v193, v192, v192 + ; GCN-NEXT: ; implicit-def: $vgpr179 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GCN-NEXT: ; implicit-def: $vgpr178 + ; GCN-NEXT: ; implicit-def: $vgpr177 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[0:3] + ; GCN-NEXT: ds_write_b128 v240, v[64:67] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024 + ; GCN-NEXT: ds_write_b128 v240, v[68:71] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v222, s[8:11], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; 
GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[64:67], v238 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127] + ; GCN-NEXT: ds_read_b128 v[64:67], v238 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111] + ; GCN-NEXT: ds_read_b128 v[64:67], v238 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[168:171], v238 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: ds_read_b128 v[172:175], v239 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[194:197], v239 offset:512 ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[198:201], v239 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536 + ; GCN-NEXT: ds_read_b128 v[202:205], v239 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93 + ; GCN-NEXT: ds_write_b128 v240, v[160:163] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v240, v[164:167] offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v222, s[8:11], 0 offen offset:128 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v206, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ; kill: killed $vgpr72 - ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[206:209], v238 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[164:167], v238 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[168:171], v238 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[210:213], v238 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[202:203], v[156:157], v[64:79] + ; GCN-NEXT: ds_read_b128 v[214:217], v239 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], 
v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[218:221], v239 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[198:199], v[156:157], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[204:205], v[158:159], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[200:201], v[158:159], v[80:95] + ; GCN-NEXT: ds_read_b128 v[198:201], v239 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: ds_read_b128 v[202:205], v239 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v240, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 - ; GCN-NEXT: ; implicit-def: $vgpr64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 - ; GCN-NEXT: ; implicit-def: $vgpr73 - ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 + ; GCN-NEXT: ds_write_b128 v240, v[160:163] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) 
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[222:225], v222, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[210:211], v[148:149], v[64:79] + ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v226, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 - ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[230:231], v230, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[232:233], v232, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr74 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 - ; GCN-NEXT: ; implicit-def: $vgpr75 - ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_perm_b32 v236, v232, v230, s5 + ; GCN-NEXT: buffer_load_dwordx2 v[210:211], v234, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 - ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[148:149], v[80:95] + ; GCN-NEXT: buffer_load_dwordx2 v[234:235], v235, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[152:155], v238 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr76 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $sgpr8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 + ; GCN-NEXT: v_perm_b32 v237, v234, v210, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[150:151], v[80:95] + ; GCN-NEXT: ds_read_b128 v[168:171], v238 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[160:163], v238 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[212:213], v[150:151], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[156:157], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[198:199], v[144:145], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[202:203], v[144:145], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[196:197], v[158:159], v[96:111] + ; GCN-NEXT: ds_read_b128 v[194:197], v238 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[80:95], v[200:201], v[146:147], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[204:205], v[146:147], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[160:161], v[140:141], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[194:195], v[140:141], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[206:207], v[148:149], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[162:163], v[142:143], v[80:95] + ; GCN-NEXT: ds_read_b128 v[160:163], v239 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[196:197], v[142:143], v[64:79] + ; GCN-NEXT: ds_read_b128 v[194:197], v239 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[172:175], v239 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: ds_read_b128 v[198:201], v239 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v240, v[222:225] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v240, v[226:229] offset:1024 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[164:165], v[148:149], v[96:111] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[64:67], v94 + ; GCN-NEXT: ds_read_b128 v[156:159], v238 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[202:205], v238 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: v_perm_b32 v226, v232, v230, s7 + ; GCN-NEXT: v_perm_b32 v227, v234, v210, s7 + ; GCN-NEXT: v_perm_b32 v228, v233, v231, s5 + ; GCN-NEXT: v_perm_b32 v230, v233, v231, s7 + ; GCN-NEXT: v_perm_b32 v229, v235, v211, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[208:209], v[150:151], v[112:127] + ; GCN-NEXT: v_perm_b32 v231, v235, v211, s7 + ; GCN-NEXT: ds_read_b128 v[210:213], v238 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[222:225], v238 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[94:97], v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[166:167], v[150:151], v[96:111] + ; GCN-NEXT: ds_read_b128 v[164:167], v239 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63] - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67 
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47] - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[214:215], v[144:145], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[198:199], v[136:137], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[216:217], v[146:147], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[200:201], v[138:139], v[64:79] + ; GCN-NEXT: ds_read_b128 v[198:201], v239 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[146:147], v[96:111] + ; GCN-NEXT: ds_read_b128 v[218:221], v239 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[140:141], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[222:223], v[132:133], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[168:169], v[140:141], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[142:143], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[224:225], v[134:135], v[64:79] + ; GCN-NEXT: ds_read_b128 v[222:225], v239 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63] - ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5 - ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8 - ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5 - ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31] - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63] - ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8 - ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8 - ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5 - ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47] - ; GCN-NEXT: s_nop 5 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48 - ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49 - ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31] - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55 - ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56 - ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15] - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31] - ; GCN-NEXT: s_nop 6 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 
v85, s4, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31] - ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 - ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20 - ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27 - ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31 - ; 
GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr68 - ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 - ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 - ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 - ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 - ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v135, v[94:95] - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 + ; GCN-NEXT: ds_write_b64 v183, v[236:237] ; 
GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[98:99] + ; GCN-NEXT: ds_write_b64 v180, v[226:227] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[102:103] + ; GCN-NEXT: ds_write_b64 v181, v[228:229] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[96:97] - ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 - ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] - ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v134, v66, v64 - ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: ds_write_b64 v182, v[230:231] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[226:227], v241, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v64 - ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[228:229], v242, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 - ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[230:231], v243, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[232:233], v244, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 - ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 - ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 - ; GCN-NEXT: 
v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 - ; GCN-NEXT: v_exp_f32_e32 v163, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 - ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 - ; GCN-NEXT: v_exp_f32_e32 v164, v57 - ; GCN-NEXT: v_exp_f32_e32 v49, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 - ; GCN-NEXT: v_exp_f32_e32 v50, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66 - ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134 - ; GCN-NEXT: v_exp_f32_e32 v51, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134 - ; GCN-NEXT: v_exp_f32_e32 v52, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[170:171], v[142:143], v[96:111] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134 - ; GCN-NEXT: v_exp_f32_e32 v53, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v54, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 - ; GCN-NEXT: v_exp_f32_e32 v55, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 - ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 - ; GCN-NEXT: v_exp_f32_e32 v56, v48 - ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 - ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 - ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v48, v48 - ; GCN-NEXT: v_pack_b32_f16 v161, 
v68, v58 - ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 - ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 - ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 - ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 - ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 - ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] - ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 - ; GCN-NEXT: v_exp_f32_e32 v58, v58 - ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] - ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 - ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 - ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 - ; GCN-NEXT: v_exp_f32_e32 v59, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] - ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 - ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 - ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, 
v63, -v134 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 - ; GCN-NEXT: ; implicit-def: $vgpr57 - ; GCN-NEXT: ds_read_b128 v[60:63], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v160, v149 - ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] - ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 - ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 - ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 - ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 - ; GCN-NEXT: v_exp_f32_e32 v162, v146 - ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 - ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 - ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] - ; GCN-NEXT: v_exp_f32_e32 v151, v33 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 - ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 - ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 - ; GCN-NEXT: v_exp_f32_e32 v153, v33 - ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 - ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 - ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], 
v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 - ; GCN-NEXT: v_exp_f32_e32 v165, v60 - ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 - ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v161, v61 - ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 - ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 - ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 - ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 + ; GCN-NEXT: v_perm_b32 v170, v228, v226, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[136:137], v[112:127] + ; GCN-NEXT: v_perm_b32 v168, v228, v226, s7 + ; GCN-NEXT: v_perm_b32 v171, v232, v230, s5 + ; GCN-NEXT: v_perm_b32 v169, v232, v230, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[136:137], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[138:139], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[136:137], v[80:95] + ; GCN-NEXT: v_perm_b32 v172, v229, v227, s5 + ; GCN-NEXT: v_perm_b32 v173, v233, v231, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[196:197], v[138:139], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[138:139], v[80:95] + ; GCN-NEXT: v_perm_b32 v174, v229, v227, s7 + ; GCN-NEXT: v_perm_b32 v175, v233, v231, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[202:203], v[132:133], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[210:211], v[132:133], v[80:95] + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[96:111], v[204:205], v[134:135], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[164:165], v[128:129], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[212:213], v[134:135], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[198:199], v[128:129], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[166:167], v[130:131], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[222:223], v[128:129], v[64:79] + ; GCN-NEXT: s_nop 7 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v116 + ; GCN-NEXT: v_mul_f32_e32 v133, s4, v117 + ; GCN-NEXT: v_mul_f32_e32 v134, s4, v118 + ; GCN-NEXT: v_mul_f32_e32 v135, s4, v119 + ; GCN-NEXT: v_mul_f32_e32 v136, s4, v120 + ; GCN-NEXT: v_mul_f32_e32 v137, s4, v121 + ; GCN-NEXT: v_mul_f32_e32 v138, s4, v122 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[218:219], v[128:129], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v128, s4, v112 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v113 + ; GCN-NEXT: v_max3_f32 v128, v128, s14, v129 + ; GCN-NEXT: v_mul_f32_e32 v139, s4, v123 + ; GCN-NEXT: v_mul_f32_e32 v140, s4, v124 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v125 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v126 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[200:201], v[130:131], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v143, s4, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[224:225], v[130:131], v[64:79] + ; GCN-NEXT: s_nop 7 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_mul_f32_e32 v151, s4, v96 + ; GCN-NEXT: v_mul_f32_e32 v198, s4, v97 + ; GCN-NEXT: v_mul_f32_e32 v199, s4, v98 + ; GCN-NEXT: v_mul_f32_e32 v200, s4, v99 + ; GCN-NEXT: v_mul_f32_e32 v201, s4, v100 + ; GCN-NEXT: v_mul_f32_e32 v206, s4, v101 + ; GCN-NEXT: v_mul_f32_e32 v207, s4, v102 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[220:221], v[130:131], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v114 + ; GCN-NEXT: v_mul_f32_e32 v131, s4, v115 + ; GCN-NEXT: v_max3_f32 v128, v128, v130, v131 + ; GCN-NEXT: v_max3_f32 v128, 
v128, v132, v133 + ; GCN-NEXT: v_max3_f32 v128, v128, v134, v135 + ; GCN-NEXT: v_max3_f32 v128, v128, v136, v137 + ; GCN-NEXT: v_max3_f32 v128, v128, v138, v139 + ; GCN-NEXT: v_max3_f32 v128, v128, v140, v141 + ; GCN-NEXT: v_max3_f32 v128, v128, v142, v143 + ; GCN-NEXT: v_max3_f32 v128, v128, v151, v198 + ; GCN-NEXT: v_max3_f32 v128, v128, v199, v200 + ; GCN-NEXT: v_mul_f32_e32 v208, s4, v103 + ; GCN-NEXT: v_max3_f32 v128, v128, v201, v206 + ; GCN-NEXT: v_mul_f32_e32 v209, s4, v104 + ; GCN-NEXT: v_mul_f32_e32 v144, s4, v105 + ; GCN-NEXT: v_max3_f32 v128, v128, v207, v208 + ; GCN-NEXT: v_mul_f32_e32 v145, s4, v106 + ; GCN-NEXT: v_mul_f32_e32 v146, s4, v107 + ; GCN-NEXT: v_max3_f32 v128, v128, v209, v144 + ; GCN-NEXT: v_mul_f32_e32 v147, s4, v108 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v109 + ; GCN-NEXT: v_max3_f32 v128, v128, v145, v146 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v110 + ; GCN-NEXT: v_mul_f32_e32 v216, s4, v111 + ; GCN-NEXT: v_max3_f32 v128, v128, v147, v214 + ; GCN-NEXT: v_mul_f32_e32 v196, s4, v80 + ; GCN-NEXT: v_mul_f32_e32 v197, s4, v81 + ; GCN-NEXT: v_max3_f32 v128, v128, v215, v216 + ; GCN-NEXT: v_mul_f32_e32 v202, s4, v82 + ; GCN-NEXT: v_mul_f32_e32 v203, s4, v83 + ; GCN-NEXT: v_max3_f32 v128, v128, v196, v197 + ; GCN-NEXT: v_mul_f32_e32 v211, s4, v84 + ; GCN-NEXT: v_mul_f32_e32 v212, s4, v85 + ; GCN-NEXT: v_max3_f32 v128, v128, v202, v203 + ; GCN-NEXT: v_mul_f32_e32 v213, s4, v86 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v87 + ; GCN-NEXT: v_max3_f32 v128, v128, v211, v212 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v88 + ; GCN-NEXT: v_mul_f32_e32 v204, s4, v89 + ; GCN-NEXT: v_max3_f32 v128, v128, v213, v218 + ; GCN-NEXT: v_mul_f32_e32 v205, s4, v90 + ; GCN-NEXT: v_mul_f32_e32 v220, s4, v91 + ; GCN-NEXT: v_max3_f32 v128, v128, v219, v204 + ; GCN-NEXT: v_mul_f32_e32 v221, s4, v92 + ; GCN-NEXT: v_mul_f32_e32 v148, s4, v93 + ; GCN-NEXT: v_max3_f32 v128, v128, v205, v220 + ; GCN-NEXT: v_mul_f32_e32 v149, s4, v94 + ; GCN-NEXT: v_mul_f32_e32 v150, s4, v95 + ; 
GCN-NEXT: v_max3_f32 v128, v128, v221, v148 + ; GCN-NEXT: v_mul_f32_e32 v222, s4, v64 + ; GCN-NEXT: v_mul_f32_e32 v223, s4, v65 + ; GCN-NEXT: v_max3_f32 v128, v128, v149, v150 + ; GCN-NEXT: v_mul_f32_e32 v224, s4, v66 + ; GCN-NEXT: v_mul_f32_e32 v225, s4, v67 + ; GCN-NEXT: v_max3_f32 v128, v128, v222, v223 + ; GCN-NEXT: v_mul_f32_e32 v226, s4, v68 + ; GCN-NEXT: v_mul_f32_e32 v227, s4, v69 + ; GCN-NEXT: v_max3_f32 v128, v128, v224, v225 + ; GCN-NEXT: v_mul_f32_e32 v228, s4, v70 + ; GCN-NEXT: v_mul_f32_e32 v229, s4, v71 + ; GCN-NEXT: v_max3_f32 v128, v128, v226, v227 + ; GCN-NEXT: v_mul_f32_e32 v230, s4, v72 + ; GCN-NEXT: v_mul_f32_e32 v231, s4, v73 + ; GCN-NEXT: v_max3_f32 v128, v128, v228, v229 + ; GCN-NEXT: v_mul_f32_e32 v232, s4, v74 + ; GCN-NEXT: v_mul_f32_e32 v233, s4, v75 + ; GCN-NEXT: v_max3_f32 v128, v128, v230, v231 + ; GCN-NEXT: v_mul_f32_e32 v234, s4, v76 + ; GCN-NEXT: v_mul_f32_e32 v194, s4, v77 + ; GCN-NEXT: v_max3_f32 v128, v128, v232, v233 + ; GCN-NEXT: v_mul_f32_e32 v195, s4, v78 + ; GCN-NEXT: v_mul_f32_e32 v210, s4, v79 + ; GCN-NEXT: v_max3_f32 v128, v128, v234, v194 + ; GCN-NEXT: v_max3_f32 v128, v128, v195, v210 + ; GCN-NEXT: ds_bpermute_b32 v129, v176, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_max_f32_e32 v129, v129, v129 + ; GCN-NEXT: v_max_f32_e32 v128, v128, v129 + ; GCN-NEXT: ds_bpermute_b32 v129, v176, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13] + ; GCN-NEXT: v_max_f32_e32 v128, v128, v128 + ; GCN-NEXT: v_max_f32_e32 v128, v193, v128 + ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128 + ; GCN-NEXT: v_fma_f32 v114, s4, v114, -v128 + ; GCN-NEXT: v_mul_f32_e32 v113, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_fma_f32 v114, s4, v115, -v128 + ; GCN-NEXT: v_exp_f32_e32 v112, v112 + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v114 + ; GCN-NEXT: 
v_fma_f32 v114, s4, v116, -v128 + ; GCN-NEXT: v_exp_f32_e32 v113, v113 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_fma_f32 v114, s4, v117, -v128 + ; GCN-NEXT: v_exp_f32_e32 v129, v129 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_fma_f32 v114, s4, v118, -v128 + ; GCN-NEXT: v_exp_f32_e32 v134, v134 + ; GCN-NEXT: v_mul_f32_e32 v118, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_fma_f32 v114, s4, v119, -v128 + ; GCN-NEXT: v_add_f32_e32 v138, 0, v112 + ; GCN-NEXT: v_exp_f32_e32 v140, v135 + ; GCN-NEXT: v_mul_f32_e32 v119, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_sub_f32_e32 v114, v192, v128 + ; GCN-NEXT: v_add_f32_e32 v138, v113, v138 + ; GCN-NEXT: v_exp_f32_e32 v142, v136 + ; GCN-NEXT: v_mul_f32_e32 v137, 0x3fb8aa3b, v114 + ; GCN-NEXT: ds_read_b128 v[114:117], v179 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v138, v129, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v112 + ; GCN-NEXT: v_fma_f32 v112, s4, v120, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v120, v129 + ; GCN-NEXT: v_exp_f32_e32 v129, v118 + ; GCN-NEXT: ds_read_b128 v[130:133], v179 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v138, v134, v138 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_fma_f32 v112, s4, v121, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v118, v134 + ; GCN-NEXT: v_exp_f32_e32 v144, v119 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_add_f32_e32 v112, v140, v138 + ; GCN-NEXT: v_fma_f32 v121, s4, v122, -v128 + ; GCN-NEXT: v_add_f32_e32 v112, v142, v112 + ; GCN-NEXT: v_cvt_f16_f32_e32 v113, v113 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v121 + ; GCN-NEXT: v_fma_f32 v121, s4, v123, -v128 + ; GCN-NEXT: v_add_f32_e32 v119, v129, v112 + ; GCN-NEXT: v_exp_f32_e32 v112, v137 + ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v121 + ; GCN-NEXT: v_add_f32_e32 v146, v144, v119 + ; GCN-NEXT: v_pack_b32_f16 v123, v120, v118 + ; GCN-NEXT: 
ds_read_b128 v[118:121], v179 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v122, v139, v113 + ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: 
v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[114:115], v[122:123], v[0:15] + ; GCN-NEXT: ds_read_b128 v[134:137], v179 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v113, v140 + ; GCN-NEXT: v_exp_f32_e32 v139, v141 + ; GCN-NEXT: v_cvt_f16_f32_e32 v115, v142 + ; GCN-NEXT: v_fma_f32 v114, s4, v124, -v128 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_add_f32_e32 v114, v139, v146 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[130:131], v[122:123], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v141, v143 + ; GCN-NEXT: v_pack_b32_f16 v130, v113, v115 + ; GCN-NEXT: v_fma_f32 v115, s4, v126, -v128 + ; GCN-NEXT: v_fma_f32 v124, s4, v125, -v128 + ; GCN-NEXT: v_add_f32_e32 v113, v141, v114 + ; GCN-NEXT: v_cvt_f16_f32_e32 v114, v129 + ; GCN-NEXT: v_fma_f32 v96, s4, v96, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[118:119], v[122:123], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v119, 0x3fb8aa3b, v115 + ; GCN-NEXT: v_cvt_f16_f32_e32 v115, v144 + ; GCN-NEXT: v_exp_f32_e32 v118, v138 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v139 + ; GCN-NEXT: v_pack_b32_f16 v131, v114, v115 + ; GCN-NEXT: v_add_f32_e32 v113, v118, v113 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[134:135], v[122:123], v[48:63] + ; GCN-NEXT: v_fma_f32 v122, s4, v127, -v128 + ; GCN-NEXT: v_exp_f32_e32 v127, v145 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v96 + ; GCN-NEXT: v_fma_f32 
v97, s4, v97, -v128 + ; GCN-NEXT: v_fma_f32 v98, s4, v98, -v128 + ; GCN-NEXT: v_add_f32_e32 v113, v127, v113 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[116:117], v[130:131], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v134, v140 + ; GCN-NEXT: ds_read_b128 v[114:117], v178 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v99, s4, v99, -v128 + ; GCN-NEXT: ds_read_b128 v[122:125], v178 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v96, v134, v113 + ; GCN-NEXT: v_cvt_f16_f32_e32 v113, v141 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v97 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[132:133], v[130:131], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v140, v142 + ; GCN-NEXT: v_cvt_f16_f32_e32 v97, v118 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v98 + ; GCN-NEXT: v_cvt_f16_f32_e32 v98, v127 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v113 + ; GCN-NEXT: v_add_f32_e32 v96, v140, v96 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v99 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[120:121], v[130:131], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v113, v119 + ; GCN-NEXT: v_pack_b32_f16 v127, v97, v98 + ; GCN-NEXT: v_fma_f32 v100, s4, v100, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v144, v134 + ; GCN-NEXT: v_add_f32_e32 v96, v113, v96 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v100 + ; GCN-NEXT: v_fma_f32 v101, s4, v101, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[130:131], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v129, v129 + ; GCN-NEXT: v_fma_f32 v104, s4, v104, -v128 + ; GCN-NEXT: v_fma_f32 v105, s4, v105, -v128 + ; GCN-NEXT: v_fma_f32 v109, s4, v109, -v128 + ; GCN-NEXT: v_add_f32_e32 v130, v129, v96 + ; GCN-NEXT: ds_read_b128 v[96:99], v178 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[118:121], v178 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; 
GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[114:115], v[126:127], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v145, v135 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] - ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 - ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 - ; GCN-NEXT: v_exp_f32_e32 v159, v33 - ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38 - ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_exp_f32_e32 v152, v38 + ; GCN-NEXT: ds_write_b64 v183, v[170:171] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[60:61] + ; GCN-NEXT: ds_write_b64 v180, v[168:169] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[32:33] - ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: ; implicit-def: $vgpr38 + ; GCN-NEXT: ds_write_b64 v181, v[172:173] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[140:141] - ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 + ; GCN-NEXT: ds_write_b64 v182, v[174:175] + ; GCN-NEXT: v_add_f32_e32 v100, v145, v130 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[114:115], v188, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v189, s[0:3], 0 offen 
sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr36 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 - ; GCN-NEXT: ; implicit-def: $vgpr37 - ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[132:133], v190, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 - ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v191, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[122:123], v[126:127], v[16:31] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 - ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 - ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v154, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 - ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 - ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v155, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 - ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 - ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 - ; GCN-NEXT: v_exp_f32_e32 v157, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 - ; GCN-NEXT: v_exp_f32_e32 v146, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 - ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 - ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v147, v36 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v143, v36 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 - ; GCN-NEXT: v_exp_f32_e32 v156, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 - ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 - ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v129, v36 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 - ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: ds_read_b128 v[36:39], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v142, v40 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 - 
; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] - ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v63, v40 - ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 - ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 - ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 - ; GCN-NEXT: v_exp_f32_e32 v158, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 - ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v128, v17 - ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 - ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 - ; GCN-NEXT: v_exp_f32_e32 v167, v36 - ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 - ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v130, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 - ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 - ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 + ; GCN-NEXT: v_fma_f32 v80, s4, v80, -v128 + ; GCN-NEXT: v_fma_f32 v81, s4, v81, -v128 + ; GCN-NEXT: v_fma_f32 v82, s4, v82, -v128 + ; GCN-NEXT: v_fma_f32 v83, s4, v83, 
-v128 + ; GCN-NEXT: v_fma_f32 v84, s4, v84, -v128 + ; GCN-NEXT: v_fma_f32 v85, s4, v85, -v128 + ; GCN-NEXT: v_fma_f32 v88, s4, v88, -v128 + ; GCN-NEXT: v_fma_f32 v89, s4, v89, -v128 + ; GCN-NEXT: v_fma_f32 v93, s4, v93, -v128 + ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 + ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 + ; GCN-NEXT: v_fma_f32 v66, s4, v66, -v128 + ; GCN-NEXT: v_fma_f32 v67, s4, v67, -v128 + ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 + ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 + ; GCN-NEXT: v_fma_f32 v72, s4, v72, -v128 + ; GCN-NEXT: v_fma_f32 v73, s4, v73, -v128 + ; GCN-NEXT: v_fma_f32 v77, s4, v77, -v128 + ; GCN-NEXT: v_perm_b32 v136, v130, v114, s5 + ; GCN-NEXT: v_perm_b32 v138, v130, v114, s7 + ; GCN-NEXT: v_perm_b32 v137, v134, v132, s5 + ; GCN-NEXT: v_perm_b32 v139, v134, v132, s7 + ; GCN-NEXT: v_exp_f32_e32 v134, v141 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[96:97], v[126:127], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v96, v113 + ; GCN-NEXT: v_exp_f32_e32 v113, v142 + ; GCN-NEXT: v_perm_b32 v130, v131, v115, s5 + ; GCN-NEXT: v_perm_b32 v132, v131, v115, s7 + ; GCN-NEXT: v_perm_b32 v131, v135, v133, s5 + ; GCN-NEXT: v_perm_b32 v133, v135, v133, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v114, v140 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v101 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[118:119], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v101, v129 + ; GCN-NEXT: v_exp_f32_e32 v119, v143 + ; GCN-NEXT: v_add_f32_e32 v100, v134, v100 + ; GCN-NEXT: v_fma_f32 v97, s4, v102, -v128 + ; GCN-NEXT: v_add_f32_e32 v100, v113, v100 + ; GCN-NEXT: v_fma_f32 v102, s4, v103, -v128 + ; GCN-NEXT: v_pack_b32_f16 v122, v144, v114 + ; GCN-NEXT: v_mul_f32_e32 v126, 0x3fb8aa3b, v102 + ; GCN-NEXT: v_pack_b32_f16 v123, v96, v101 + ; GCN-NEXT: v_add_f32_e32 v96, v119, v100 + ; GCN-NEXT: ds_read_b128 v[100:103], v179 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[116:117], v[122:123], 
v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v127, v146 + ; GCN-NEXT: v_mul_f32_e32 v97, 0x3fb8aa3b, v97 + ; GCN-NEXT: ds_read_b128 v[114:117], v179 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v118, v145 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v104 + ; GCN-NEXT: v_cvt_f16_f32_e32 v104, v134 + ; GCN-NEXT: v_add_f32_e32 v96, v127, v96 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[124:125], v[122:123], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v124, v135 + ; GCN-NEXT: v_pack_b32_f16 v118, v118, v104 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v105 + ; GCN-NEXT: v_add_f32_e32 v96, v124, v96 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[98:99], v[122:123], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v98, v113 + ; GCN-NEXT: v_exp_f32_e32 v113, v97 + ; GCN-NEXT: v_cvt_f16_f32_e32 v97, v119 + ; GCN-NEXT: v_fma_f32 v99, s4, v106, -v128 + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v99 + ; GCN-NEXT: v_add_f32_e32 v96, v113, v96 + ; GCN-NEXT: v_fma_f32 v99, s4, v107, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[120:121], v[122:123], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v120, v126 + ; GCN-NEXT: v_mul_f32_e32 v121, 0x3fb8aa3b, v99 + ; GCN-NEXT: v_pack_b32_f16 v119, v98, v97 + ; GCN-NEXT: v_add_f32_e32 v122, v120, v96 + ; GCN-NEXT: ds_read_b128 v[96:99], v179 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[104:107], v179 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[100:101], v[118:119], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v123, v129 + ; GCN-NEXT: v_fma_f32 v101, s4, v108, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v100, v127 + ; GCN-NEXT: v_mul_f32_e32 v126, 0x3fb8aa3b, v101 + ; GCN-NEXT: v_add_f32_e32 v101, v123, v122 + ; GCN-NEXT: v_cvt_f16_f32_e32 v108, v124 + ; GCN-NEXT: v_mul_f32_e32 v124, 0x3fb8aa3b, v109 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[114:115], 
v[118:119], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v122, v125 + ; GCN-NEXT: v_pack_b32_f16 v114, v100, v108 + ; GCN-NEXT: v_add_f32_e32 v100, v122, v101 + ; GCN-NEXT: v_cvt_f16_f32_e32 v101, v120 + ; GCN-NEXT: v_mul_f32_e32 v120, 0x3fb8aa3b, v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[96:97], v[118:119], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v96, v113 + ; GCN-NEXT: v_exp_f32_e32 v113, v134 + ; GCN-NEXT: v_fma_f32 v97, s4, v110, -v128 + ; GCN-NEXT: v_mul_f32_e32 v97, 0x3fb8aa3b, v97 + ; GCN-NEXT: v_pack_b32_f16 v115, v96, v101 + ; GCN-NEXT: v_add_f32_e32 v100, v113, v100 + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v84 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[104:105], v[118:119], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v105, v121 + ; GCN-NEXT: v_fma_f32 v104, s4, v111, -v128 + ; GCN-NEXT: v_mul_f32_e32 v118, 0x3fb8aa3b, v104 + ; GCN-NEXT: v_cvt_f16_f32_e32 v104, v123 + ; GCN-NEXT: v_add_f32_e32 v96, v105, v100 + ; GCN-NEXT: v_mul_f32_e32 v123, 0x3fb8aa3b, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v81, v113 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[102:103], v[114:115], v[0:15] + ; GCN-NEXT: ds_read_b128 v[100:103], v178 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v119, v126 + ; GCN-NEXT: ds_read_b128 v[108:111], v178 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v126, 0x3fb8aa3b, v83 + ; GCN-NEXT: v_add_f32_e32 v80, v119, v96 + ; GCN-NEXT: v_cvt_f16_f32_e32 v96, v122 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[116:117], v[114:115], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v122, v124 + ; GCN-NEXT: v_mul_f32_e32 v124, 0x3fb8aa3b, v82 + ; GCN-NEXT: v_cvt_f16_f32_e32 v82, v105 + ; GCN-NEXT: v_pack_b32_f16 v104, v104, v96 + ; GCN-NEXT: v_add_f32_e32 v80, v122, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v119 + ; GCN-NEXT: v_pack_b32_f16 v105, v81, v82 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[98:99], v[114:115], v[32:47] + ; GCN-NEXT: 
v_exp_f32_e32 v113, v97 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v80, v113, v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[106:107], v[114:115], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v125, v118 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v106, v125, v80 + ; GCN-NEXT: ds_read_b128 v[80:83], v178 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[96:99], v178 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[100:101], v[104:105], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v129, v120 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[20:21] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] - ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 - ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v140, v17 - ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 + ; GCN-NEXT: ds_write_b64 v183, v[136:137] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 - ; GCN-NEXT: v_exp_f32_e32 v144, v22 + ; GCN-NEXT: ds_write_b64 v180, v[138:139] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[16:17] - ; GCN-NEXT: ; implicit-def: $vgpr17 - ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: ds_write_b64 v181, v[130:131] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[42:43] - ; GCN-NEXT: v_add_u32_e32 v22, v132, v22 - ; GCN-NEXT: v_add_u32_e32 v17, v132, v17 - ; GCN-NEXT: ; implicit-def: $vgpr20 - ; 
GCN-NEXT: ; implicit-def: $vgpr21 + ; GCN-NEXT: ds_write_b64 v182, v[132:133] + ; GCN-NEXT: v_add_f32_e32 v84, v129, v106 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v184, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[106:107], v185, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 - ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 - ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 - ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[114:115], v186, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[116:117], v187, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v132, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[108:109], v[104:105], v[16:31] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 - ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v62, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: 
v_cvt_f16_f32_e32 v46, v130 - ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 - ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 - ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 - ; GCN-NEXT: ; implicit-def: $sgpr0 - ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 - ; GCN-NEXT: v_exp_f32_e32 v145, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141 - ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46 - ; GCN-NEXT: v_exp_f32_e32 v35, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24 - ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v46, v20 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v47, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34 - ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 - ; GCN-NEXT: v_exp_f32_e32 v141, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 - ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 - ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 - ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] - ; GCN-NEXT: 
v_exp_f32_e32 v33, v20 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: ds_read_b128 v[20:23], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v36, v24 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 - ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] - ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v39, v24 - ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 - ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 - ; GCN-NEXT: v_exp_f32_e32 v148, v1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 - ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127] - ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 - ; GCN-NEXT: v_exp_f32_e32 v34, v1 - ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 - ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 - ; GCN-NEXT: v_exp_f32_e32 v150, v20 - ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], 
v[28:29], v[24:25], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v38, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 - ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 - ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 - ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 + ; GCN-NEXT: v_perm_b32 v118, v106, v100, s5 + ; GCN-NEXT: v_perm_b32 v120, v106, v100, s7 + ; GCN-NEXT: v_perm_b32 v119, v116, v114, s5 + ; GCN-NEXT: v_perm_b32 v121, v116, v114, s7 + ; GCN-NEXT: v_exp_f32_e32 v116, v123 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[104:105], v[32:47] + ; GCN-NEXT: v_perm_b32 v106, v107, v101, s5 + ; GCN-NEXT: v_perm_b32 v114, v107, v101, s7 + ; GCN-NEXT: v_perm_b32 v107, v117, v115, s5 + ; GCN-NEXT: v_perm_b32 v115, v117, v115, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v100, v122 + ; GCN-NEXT: v_mul_f32_e32 v117, 0x3fb8aa3b, v85 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v113 + ; GCN-NEXT: v_exp_f32_e32 v113, v124 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[104:105], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v125 + ; GCN-NEXT: v_exp_f32_e32 v97, v126 + ; GCN-NEXT: v_pack_b32_f16 v108, v127, v100 + ; GCN-NEXT: v_add_f32_e32 v84, v116, v84 + ; GCN-NEXT: v_pack_b32_f16 v109, v80, v85 + ; GCN-NEXT: v_fma_f32 v81, s4, v86, -v128 + ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v81 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[102:103], v[108:109], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v105, v134 + ; GCN-NEXT: v_add_f32_e32 v84, v113, v84 + ; GCN-NEXT: v_fma_f32 v86, s4, v87, -v128 + ; GCN-NEXT: v_mul_f32_e32 v104, 0x3fb8aa3b, v86 + ; GCN-NEXT: v_add_f32_e32 v80, v97, v84 + ; GCN-NEXT: ds_read_b128 v[84:87], v179 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[100:103], v179 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[110:111], v[108:109], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v110, v117 + ; GCN-NEXT: v_add_f32_e32 v80, v105, v80 + ; GCN-NEXT: 
v_cvt_f16_f32_e32 v96, v129 + ; GCN-NEXT: v_mul_f32_e32 v122, 0x3fb8aa3b, v88 + ; GCN-NEXT: v_add_f32_e32 v80, v110, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v88, v116 + ; GCN-NEXT: v_mul_f32_e32 v111, 0x3fb8aa3b, v89 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[108:109], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v82, v113 + ; GCN-NEXT: v_exp_f32_e32 v113, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v81, v97 + ; GCN-NEXT: v_fma_f32 v83, s4, v90, -v128 + ; GCN-NEXT: v_mul_f32_e32 v116, 0x3fb8aa3b, v83 + ; GCN-NEXT: v_add_f32_e32 v80, v113, v80 + ; GCN-NEXT: v_fma_f32 v83, s4, v91, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[98:99], v[108:109], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v99, v104 + ; GCN-NEXT: v_mul_f32_e32 v104, 0x3fb8aa3b, v83 + ; GCN-NEXT: v_pack_b32_f16 v97, v82, v81 + ; GCN-NEXT: v_pack_b32_f16 v96, v96, v88 + ; GCN-NEXT: v_add_f32_e32 v98, v99, v80 + ; GCN-NEXT: ds_read_b128 v[80:83], v179 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[88:91], v179 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[84:85], v[96:97], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v105 + ; GCN-NEXT: v_exp_f32_e32 v105, v122 + ; GCN-NEXT: v_fma_f32 v85, s4, v92, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v92, v110 + ; GCN-NEXT: v_mul_f32_e32 v108, 0x3fb8aa3b, v85 + ; GCN-NEXT: v_add_f32_e32 v85, v105, v98 + ; GCN-NEXT: v_pack_b32_f16 v98, v84, v92 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[100:101], v[96:97], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v100, v111 + ; GCN-NEXT: v_mul_f32_e32 v101, 0x3fb8aa3b, v93 + ; GCN-NEXT: v_add_f32_e32 v84, v100, v85 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v99 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[96:97], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v113 + ; GCN-NEXT: v_exp_f32_e32 v109, v116 + ; GCN-NEXT: v_fma_f32 v81, s4, v94, -v128 + ; GCN-NEXT: v_mul_f32_e32 v81, 
0x3fb8aa3b, v81 + ; GCN-NEXT: v_pack_b32_f16 v99, v80, v85 + ; GCN-NEXT: v_add_f32_e32 v84, v109, v84 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[88:89], v[96:97], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v89, v104 + ; GCN-NEXT: v_fma_f32 v88, s4, v95, -v128 + ; GCN-NEXT: v_mul_f32_e32 v104, 0x3fb8aa3b, v64 + ; GCN-NEXT: v_mul_f32_e32 v96, 0x3fb8aa3b, v88 + ; GCN-NEXT: v_add_f32_e32 v80, v89, v84 + ; GCN-NEXT: v_cvt_f16_f32_e32 v88, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[86:87], v[98:99], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v97, v108 + ; GCN-NEXT: ds_read_b128 v[84:87], v178 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[92:95], v178 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v64, v97, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v100 + ; GCN-NEXT: v_pack_b32_f16 v88, v88, v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[102:103], v[98:99], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v100, v101 + ; GCN-NEXT: v_mul_f32_e32 v101, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v109 + ; GCN-NEXT: v_mul_f32_e32 v103, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v89 + ; GCN-NEXT: v_add_f32_e32 v64, v100, v64 + ; GCN-NEXT: v_pack_b32_f16 v89, v65, v66 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[98:99], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v102, v81 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v64, v102, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[90:91], v[98:99], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v91, v96 + ; GCN-NEXT: v_mul_f32_e32 v96, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_mul_f32_e32 v98, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_add_f32_e32 v90, v91, v64 + ; GCN-NEXT: ds_read_b128 v[64:67], v178 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[80:83], v178 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[0:15], v[84:85], v[88:89], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v97 + ; GCN-NEXT: v_exp_f32_e32 v97, v104 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v100 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[4:5] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111] - ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 - ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v42, v1 - ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 + ; GCN-NEXT: ds_write_b64 v183, v[118:119] + ; GCN-NEXT: v_add_f32_e32 v68, v97, v90 + ; GCN-NEXT: v_pack_b32_f16 v90, v84, v85 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[92:93], v[88:89], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v92, v101 + ; GCN-NEXT: v_mul_f32_e32 v93, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v91 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[20:21] + ; GCN-NEXT: ds_write_b64 v180, v[120:121] + ; GCN-NEXT: v_add_f32_e32 v68, v92, v68 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[0:1] + ; GCN-NEXT: ds_write_b64 v181, v[106:107] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[26:27] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_write_b64 v182, v[114:115] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[88:89], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v102 + ; GCN-NEXT: v_exp_f32_e32 v99, v103 + ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28 - ; GCN-NEXT: v_fma_f32 
v24, s4, v7, -v134 - ; GCN-NEXT: v_exp_f32_e32 v25, v6 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[4:7], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 - ; GCN-NEXT: v_exp_f32_e32 v26, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v29, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41 - ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42 - ; GCN-NEXT: v_exp_f32_e32 v31, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0 - ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134 - ; GCN-NEXT: v_exp_f32_e32 v19, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8 - ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v24, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26 - ; GCN-NEXT: v_exp_f32_e32 v27, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95] - ; GCN-NEXT: 
v_cvt_f16_f32_e32 v20, v29 - ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134 - ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 - ; GCN-NEXT: v_exp_f32_e32 v30, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v16, v4 - ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_exp_f32_e32 v18, v9 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21 - ; GCN-NEXT: v_exp_f32_e32 v21, v9 - ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: ds_read_b128 v[4:7], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30 - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_exp_f32_e32 v2, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_exp_f32_e32 v10, v1 - ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20 - ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0 - ; GCN-NEXT: v_add_f32_e32 v3, 0, v49 - ; GCN-NEXT: v_add_f32_e32 v3, v50, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v51, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v52, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v53, v3 - ; GCN-NEXT: 
v_add_f32_e32 v3, v54, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v162, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v128, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v144, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v145, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v46, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v33, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 - ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 - ; GCN-NEXT: v_add_f32_e32 
v3, v42, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v25, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v26, v3 - ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22 - ; GCN-NEXT: v_add_f32_e32 v3, v29, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v31, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v19, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v24, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v27, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v30, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v16, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v18, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v21, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: v_add_f32_e32 v0, v2, v3 - ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 - ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 - ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111] - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] - ; GCN-NEXT: ; implicit-def: $vgpr4 - ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v91, v64, v69 + ; GCN-NEXT: v_add_f32_e32 v68, v99, v68 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[88:89], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v80, v96 + ; GCN-NEXT: v_cvt_f16_f32_e32 v88, v97 + ; GCN-NEXT: v_mul_f32_e32 v96, 0x3fb8aa3b, v72 + ; GCN-NEXT: v_fma_f32 v70, s4, v71, -v128 + ; GCN-NEXT: v_add_f32_e32 v64, v80, v68 + ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v70 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: 
ds_read_b128 v[68:71], v179 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[86:87], v[90:91], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v89, v98 + ; GCN-NEXT: ds_read_b128 v[84:87], v179 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v72, v89, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v92 + ; GCN-NEXT: v_pack_b32_f16 v64, v88, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[94:95], v[90:91], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v92, v93 + ; GCN-NEXT: v_mul_f32_e32 v93, 0x3fb8aa3b, v73 + ; GCN-NEXT: v_fma_f32 v73, s4, v75, -v128 + ; GCN-NEXT: v_add_f32_e32 v72, v92, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[90:91], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v88, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v99 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v80 + ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128 + ; GCN-NEXT: v_add_f32_e32 v72, v88, v72 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[82:83], v[90:91], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v90, v81 + ; GCN-NEXT: v_mul_f32_e32 v91, 0x3fb8aa3b, v73 + ; GCN-NEXT: v_add_f32_e32 v66, v90, v72 + ; GCN-NEXT: ds_read_b128 v[72:75], v179 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[80:83], v179 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] + ; GCN-NEXT: v_fma_f32 v69, s4, v76, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v89 + ; GCN-NEXT: v_mul_f32_e32 v89, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v92 + ; GCN-NEXT: v_exp_f32_e32 v76, v96 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v66, v76, v66 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[64:65], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v92, v93 + ; GCN-NEXT: 
v_pack_b32_f16 v84, v68, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v88 + ; GCN-NEXT: v_fma_f32 v69, s4, v78, -v128 + ; GCN-NEXT: v_add_f32_e32 v66, v92, v66 + ; GCN-NEXT: v_mul_f32_e32 v93, 0x3fb8aa3b, v77 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[64:65], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v72, v67 + ; GCN-NEXT: v_mul_f32_e32 v73, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_add_f32_e32 v66, v72, v66 + ; GCN-NEXT: v_cvt_f16_f32_e32 v72, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[64:65], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v90 + ; GCN-NEXT: v_exp_f32_e32 v88, v91 + ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128 + ; GCN-NEXT: v_mul_f32_e32 v90, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_pack_b32_f16 v85, v68, v64 + ; GCN-NEXT: v_add_f32_e32 v91, v88, v66 + ; GCN-NEXT: ds_read_b128 v[64:67], v178 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[84:85], v[0:15] + ; GCN-NEXT: ds_read_b128 v[68:71], v178 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v70, v76 + ; GCN-NEXT: ds_read_b128 v[76:79], v178 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v89, v89 + ; GCN-NEXT: ds_read_b128 v[78:81], v178 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v92 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[84:85], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v81, v93 + ; GCN-NEXT: v_add_f32_e32 v71, v89, v91 + ; GCN-NEXT: v_pack_b32_f16 v70, v70, v80 + ; GCN-NEXT: v_add_f32_e32 v71, v81, v71 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[84:85], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v73, v73 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v74, v73, v71 + ; GCN-NEXT: v_cvt_f16_f32_e32 v71, v88 + ; GCN-NEXT: 
v_pack_b32_f16 v71, v72, v71 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[82:83], v[84:85], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v75, v90 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v72, v75, v74 + ; GCN-NEXT: ds_bpermute_b32 v74, v176, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[70:71], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[70:71], v[0:15] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_nop 7 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v16, v72, v74 + ; GCN-NEXT: ds_bpermute_b32 v17, v176, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v75 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v73 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v89 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[12:13] + ; GCN-NEXT: v_fmac_f32_e32 v16, v177, v112 + ; GCN-NEXT: v_pack_b32_f16 v17, v20, v18 + ; GCN-NEXT: v_pack_b32_f16 v16, v21, v19 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[70:71], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[78:79], v[70:71], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[16:17], v[0:15] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0887fdf0844b0..1c570692719ac 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -6,488 +6,486 @@ define amdgpu_kernel void @smallInterleave() #0 { ret void } ; GCN-LABEL: smallInterleave: ; GCN: ; %bb.0: - ; GCN-NEXT: ; implicit-def: $vgpr2 - ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - ; GCN-NEXT: v_readfirstlane_b32 s20, v2 - ; GCN-NEXT: ; implicit-def: $sgpr4 - ; GCN-NEXT: ; implicit-def: $vgpr3 - ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 
- ; GCN-NEXT: ; implicit-def: $vgpr50 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr34 + ; GCN-NEXT: ; implicit-def: $vgpr35 + ; GCN-NEXT: ; implicit-def: $sgpr24 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33 + ; GCN-NEXT: ; implicit-def: $sgpr20_sgpr21_sgpr22_sgpr23 + ; GCN-NEXT: ; implicit-def: $vgpr38 + ; GCN-NEXT: ; implicit-def: $vgpr39 + ; GCN-NEXT: ; implicit-def: $vgpr106 ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 - ; GCN-NEXT: ; implicit-def: $vgpr49 - ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 - ; GCN-NEXT: ; implicit-def: $vgpr51 - ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr76 - ; GCN-NEXT: ; implicit-def: $vgpr77 - ; GCN-NEXT: ; implicit-def: $vgpr78 - ; GCN-NEXT: ; implicit-def: $vgpr79 - ; GCN-NEXT: ; implicit-def: $vgpr80 - ; GCN-NEXT: ; implicit-def: $vgpr91 - ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 + ; GCN-NEXT: ; implicit-def: $vgpr40 + ; GCN-NEXT: ; implicit-def: $vgpr103 + ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67 + ; GCN-NEXT: ; implicit-def: $vgpr104 + ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71 + ; GCN-NEXT: ; implicit-def: $vgpr72_vgpr73_vgpr74_vgpr75 + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ; implicit-def: $vgpr105 + ; GCN-NEXT: ; implicit-def: $vgpr93 + ; GCN-NEXT: ; implicit-def: $vgpr94 + ; GCN-NEXT: ; implicit-def: $vgpr95 + ; GCN-NEXT: ; implicit-def: $vgpr96 + ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GCN-NEXT: v_mov_b32_e32 v92, 0 + ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: ; implicit-def: $sgpr7 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $vgpr97 + ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: ; implicit-def: $sgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr98 + ; GCN-NEXT: ; 
implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr99 + ; GCN-NEXT: ; implicit-def: $vgpr100 + ; GCN-NEXT: ; implicit-def: $vgpr101 + ; GCN-NEXT: ; implicit-def: $vgpr102 ; GCN-NEXT: ; iglp_opt mask(0x00000002) + ; GCN-NEXT: v_readfirstlane_b32 s6, v34 ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3 - ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1] - ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_lshl_add_u32 v34, s6, 4, v35 + ; GCN-NEXT: v_mad_u64_u32 v[36:37], s[8:9], s24, v34, v[32:33] + ; GCN-NEXT: buffer_load_dwordx4 v[32:35], v36, s[20:23], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: s_lshl_b32 s4, s20, 7 - ; GCN-NEXT: ; implicit-def: $vgpr5 - ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1 - ; GCN-NEXT: v_add_u32_e32 v76, s20, v76 - ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76 + ; GCN-NEXT: s_lshl_b32 s8, s6, 7 + ; GCN-NEXT: v_add_lshl_u32 v107, v38, s8, 1 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v48, v[0:3] + ; GCN-NEXT: ds_write_b128 v107, v[32:35] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[80:83], v36, s[20:23], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr0 - ; GCN-NEXT: ; implicit-def: $vgpr1 - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_add_u32_e32 v0, v0, v50 - ; GCN-NEXT: v_add_u32_e32 v1, v1, v50 - ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 
sc1 + ; GCN-NEXT: v_add_u32_e32 v32, v39, v106 + ; GCN-NEXT: buffer_load_dwordx2 v[88:89], v32, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[74:75], v1, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v32, v40, v106 + ; GCN-NEXT: buffer_load_dwordx2 v[90:91], v32, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[36:39], v49 + ; GCN-NEXT: ds_read_b128 v[32:35], v103 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[44:47], v49 offset:512 + ; GCN-NEXT: ds_read_b128 v[84:87], v103 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0 - ; GCN-NEXT: ; kill: killed $vgpr1 - ; GCN-NEXT: ; kill: killed $vgpr0 - ; GCN-NEXT: v_mul_lo_u32 v76, v76, s6 - ; GCN-NEXT: v_add_lshl_u32 v76, v77, v76, 1 - ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: ; implicit-def: $sgpr3 - ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; GCN-NEXT: ds_read_b128 v[36:39], v51 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[32:33], v[64:65], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[34:35], v[66:67], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[84:85], v[64:65], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[86:87], v[66:67], v[32:47] + ; GCN-NEXT: ds_read_b128 v[64:67], v104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], 
v[0:15] - ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63] + ; GCN-NEXT: ds_read_b128 v[64:67], v104 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v48, v[32:35] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31] + ; GCN-NEXT: ds_write_b128 v107, v[80:83] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[68:69], v[32:47] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[70:71], v[32:47] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[32:35], v49 + ; GCN-NEXT: ds_read_b128 v[64:67], v103 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; GCN-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] - ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[72:73], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[74:75], v[48:63] + ; GCN-NEXT: ds_read_b128 v[64:67], v103 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v51 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[72:73], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[74:75], v[32:47] + ; GCN-NEXT: ds_read_b128 v[64:67], v104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] - ; GCN-NEXT: ; implicit-def: $vgpr32 - ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: v_add_u32_e32 v82, v32, v50 - ; GCN-NEXT: v_add_u32_e32 v83, v33, v50 - ; GCN-NEXT: ; kill: killed $vgpr82 - ; GCN-NEXT: ; kill: killed $vgpr83 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31] - ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[76:77], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[64:67], v104 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_max_f32_e32 v81, v67, v67 - ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31] - ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2 - ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3 - ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[76:77], v[32:47] + ; GCN-NEXT: s_nop 6 + ; GCN-NEXT: v_mul_f32_e32 v64, s4, v48 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v49 + ; GCN-NEXT: v_max3_f32 v64, v64, s7, v65 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v52 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v53 + ; GCN-NEXT: 
v_mul_f32_e32 v70, s4, v54 + ; GCN-NEXT: v_mul_f32_e32 v71, s4, v55 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[78:79], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v50 + ; GCN-NEXT: v_mul_f32_e32 v67, s4, v51 + ; GCN-NEXT: v_max3_f32 v64, v64, v66, v67 + ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 + ; GCN-NEXT: v_mul_f32_e32 v72, s4, v56 + ; GCN-NEXT: v_mul_f32_e32 v73, s4, v57 + ; GCN-NEXT: v_max3_f32 v64, v64, v70, v71 + ; GCN-NEXT: v_mul_f32_e32 v74, s4, v58 + ; GCN-NEXT: v_mul_f32_e32 v75, s4, v59 + ; GCN-NEXT: v_max3_f32 v64, v64, v72, v73 + ; GCN-NEXT: v_mul_f32_e32 v76, s4, v60 + ; GCN-NEXT: v_mul_f32_e32 v77, s4, v61 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v75 + ; GCN-NEXT: v_mul_f32_e32 v78, s4, v62 + ; GCN-NEXT: v_mul_f32_e32 v79, s4, v63 + ; GCN-NEXT: v_max3_f32 v64, v64, v76, v77 + ; GCN-NEXT: v_mul_f32_e32 v80, s4, v32 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v33 + ; GCN-NEXT: v_max3_f32 v64, v64, v78, v79 + ; GCN-NEXT: v_mul_f32_e32 v82, s4, v34 + ; GCN-NEXT: v_mul_f32_e32 v83, s4, v35 + ; GCN-NEXT: v_max3_f32 v64, v64, v80, v81 + ; GCN-NEXT: v_mul_f32_e32 v84, s4, v36 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v37 + ; GCN-NEXT: v_max3_f32 v64, v64, v82, v83 + ; GCN-NEXT: v_mul_f32_e32 v86, s4, v38 + ; GCN-NEXT: v_mul_f32_e32 v87, s4, v39 + ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 + ; GCN-NEXT: v_mul_f32_e32 v103, s4, v40 + ; GCN-NEXT: v_mul_f32_e32 v104, s4, v41 + ; GCN-NEXT: v_max3_f32 v64, v64, v86, v87 + ; GCN-NEXT: v_mul_f32_e32 v107, s4, v42 + ; GCN-NEXT: v_mul_f32_e32 v108, s4, v43 + ; GCN-NEXT: v_max3_f32 v64, v64, v103, v104 + ; GCN-NEXT: v_mul_f32_e32 v109, s4, v44 + ; GCN-NEXT: v_mul_f32_e32 v110, s4, v45 + ; GCN-NEXT: v_max3_f32 v64, v64, v107, v108 + ; GCN-NEXT: v_mul_f32_e32 v111, s4, v46 + ; GCN-NEXT: v_mul_f32_e32 v112, s4, v47 + ; GCN-NEXT: v_max3_f32 v64, v64, v109, v110 + ; GCN-NEXT: v_max3_f32 v64, v64, v111, v112 + ; GCN-NEXT: ds_bpermute_b32 v65, v105, v64 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_max_f32_e32 v65, 
v65, v65 + ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 + ; GCN-NEXT: ds_bpermute_b32 v65, v105, v64 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[0:1] + ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 + ; GCN-NEXT: v_max_f32_e32 v65, v93, v93 + ; GCN-NEXT: v_max_f32_e32 v64, v65, v64 + ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v64 + ; GCN-NEXT: v_fma_f32 v49, s4, v49, -v64 + ; GCN-NEXT: v_fma_f32 v50, s4, v50, -v64 + ; GCN-NEXT: v_fma_f32 v51, s4, v51, -v64 + ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v64 + ; GCN-NEXT: v_mul_f32_e32 v51, 0x3fb8aa3b, v51 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v65, v51 + ; GCN-NEXT: v_fma_f32 v51, s4, v52, -v64 + ; GCN-NEXT: v_fma_f32 v52, s4, v53, -v64 + ; GCN-NEXT: v_fma_f32 v53, s4, v54, -v64 + ; GCN-NEXT: v_fma_f32 v54, s4, v55, -v64 + ; GCN-NEXT: v_fma_f32 v55, s4, v56, -v64 + ; GCN-NEXT: v_fma_f32 v56, s4, v57, -v64 + ; GCN-NEXT: v_fma_f32 v57, s4, v58, -v64 + ; GCN-NEXT: v_fma_f32 v58, s4, v59, -v64 + ; GCN-NEXT: v_fma_f32 v59, s4, v60, -v64 + ; GCN-NEXT: v_fma_f32 v60, s4, v61, -v64 + ; GCN-NEXT: v_fma_f32 v61, s4, v62, -v64 + ; GCN-NEXT: v_fma_f32 v62, s4, v63, -v64 + ; GCN-NEXT: v_exp_f32_e32 v63, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v33, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v66, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v34, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v67, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v35, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v68, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v36, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v69, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v37, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v70, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v38, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v71, v32 + ; 
GCN-NEXT: v_fma_f32 v32, s4, v39, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v72, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v40, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v73, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v41, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v74, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v42, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v75, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v43, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v76, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v44, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v77, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v45, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v78, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v46, -v64 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN-NEXT: v_mul_f32_e32 v49, 0x3fb8aa3b, v49 + ; GCN-NEXT: v_mul_f32_e32 v50, 0x3fb8aa3b, v50 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v79, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v47, -v64 + ; GCN-NEXT: v_exp_f32_e32 v48, v48 + ; GCN-NEXT: v_exp_f32_e32 v49, v49 + ; GCN-NEXT: v_exp_f32_e32 v50, v50 + ; GCN-NEXT: v_mul_f32_e32 v51, 0x3fb8aa3b, v51 + ; GCN-NEXT: v_mul_f32_e32 v52, 0x3fb8aa3b, v52 + ; GCN-NEXT: v_mul_f32_e32 v53, 0x3fb8aa3b, v53 + ; GCN-NEXT: v_mul_f32_e32 v54, 0x3fb8aa3b, v54 + ; GCN-NEXT: v_mul_f32_e32 v55, 0x3fb8aa3b, v55 + ; GCN-NEXT: v_mul_f32_e32 v56, 0x3fb8aa3b, v56 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v51, v51 + ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v50 + ; GCN-NEXT: v_exp_f32_e32 v52, v52 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v49 + ; GCN-NEXT: v_exp_f32_e32 v53, v53 + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v48 + ; GCN-NEXT: v_exp_f32_e32 v54, v54 + ; 
GCN-NEXT: v_exp_f32_e32 v55, v55 + ; GCN-NEXT: v_exp_f32_e32 v56, v56 + ; GCN-NEXT: v_exp_f32_e32 v57, v57 + ; GCN-NEXT: v_exp_f32_e32 v80, v32 + ; GCN-NEXT: v_add_f32_e32 v32, 0, v48 + ; GCN-NEXT: v_add_f32_e32 v32, v49, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v50, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v65, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v51, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v52, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v53, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v54, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v55, v32 + ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v58 + ; GCN-NEXT: v_add_f32_e32 v32, v56, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v57, v32 + ; GCN-NEXT: v_exp_f32_e32 v58, v58 + ; GCN-NEXT: v_mul_f32_e32 v59, 0x3fb8aa3b, v59 + ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v60 + ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v61 + ; GCN-NEXT: v_add_f32_e32 v32, v58, v32 + ; GCN-NEXT: v_exp_f32_e32 v59, v59 + ; GCN-NEXT: v_exp_f32_e32 v60, v60 + ; GCN-NEXT: v_exp_f32_e32 v61, v61 + ; GCN-NEXT: v_mul_f32_e32 v62, 0x3fb8aa3b, v62 + ; GCN-NEXT: v_exp_f32_e32 v62, v62 + ; GCN-NEXT: v_add_f32_e32 v32, v59, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v60, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v61, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v62, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v63, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v66, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v67, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v68, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v69, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v70, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v71, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v72, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v73, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v74, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v75, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v76, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v77, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v78, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v79, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v80, v32 + ; GCN-NEXT: ds_bpermute_b32 v33, v105, v32 + ; GCN-NEXT: v_add_u32_e32 v38, s6, 
v94 + ; GCN-NEXT: v_and_b32_e32 v38, 0x1fffffff, v38 + ; GCN-NEXT: v_mul_lo_u32 v38, v38, s5 + ; GCN-NEXT: v_perm_b32 v34, v90, v88, s2 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_add_f32_e32 v32, v32, v33 + ; GCN-NEXT: ds_bpermute_b32 v33, v105, v32 + ; GCN-NEXT: v_perm_b32 v35, v90, v88, s3 + ; GCN-NEXT: v_perm_b32 v36, v91, v89, s2 + ; GCN-NEXT: v_perm_b32 v37, v91, v89, s3 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v33, v33, v32, s[0:1] + ; GCN-NEXT: v_sub_f32_e32 v32, v93, v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v32, v32 + ; GCN-NEXT: v_add_lshl_u32 v64, v95, v38, 1 + ; GCN-NEXT: v_lshl_add_u32 v81, v96, 1, v64 + ; GCN-NEXT: v_lshl_add_u32 v82, v97, 1, v81 + ; GCN-NEXT: v_lshl_add_u32 v83, v98, 1, v82 + ; GCN-NEXT: v_fmac_f32_e32 v33, v92, v32 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v70 + ; GCN-NEXT: ds_write_b32 v64, v34 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v71 + ; GCN-NEXT: ds_write_b32 v81, v35 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v72 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v17 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 - ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21 - ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v22 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25 - ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v26 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v27 - ; GCN-NEXT: v_max3_f32 v64, v64, v86, v87 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v28 
- ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v31 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v80, v84 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3 - ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v4 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5 - ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7 - ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11 - ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13 - ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 - ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 - ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 + ; GCN-NEXT: ds_write_b32 v82, v36 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v68 - ; GCN-NEXT: ; implicit-def: $vgpr84 - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v70, v64, v65 + ; GCN-NEXT: ds_write_b32 v83, v37 + ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[32:33] op_sel_hi:[1,0] + ; 
GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_add_u32_e32 v32, v99, v106 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[48:49], v32, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v32, v100, v106 + ; GCN-NEXT: v_cvt_f16_f32_e32 v87, v51 + ; GCN-NEXT: buffer_load_dwordx2 v[50:51], v32, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1] - ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 - ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 - ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 - ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 - ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 - ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 - ; GCN-NEXT: v_fma_f32 v20, s4, 
v20, -v72 - ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 - ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 - ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 - ; GCN-NEXT: v_exp_f32_e32 v73, v16 - ; GCN-NEXT: v_exp_f32_e32 v74, v18 - ; GCN-NEXT: v_exp_f32_e32 v75, v19 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_exp_f32_e32 v80, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 - ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 - ; GCN-NEXT: v_exp_f32_e32 v81, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 - ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 - ; GCN-NEXT: v_exp_f32_e32 v82, v22 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 - ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 - ; GCN-NEXT: v_exp_f32_e32 v83, v23 - ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 - ; GCN-NEXT: v_exp_f32_e32 v85, v22 - ; GCN-NEXT: v_exp_f32_e32 v17, v17 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 - ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 - ; GCN-NEXT: v_exp_f32_e32 v88, v23 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 - ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 + ; GCN-NEXT: ds_read_b128 v[32:35], v101 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v16, v24 - ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 + ; GCN-NEXT: ds_read_b128 v[36:39], v101 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], 
v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 - ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 - ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 - ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 - ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v17, v18 - ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 - ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 - ; GCN-NEXT: v_exp_f32_e32 v30, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 - ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 - ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 - ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 - ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 - ; GCN-NEXT: ds_read_b128 v[26:29], v91 + ; GCN-NEXT: ds_read_b128 v[40:43], v102 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v91 offset:576 + ; GCN-NEXT: 
ds_read_b128 v[44:47], v102 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_perm_b32 v88, v50, v48, s2 + ; GCN-NEXT: v_perm_b32 v48, v50, v48, s3 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v31 - ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_exp_f32_e32 v31, v31 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86 - ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89 + ; GCN-NEXT: ds_write_b32 v64, v88 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v64 + ; GCN-NEXT: ds_write_b32 v81, v48 + ; GCN-NEXT: v_cvt_f16_f32_e32 v48, v65 + ; GCN-NEXT: v_perm_b32 v50, v51, v49, s2 + ; GCN-NEXT: v_perm_b32 v49, v51, v49, s3 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v90 + ; GCN-NEXT: ds_write_b32 v82, v50 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v65 - ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 - ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 - ; GCN-NEXT: v_exp_f32_e32 v22, v64 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 - ; GCN-NEXT: v_exp_f32_e32 v64, v65 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 - ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 - ; GCN-NEXT: v_exp_f32_e32 v23, v23 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 - ; GCN-NEXT: v_pack_b32_f16 
v1, v18, v19 - ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 - ; GCN-NEXT: v_exp_f32_e32 v25, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 - ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72 - ; GCN-NEXT: v_exp_f32_e32 v27, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 - ; GCN-NEXT: v_exp_f32_e32 v65, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 - ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 - ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 - ; GCN-NEXT: v_exp_f32_e32 v68, v2 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 + ; GCN-NEXT: ds_write_b32 v83, v49 + ; GCN-NEXT: v_pack_b32_f16 v49, v86, v48 + ; GCN-NEXT: v_pack_b32_f16 v48, v84, v85 + ; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 + ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 + ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v54 + ; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 + ; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 + ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 + ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v58 + ; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 + ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 + ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 + ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v62 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[48:49], v[16:31] + ; GCN-NEXT: v_pack_b32_f16 v51, v53, v50 + ; GCN-NEXT: v_pack_b32_f16 v50, v87, v52 + ; GCN-NEXT: v_pack_b32_f16 v53, v57, v54 + ; GCN-NEXT: v_pack_b32_f16 v52, v55, v56 + ; GCN-NEXT: v_pack_b32_f16 v55, v61, v58 + ; GCN-NEXT: v_pack_b32_f16 v54, v59, v60 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[50:51], v[16:31] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[40:41], v[52:53], v[16:31] + ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[42:43], v[54:55], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v43, v63 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[36:37], v[48:49], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v78 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[38:39], v[50:51], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v39, v66 + ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v68 + ; GCN-NEXT: v_pack_b32_f16 v41, v40, v38 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[52:53], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v79 + ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v77 + ; GCN-NEXT: v_pack_b32_f16 v40, v43, v39 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[54:55], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v48, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v70 + ; GCN-NEXT: v_cvt_f16_f32_e32 v42, v71 + ; GCN-NEXT: v_cvt_f16_f32_e32 v35, v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v75 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v76 + ; GCN-NEXT: v_pack_b32_f16 v43, v42, v35 + ; GCN-NEXT: v_pack_b32_f16 v42, v48, v37 + ; GCN-NEXT: v_pack_b32_f16 v35, v46, v33 + ; GCN-NEXT: v_pack_b32_f16 v33, v44, v32 + ; GCN-NEXT: v_pack_b32_f16 v32, v45, v36 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v84 + ; GCN-NEXT: ds_read_b128 v[36:39], v101 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v19 - ; GCN-NEXT: v_exp_f32_e32 v24, v24 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] + ; GCN-NEXT: ds_read_b128 v[36:39], v101 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] - ; 
GCN-NEXT: v_add_f32_e32 v17, v82, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27 - ; GCN-NEXT: v_exp_f32_e32 v26, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 - ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 - ; GCN-NEXT: v_exp_f32_e32 v67, v67 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 - ; GCN-NEXT: v_exp_f32_e32 v6, v6 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 - ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 - ; GCN-NEXT: v_exp_f32_e32 v7, v7 - ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v0, v85, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 - ; GCN-NEXT: v_exp_f32_e32 v10, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 - ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v64, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 - ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 - ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 - ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 - ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 - ; 
GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 - ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 - ; GCN-NEXT: v_exp_f32_e32 v8, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 - ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 - ; GCN-NEXT: v_exp_f32_e32 v9, v9 - ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 - ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 - ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 - ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 - ; GCN-NEXT: v_exp_f32_e32 v11, v11 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 - ; GCN-NEXT: v_exp_f32_e32 v12, v3 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 - ; GCN-NEXT: v_exp_f32_e32 v17, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 - ; GCN-NEXT: v_add_f32_e32 v0, v7, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 - ; GCN-NEXT: v_exp_f32_e32 v15, v3 - ; GCN-NEXT: v_exp_f32_e32 v18, v1 - ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v91 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[36:37], v[40:41], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[38:39], v[42:43], v[0:15] + ; GCN-NEXT: ds_read_b128 v[36:39], v102 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 - ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 - ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13 - ; GCN-NEXT: v_add_f32_e32 v6, v10, v6 - ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 - ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v12 - ; GCN-NEXT: v_add_f32_e32 v6, v11, v6 - ; GCN-NEXT: v_add_f32_e32 v6, v12, v6 - ; GCN-NEXT: v_add_f32_e32 v1, v15, v6 - ; GCN-NEXT: v_add_f32_e32 v11, v17, v1 - ; GCN-NEXT: v_pack_b32_f16 v1, v0, v7 - ; 
GCN-NEXT: v_pack_b32_f16 v0, v4, v10 - ; GCN-NEXT: ds_read_b128 v[4:7], v91 offset:576 + ; GCN-NEXT: v_cvt_f16_f32_e32 v47, v73 + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v74 + ; GCN-NEXT: v_pack_b32_f16 v34, v47, v34 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[34:35], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[32:33], v[16:31] + ; GCN-NEXT: s_nop 7 + ; GCN-NEXT: s_nop 2 + ; GCN-NEXT: ds_read_b128 v[16:19], v102 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[34:35], v[0:15] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mov_b32_e32 v4, 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] - ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[18:19], v[32:33], v[0:15] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 565ad295ebbb3..7f10ed31b3651 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -17,100 +17,100 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_mov_b32_e32 v3, 2.0 -; GCN-NEXT: ; iglp_opt mask(0x00000000) 
-; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 -; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:49152 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; GCN-NEXT: ds_read_b128 a[28:31], v4 
offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:24576 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(4) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 +; GCN-NEXT: ; iglp_opt mask(0x00000000) +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[64:67], v3 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, 
v1, a[64:95] +; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[32:35], v3 offset:8192 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:24608 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[128:131] +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16 +; 
GCN-NEXT: ds_write_b128 v0, a[64:67] ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; 
GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 ; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16400 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 ; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 @@ -156,62 +156,62 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v1 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; 
GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v3 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[32:35], v3 offset:8192 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; GCN-NEXT: ds_read_b128 
a[92:95], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, 
v1, a[96:127] +; GCN-NEXT: ds_read_b128 a[156:159], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[152:155], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[148:151], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[144:147], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[128:131], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[132:135], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[136:139], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[140:143], v4 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 @@ -221,38 +221,38 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[80:83] 
offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; GCN-NEXT: 
ds_write_b128 v0, a[152:155] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; GCN-NEXT: s_endpgm entry: call void @llvm.amdgcn.iglp.opt(i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir index e93595b9ef273..f079d0b8c392a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir @@ -135,17 +135,17 @@ body: | ; GCN-NEXT: [[DEF34:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_10]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in22, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[DEF35:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_11]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in23, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_9]].sub2_sub3, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_12]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7) - ; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_13]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[V_ADD_U32_e32_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF22]], implicit $exec ; GCN-NEXT: [[V_ADD_U32_e32_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF23]], implicit $exec + ; 
GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_12]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_13]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[DEF38:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_14]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in26, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[DEF39:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_15]], [[DEF47]], 0, 0, 0, 0, implicit $exec ; GCN-NEXT: [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_10]].sub0_sub1, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF2]], 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[V_ADD_U32_e32_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF45]], implicit $exec - ; GCN-NEXT: [[V_ADD_U32_e32_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF46]], implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF2]], 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DEF2:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_16]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in26, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_ADD_U32_e32_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF46]], implicit $exec ; GCN-NEXT: [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_10]].sub2_sub3, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF3]], 2064, 0, implicit $exec :: (store (s128) into %ir.in3, !alias.scope !0, addrspace 3) ; GCN-NEXT: 
[[DEF3:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF45]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in27, !alias.scope !0, addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir index a85478df10eb2..905cd0eaf52d3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir @@ -59,9 +59,10 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 0, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF1]] ; GCN-NEXT: [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 2064, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) - ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF1]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:areg_512_align2 = COPY [[DEF]] ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF33]], implicit $exec @@ -74,7 +75,6 @@ body: | ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF4]], [[DEF16]], 0, 0, implicit $exec :: (store (s128) into 
%ir.in6, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DEF16:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF6]], [[DEF7]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7) ; GCN-NEXT: dead [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[COPY1:%[0-9]+]]:areg_512_align2 = COPY [[DEF]] ; GCN-NEXT: undef [[DEF17:%[0-9]+]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF30]], implicit $exec ; GCN-NEXT: [[DEF17:%[0-9]+]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF30]], implicit $exec ; GCN-NEXT: [[DEF17:%[0-9]+]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF30]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 67ae05eb6f0b8..9d6f18dad3366 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -3231,47 +3231,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 -; 
NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 
v1, 2.0 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 @@ -3299,47 +3299,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 
v2, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] +; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 @@ -3367,8 +3367,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX90A-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 
2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -3389,8 +3389,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 @@ -3403,8 +3403,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX942-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -3425,8 +3425,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] -; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v0, a[0:15] +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v0, a[0:15] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 73586b1243376..0ca07a0a07dab 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -623,63 +623,63 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 
a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v3 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: 
ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(14) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] ; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 4 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -730,63 +730,63 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; 
EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v3, s0, v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], 
v3 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) -; EXACTCUTOFF-NEXT: 
v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] ; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 4 +; EXACTCUTOFF-NEXT: s_nop 3 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -1199,144 +1199,144 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 -; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b +; GCN-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f +; GCN-NEXT: v_mov_b32_e32 v6, 0x32a5705f ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, s0, v3 -; GCN-NEXT: v_rndne_f32_e32 v5, v4 -; GCN-NEXT: v_sub_f32_e32 v6, v4, v5 -; GCN-NEXT: v_fma_f32 v4, s0, v3, -v4 -; GCN-NEXT: v_fmac_f32_e32 v4, s0, v7 -; GCN-NEXT: v_add_f32_e32 v4, v6, v4 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GCN-NEXT: v_mul_f32_e32 v3, s0, v2 +; GCN-NEXT: v_rndne_f32_e32 v4, v3 +; GCN-NEXT: v_sub_f32_e32 v5, v3, v4 +; GCN-NEXT: v_fma_f32 v3, s0, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s0, v6 +; GCN-NEXT: v_add_f32_e32 v3, v5, v3 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_add_u32_e32 v1, s6, v0 -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:64 -; GCN-NEXT: 
ds_read_b128 a[96:99], v1 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:48 -; GCN-NEXT: v_mov_b32_e32 v9, 1.0 -; GCN-NEXT: v_ldexp_f32 v4, v4, v5 -; GCN-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; GCN-NEXT: v_mul_f32_e32 v10, s1, v3 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 -; GCN-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; GCN-NEXT: v_rndne_f32_e32 v11, v10 -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v1 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: v_mov_b32_e32 v5, 1.0 +; GCN-NEXT: v_ldexp_f32 v3, v3, v4 +; GCN-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 +; GCN-NEXT: v_mov_b32_e32 v7, 0x42b17218 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v7 ; GCN-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 -; GCN-NEXT: v_fma_f32 v10, s1, v3, -v10 -; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_fmac_f32_e32 v10, s1, v7 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v4, a[96:127] -; GCN-NEXT: v_add_f32_e32 v4, v12, v10 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: 
ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: v_ldexp_f32 v4, v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 -; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_mul_f32_e32 v10, s2, v3 -; GCN-NEXT: v_rndne_f32_e32 v11, v10 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] -; GCN-NEXT: v_fma_f32 v4, s2, v3, -v10 -; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 -; GCN-NEXT: v_fmac_f32_e32 v4, s2, v7 -; GCN-NEXT: v_add_f32_e32 v4, v12, v4 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; GCN-NEXT: v_ldexp_f32 v1, v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; GCN-NEXT: v_mul_f32_e32 v4, s3, v3 -; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_rndne_f32_e32 v10, v4 -; GCN-NEXT: s_load_dword s8, s[4:5], 0x54 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] -; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 
-; GCN-NEXT: v_fma_f32 v4, s3, v3, -v4 -; GCN-NEXT: v_fmac_f32_e32 v4, s3, v7 -; GCN-NEXT: v_add_f32_e32 v1, v1, v4 -; GCN-NEXT: v_exp_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; GCN-NEXT: ds_read_b128 a[156:159], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[152:155], v2 offset:57440 -; GCN-NEXT: v_ldexp_f32 v1, v1, v4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v4, s8, v3 -; GCN-NEXT: v_fma_f32 v3, s8, v3, -v4 -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63] -; GCN-NEXT: v_rndne_f32_e32 v1, v4 -; GCN-NEXT: v_sub_f32_e32 v10, v4, v1 -; GCN-NEXT: v_fmac_f32_e32 v3, s8, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v3, a[0:31] +; GCN-NEXT: v_mul_f32_e32 v3, s1, v2 +; GCN-NEXT: v_rndne_f32_e32 v9, v3 +; GCN-NEXT: v_sub_f32_e32 v10, v3, v9 +; GCN-NEXT: v_fma_f32 v3, s1, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s1, v6 ; GCN-NEXT: v_add_f32_e32 v3, v10, v3 ; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: ds_read_b128 a[148:151], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[144:147], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[128:131], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[132:135], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 -; GCN-NEXT: v_ldexp_f32 v1, v3, v1 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5 +; GCN-NEXT: v_cvt_i32_f32_e32 v9, v9 +; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 +; GCN-NEXT: ds_read_b128 
a[132:135], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; GCN-NEXT: v_ldexp_f32 v3, v3, v9 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v5, v3, a[128:159] +; GCN-NEXT: v_mul_f32_e32 v3, s2, v2 +; GCN-NEXT: v_rndne_f32_e32 v9, v3 +; GCN-NEXT: v_sub_f32_e32 v10, v3, v9 +; GCN-NEXT: v_fma_f32 v3, s2, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s2, v6 +; GCN-NEXT: v_add_f32_e32 v3, v10, v3 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v9, v9 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: v_ldexp_f32 v3, v3, v9 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v5, v3, a[96:127] +; GCN-NEXT: v_mul_f32_e32 v3, s3, v2 +; GCN-NEXT: v_rndne_f32_e32 v9, v3 +; GCN-NEXT: v_sub_f32_e32 v10, v3, v9 +; GCN-NEXT: v_fma_f32 v3, s3, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s3, v6 +; GCN-NEXT: v_add_f32_e32 v3, v10, v3 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v9, v9 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v1 
offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x54 +; GCN-NEXT: v_ldexp_f32 v3, v3, v9 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v5, v3, a[64:95] +; GCN-NEXT: v_mul_f32_e32 v3, s0, v2 +; GCN-NEXT: v_rndne_f32_e32 v9, v3 +; GCN-NEXT: v_fma_f32 v2, s0, v2, -v3 +; GCN-NEXT: v_sub_f32_e32 v10, v3, v9 +; GCN-NEXT: v_fmac_f32_e32 v2, s0, v6 +; GCN-NEXT: v_add_f32_e32 v2, v10, v2 +; GCN-NEXT: v_exp_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v9 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: v_ldexp_f32 v1, v2, v3 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v7 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_add_u32_e32 v0, s7, v0 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:48 -; GCN-NEXT: 
ds_write_b128 v0, a[104:107] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[96:99] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v5, v1, a[32:63] +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[0:3] ; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1347,181 +1347,181 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[56:59] 
offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[76:79] 
offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784 ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v6, 0x32a5705f ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s0, v3 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v5, v4 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v6, v4, v5 -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s0, v3, -v4 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s0, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s0, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v4, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v5, v3, v4 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s0, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s0, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v5, v3 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 
a[120:123], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:48 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v9, 1.0 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v5 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s1, v3 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 1.0 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v3, v3, v4 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x42b17218 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v7 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 -; EXACTCUTOFF-NEXT: v_fma_f32 v10, s1, v3, -v10 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v10, s1, v7 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, 
v4, a[96:127] -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v10 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s2, v3 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s2, v3, -v10 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s2, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v4 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 
offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s3, v3 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 -; EXACTCUTOFF-NEXT: s_load_dword s8, s[4:5], 0x54 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s3, v3, -v4 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s3, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v1, v4 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v2 offset:57440 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v4 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s8, v3 -; EXACTCUTOFF-NEXT: v_fma_f32 v3, s8, v3, -v4 -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63] -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v1, v4 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v4, v1 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s8, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:8288 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) 
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v3, a[0:31] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s1, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v3, v9 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s1, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s1, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v10, v3 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v9 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v3, v3, v9 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v4 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v5, v3, a[128:159] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s2, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v3, v9 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s2, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s2, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v10, v3 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v9 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592 +; EXACTCUTOFF-NEXT: 
ds_read_b128 a[96:99], v1 offset:24576 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v3, v3, v9 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v4 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v5, v3, a[96:127] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s3, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v3, v9 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s3, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s3, v6 ; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v10, v3 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v1, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v3, v1 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v9 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152 +; EXACTCUTOFF-NEXT: s_load_dword s0, s[4:5], 0x54 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v3, v3, v9 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v4 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; EXACTCUTOFF-NEXT: 
v_cmp_ngt_f32_e32 vcc, s3, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:57456 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v5, v3, a[64:95] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s0, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v3 +; EXACTCUTOFF-NEXT: v_fma_f32 v2, s0, v2, -v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v3, v9 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v2, s0, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v2, v10, v2 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v2, v2 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v3, v9 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v2, v3 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v7 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s7, v0 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:32 -; EXACTCUTOFF-NEXT: 
ds_write_b128 v0, a[100:103] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v5, v1, a[32:63] +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s7 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1532,38 +1532,38 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; 
EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:8208 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:16416 +; 
EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:16400 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:24592 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:32784 ; EXACTCUTOFF-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir index 23412aaeb2e23..1eea49843d168 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir @@ -1988,8 +1988,8 @@ body: | ; GFX908: bb.0: ; GFX908-NEXT: successors: %bb.1(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode + ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode ; GFX908-NEXT: [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX908-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX908-NEXT: [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF @@ -2274,7 +2274,7 @@ body: | ; GFX908-NEXT: S_NOP 0, implicit [[DEF91]], implicit [[DEF92]], implicit [[DEF93]], implicit [[DEF94]], implicit [[DEF95]], implicit [[DEF96]], implicit [[DEF97]], implicit [[DEF98]], implicit [[DEF99]], implicit [[DEF100]] ; GFX908-NEXT: S_NOP 0, implicit [[DEF101]], implicit [[DEF102]], implicit [[DEF103]], implicit [[DEF104]], implicit [[DEF105]], implicit [[DEF106]], implicit [[DEF107]], implicit [[DEF108]], implicit [[DEF109]], implicit [[DEF110]] ; GFX908-NEXT: S_NOP 0, implicit [[DEF111]], implicit [[DEF112]], implicit [[DEF113]], implicit [[DEF114]], implicit [[DEF115]], implicit [[DEF116]], implicit [[DEF117]], implicit [[DEF118]], implicit [[DEF119]], implicit [[DEF120]] - ; GFX908-NEXT: S_NOP 0, implicit [[DEF121]], implicit [[DEF122]], implicit [[DEF123]], implicit [[DEF124]], implicit [[DEF125]], implicit [[DEF126]], implicit [[DEF127]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] + ; GFX908-NEXT: S_NOP 0, implicit [[DEF121]], implicit [[DEF122]], implicit [[DEF123]], implicit [[DEF124]], implicit [[DEF125]], implicit [[DEF126]], implicit [[DEF127]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: reduce_spill_agpr_above_addressable_limit diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 9cc42ac448067..03b42479f4b4a 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -2391,660 +2391,656 @@ define void 
@memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB3_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x34 -; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:232 -; 
ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen 
offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v24, v1, 
s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 
offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: s_clause 0xa -; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen 
offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:94 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, 
s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 
; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen 
offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:300 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 
0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen 
offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen 
offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: 
buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: 
buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: 
buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:237 
-; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:209 
-; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: 
buffer_store_byte v23, v0, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: 
buffer_store_byte v67, v0, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: 
buffer_store_byte v4, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte 
v32, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v84, v0, 
s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v56, v0, 
s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v108, v0, 
s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:129 @@ -10118,683 +10114,679 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 ; ALIGNED-NEXT: .LBB8_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253 +; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen 
offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen 
offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: s_clause 0x3a -; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v49, 
v1, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 
0 offen offset:137 -; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:252 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:27 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, 
s[0:3], 0 offen offset:44 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 +; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen 
offset:67 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:400 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, 
s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 
0 offen offset:23 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded 
Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen 
offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen 
offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: 
buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: 
buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: 
buffer_store_byte v5, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: 
buffer_store_byte v33, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: 
buffer_store_byte v85, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: 
buffer_store_byte v57, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: 
buffer_store_byte v109, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v2, v0, 
s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:227 -; 
ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:199 -; 
ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:171 -; 
ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:143 -; 
ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:121 @@ -11173,827 +11165,823 @@ define void 
@memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:252 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:27 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:205 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v65, 
v1, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 
0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: s_clause 0xa -; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x34 -; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122 -; 
ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:120 -; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:112 -; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:108 -; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:104 -; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:96 -; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:94 -; 
ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:92 -; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:88 -; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, 
s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:604 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen 
offset:115 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:208 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:139 +; 
ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:167 +; 
ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v85, 
v1, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 
0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen 
offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0xffffff00, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen 
offset:237 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:209 
+; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:181 +; 
ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:153 +; 
ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v2, v0, 
s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:118 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:115 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) 
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:110 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:109 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:106 ; 
ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:102 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:99 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_byte v2, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:94 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:85 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen 
offset:208 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:79 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:77 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: 
buffer_store_byte v65, v0, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: 
buffer_store_byte v21, v0, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_store_byte v122, v0, 
s[0:3], 0 offen offset:120 -; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:112 -; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:108 -; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:104 -; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:96 -; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 
0 offen offset:92 -; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:88 -; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:73 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 21af2dde2c4bf..25b9f6e06eaf9 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -635,47 +635,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY908: ; %bb.0: ; 
%bb ; GREEDY908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY908-NEXT: v_mov_b32_e32 v1, 1.0 +; GREEDY908-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: v_mov_b32_e32 v5, s15 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s14 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s13 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s14 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s13 ; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s12 -; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s11 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s10 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s11 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s10 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s9 -; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s8 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s7 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s8 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s7 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s5 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s4 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s4 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s3 -; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 -; 
GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s1 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s2 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s1 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s0 -; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v1, v0, a[18:33] +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v1, v0, a[18:33] ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 @@ -719,8 +719,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; 
GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -742,14 +742,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v1, v0, a[18:33] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v1, v0, a[18:33] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 2 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 @@ -761,8 +761,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY942-NEXT: v_mov_b32_e32 v1, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -784,14 +784,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1 ; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v1, v0, a[18:33] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v1, v0, a[18:33] 
; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 0 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v0, a[0:15] ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 @@ -923,8 +923,8 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-LABEL: test_mfma_f32_4x4x1f32: ; GREEDY908: ; %bb.0: ; %bb ; GREEDY908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY908-NEXT: v_mov_b32_e32 v1, 1.0 +; GREEDY908-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 @@ -938,10 +938,10 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v3 ; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v5 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] -; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3] +; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] +; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v1, v0, a[0:3] ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] ; GREEDY908-NEXT: s_nop 3 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 @@ -954,8 +954,8 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-LABEL: test_mfma_f32_4x4x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-NEXT: 
v_mov_b32_e32 v1, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 @@ -965,10 +965,10 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] -; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3] +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v1, v0, a[0:3] ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] ; GREEDY90A-NEXT: s_nop 4 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] ; GREEDY90A-NEXT: s_endpgm @@ -976,8 +976,8 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-LABEL: test_mfma_f32_4x4x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY942-NEXT: v_mov_b32_e32 v1, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 @@ -987,11 +987,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s2 ; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s3 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v0, a[0:3] ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[2:5], v0, v1, a[0:3] +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[2:5], v1, v0, a[0:3] ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: 
v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v0, a[0:3] ; GREEDY942-NEXT: s_nop 3 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] ; GREEDY942-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 0c6339e4f5121..9b5bf35884bac 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -7,52 +7,52 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp ; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112 -; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16 +; CHECK-NEXT: global_load_dwordx4 v[30:33], v2, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[26:29], v2, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[22:25], v2, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[18:21], v2, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[14:17], v2, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[10:13], v2, s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 v[6:9], v2, s[0:1] offset:16 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] +; CHECK-NEXT: global_load_dwordx4 v[2:5], v2, s[0:1] ; 
CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v2 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v3 -; CHECK-NEXT: v_accvgpr_write_b32 a4, v4 -; CHECK-NEXT: v_accvgpr_write_b32 a5, v5 -; CHECK-NEXT: v_accvgpr_write_b32 a6, v6 -; CHECK-NEXT: v_accvgpr_write_b32 a7, v7 -; CHECK-NEXT: v_accvgpr_write_b32 a8, v8 -; CHECK-NEXT: v_accvgpr_write_b32 a9, v9 -; CHECK-NEXT: v_accvgpr_write_b32 a10, v10 -; CHECK-NEXT: v_accvgpr_write_b32 a11, v11 -; CHECK-NEXT: v_accvgpr_write_b32 a12, v12 -; CHECK-NEXT: v_accvgpr_write_b32 a13, v13 -; CHECK-NEXT: v_accvgpr_write_b32 a14, v14 -; CHECK-NEXT: v_accvgpr_write_b32 a15, v15 -; CHECK-NEXT: v_accvgpr_write_b32 a16, v16 -; CHECK-NEXT: v_accvgpr_write_b32 a17, v17 -; CHECK-NEXT: v_accvgpr_write_b32 a18, v18 -; CHECK-NEXT: v_accvgpr_write_b32 a19, v19 -; CHECK-NEXT: v_accvgpr_write_b32 a20, v20 -; CHECK-NEXT: v_accvgpr_write_b32 a21, v21 -; CHECK-NEXT: v_accvgpr_write_b32 a22, v22 -; CHECK-NEXT: v_accvgpr_write_b32 a23, v23 -; CHECK-NEXT: v_accvgpr_write_b32 a24, v24 -; CHECK-NEXT: v_accvgpr_write_b32 a25, v25 -; CHECK-NEXT: v_accvgpr_write_b32 a26, v26 -; CHECK-NEXT: v_accvgpr_write_b32 a27, v27 -; CHECK-NEXT: v_accvgpr_write_b32 a28, v28 -; CHECK-NEXT: v_accvgpr_write_b32 a29, v29 -; CHECK-NEXT: v_accvgpr_write_b32 a30, v30 -; CHECK-NEXT: v_accvgpr_write_b32 a31, v31 -; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 -; CHECK-NEXT: v_mov_b32_e32 v1, 2.0 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a3, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a4, v6 +; CHECK-NEXT: v_accvgpr_write_b32 a5, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a6, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a7, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a8, v10 +; CHECK-NEXT: v_accvgpr_write_b32 a9, v11 +; CHECK-NEXT: v_accvgpr_write_b32 a10, v12 +; CHECK-NEXT: v_accvgpr_write_b32 a11, 
v13 +; CHECK-NEXT: v_accvgpr_write_b32 a12, v14 +; CHECK-NEXT: v_accvgpr_write_b32 a13, v15 +; CHECK-NEXT: v_accvgpr_write_b32 a14, v16 +; CHECK-NEXT: v_accvgpr_write_b32 a15, v17 +; CHECK-NEXT: v_accvgpr_write_b32 a16, v18 +; CHECK-NEXT: v_accvgpr_write_b32 a17, v19 +; CHECK-NEXT: v_accvgpr_write_b32 a18, v20 +; CHECK-NEXT: v_accvgpr_write_b32 a19, v21 +; CHECK-NEXT: v_accvgpr_write_b32 a20, v22 +; CHECK-NEXT: v_accvgpr_write_b32 a21, v23 +; CHECK-NEXT: v_accvgpr_write_b32 a22, v24 +; CHECK-NEXT: v_accvgpr_write_b32 a23, v25 +; CHECK-NEXT: v_accvgpr_write_b32 a24, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a25, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a26, v28 +; CHECK-NEXT: v_accvgpr_write_b32 a27, v29 +; CHECK-NEXT: v_accvgpr_write_b32 a28, v30 +; CHECK-NEXT: v_accvgpr_write_b32 a29, v31 +; CHECK-NEXT: v_accvgpr_write_b32 a30, v32 +; CHECK-NEXT: v_accvgpr_write_b32 a31, v33 ; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] @@ -147,7 +147,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle( ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; CHECK-NEXT: v_mov_b32_e32 v1, 2.0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1.0 +; CHECK-NEXT: v_mov_b32_e32 v2, 2.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 @@ -157,16 +158,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle( ; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 ; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 ; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] -; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; CHECK-NEXT: v_mfma_f32_32x32x1f32 
a[0:31], v0, v1, a[0:31] -; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: s_nop 2 ; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir index 7295506213c4b..7eed89967adc0 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir @@ -161,16 +161,16 @@ body: | ; CHECK-LABEL: name: sched_barrier_mask_4 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: 
[[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: SCHED_BARRIER 4 @@ -209,19 +209,19 @@ body: | ; CHECK-LABEL: name: sched_barrier_mask_8 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit 
$exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias 
!0, addrspace 1) ; CHECK-NEXT: S_NOP 0 - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: SCHED_BARRIER 8 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -451,21 +451,21 @@ body: | ; CHECK-LABEL: name: sched_barrier_masks_8_12 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: 
[[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: SCHED_BARRIER 12 - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: SCHED_BARRIER 8 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, 
addrspace 1) - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir index d6774bb39dca7..d226f36928391 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir @@ -20,12 +20,12 @@ body: | ; GREEDY-LABEL: name: sched_group_barrier_2_VMEM_10_ALU_5_MFMA_2_VMEM_WRITE ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; GREEDY-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; GREEDY-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; GREEDY-NEXT: S_NOP 0 ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, 
!alias.scope !0, addrspace 1) - ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec @@ -44,12 +44,12 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_2_VMEM_10_ALU_5_MFMA_2_VMEM_WRITE ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; EXACT-NEXT: S_NOP 0 ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec @@ -132,9 +132,9 @@ body: | ; EXACT-LABEL: name: 
sched_group_barrier_MFMA_VALU_and_SALU_alternating ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec @@ -212,8 +212,8 @@ body: | ; GREEDY-LABEL: name: sched_group_barrier_2_separate_pipes ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; GREEDY-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = 
V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec @@ -223,8 +223,8 @@ body: | ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; GREEDY-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; GREEDY-NEXT: SCHED_GROUP_BARRIER 16, 2, 0 @@ -238,8 +238,8 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_2_separate_pipes ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, 
implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec @@ -300,12 +300,12 @@ body: | ; GREEDY-LABEL: name: sched_group_barrier_3_separate_pipes ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; GREEDY-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; GREEDY-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; GREEDY-NEXT: S_NOP 0 ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: 
[[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec @@ -330,8 +330,8 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_3_separate_pipes ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir index 4f844762b24e3..90a79f8ecbe74 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir @@ -20,18 +20,18 @@ body: | ; CHECK-LABEL: name: no_sched_group_barrier ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, 
implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 
= nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -40,18 +40,18 @@ body: | ; EXACT-LABEL: name: no_sched_group_barrier ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], 
[[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; EXACT-NEXT: S_NOP 0 ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit 
$mode, implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -84,8 +84,8 @@ body: | ; CHECK-LABEL: name: sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec @@ -110,8 +110,8 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXACT-NEXT: 
[[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec @@ -172,18 +172,18 @@ body: | ; CHECK-LABEL: name: sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; CHECK-NEXT: 
[[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -196,18 +196,18 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = 
IMPLICIT_DEF + ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - ; EXACT-NEXT: S_NOP 0 ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; EXACT-NEXT: S_NOP 0 + ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, 
implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; EXACT-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; EXACT-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -252,12 +252,12 @@ body: | ; CHECK-LABEL: name: sched_group_barrier_MFMA_VALU_and_SALU_alternating ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], 
[[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec @@ -284,9 +284,9 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_MFMA_VALU_and_SALU_alternating ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec