diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fce8f36d45969..35886eb04c711 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -68,6 +68,14 @@ static cl::opt<bool> GCNTrackers( cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false)); +static cl::opt<bool> ExaminePendingQueue( + "amdgpu-examine-pending-queue", cl::Hidden, + cl::desc( + "Examine instructions in the pending queue when " + "scheduling. This makes instructions visible to heuristics that cannot " + "immediately be issued due to hardware resource constraints."), + cl::init(true)); + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) @@ -319,17 +327,45 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, } } +static bool shouldCheckPending(SchedBoundary &Zone, + const TargetSchedModel *SchedModel) { + const unsigned ReadyListLimit = 256; + bool HasBufferedModel = + SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize(); + return ExaminePendingQueue && + Zone.Available.size() + Zone.Pending.size() <= ReadyListLimit && + HasBufferedModel; +} + +static SUnit *pickOnlyChoice(SchedBoundary &Zone, + const TargetSchedModel *SchedModel) { + if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty()) + return Zone.pickOnlyChoice(); + return nullptr; +} + +#ifndef NDEBUG +void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current, + const SchedCandidate &Preferred) { + LLVM_DEBUG(dbgs() << "Prefer:\t\t"; DAG->dumpNode(*Preferred.SU)); + if (Current.SU) + LLVM_DEBUG(dbgs() << "Not:\t"; DAG->dumpNode(*Current.SU)); + LLVM_DEBUG(dbgs() << "Reason:\t\t"; traceCandidate(Preferred)); +} +#endif + // This function is mostly cut and pasted from // GenericScheduler::pickNodeFromQueue() void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, const
CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand, + SchedCandidate &Cand, bool &IsPending, bool IsBottomUp) { const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); unsigned SGPRPressure = 0; unsigned VGPRPressure = 0; + IsPending = false; if (DAG->isTrackingPressure()) { if (!GCNTrackers) { SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; @@ -342,8 +378,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, VGPRPressure = T->getPressure().getArchVGPRNum(); } } - ReadyQueue &Q = Zone.Available; - for (SUnit *SU : Q) { + LLVM_DEBUG(dbgs() << "Available Q:\n"); + ReadyQueue &AQ = Zone.Available; + for (SUnit *SU : AQ) { SchedCandidate TryCand(ZonePolicy); initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, @@ -355,27 +392,59 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, // Initialize resource delta if needed in case future heuristics query it. if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(Zone.DAG, SchedModel); + LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); Cand.setBest(TryCand); - LLVM_DEBUG(traceCandidate(Cand)); } +#ifndef NDEBUG + else + printCandidateDecision(TryCand, Cand); +#endif + } + + if (!shouldCheckPending(Zone, SchedModel)) + return; + + LLVM_DEBUG(dbgs() << "Pending Q:\n"); + ReadyQueue &PQ = Zone.Pending; + for (SUnit *SU : PQ) { + + SchedCandidate TryCand(ZonePolicy); + initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, + VGPRPressure, IsBottomUp); + // Pass SchedBoundary only when comparing nodes from the same boundary. + SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; + tryPendingCandidate(Cand, TryCand, ZoneArg); + if (TryCand.Reason != NoCand) { + // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta()) + TryCand.initResourceDelta(Zone.DAG, SchedModel); + LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); + IsPending = true; + Cand.setBest(TryCand); + } +#ifndef NDEBUG + else + printCandidateDecision(TryCand, Cand); +#endif } } // This function is mostly cut and pasted from // GenericScheduler::pickNodeBidirectional() -SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { +SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode, + bool &PickedPending) { // Schedule as far as possible in the direction of no choice. This is most // efficient, but also provides the best heuristics for CriticalPSets. - if (SUnit *SU = Bot.pickOnlyChoice()) { + if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) { IsTopNode = false; return SU; } - if (SUnit *SU = Top.pickOnlyChoice()) { + if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) { IsTopNode = true; return SU; } - // Set the bottom-up policy based on the state of the current bottom zone and - // the instructions outside the zone, including the top zone. + // Set the bottom-up policy based on the state of the current bottom zone + // and the instructions outside the zone, including the top zone. CandPolicy BotPolicy; setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); // Set the top-down policy based on the state of the current top zone and @@ -383,12 +452,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { CandPolicy TopPolicy; setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); + bool BotPending = false; // See if BotCand is still valid (because we previously scheduled from Top). 
LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); if (!BotCand.isValid() || BotCand.SU->isScheduled || BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand, + BotPending, /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find the first candidate"); } else { @@ -398,6 +469,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { SchedCandidate TCand; TCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, + BotPending, /*IsBottomUp=*/true); assert(TCand.SU == BotCand.SU && "Last pick result should correspond to re-picking right now"); @@ -405,12 +477,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { #endif } + bool TopPending = false; // Check if the top Q has a better candidate. LLVM_DEBUG(dbgs() << "Picking from Top:\n"); if (!TopCand.isValid() || TopCand.SU->isScheduled || TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand, + TopPending, /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find the first candidate"); } else { @@ -420,6 +494,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { SchedCandidate TCand; TCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, + TopPending, /*IsBottomUp=*/false); assert(TCand.SU == TopCand.SU && "Last pick result should correspond to re-picking right now"); @@ -430,12 +505,21 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { // Pick best from BotCand and TopCand. LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); dbgs() << "Bot Cand: "; traceCandidate(BotCand);); - SchedCandidate Cand = BotCand; - TopCand.Reason = NoCand; - tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { - Cand.setBest(TopCand); + SchedCandidate Cand = BotPending ? 
TopCand : BotCand; + SchedCandidate TryCand = BotPending ? BotCand : TopCand; + PickedPending = BotPending && TopPending; + + TryCand.Reason = NoCand; + if (BotPending || TopPending) { + PickedPending |= tryPendingCandidate(Cand, TryCand, nullptr); + } else { + tryCandidate(Cand, TryCand, nullptr); } + + if (TryCand.Reason != NoCand) { + Cand.setBest(TryCand); + } + LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); IsTopNode = Cand.AtTop; @@ -450,35 +534,46 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); return nullptr; } + bool PickedPending; SUnit *SU; do { + PickedPending = false; if (RegionPolicy.OnlyTopDown) { - SU = Top.pickOnlyChoice(); + SU = pickOnlyChoice(Top, SchedModel); if (!SU) { CandPolicy NoPolicy; TopCand.reset(NoPolicy); pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand, + PickedPending, /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find a candidate"); SU = TopCand.SU; } IsTopNode = true; } else if (RegionPolicy.OnlyBottomUp) { - SU = Bot.pickOnlyChoice(); + SU = pickOnlyChoice(Bot, SchedModel); if (!SU) { CandPolicy NoPolicy; BotCand.reset(NoPolicy); pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand, + PickedPending, /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find a candidate"); SU = BotCand.SU; } IsTopNode = false; } else { - SU = pickNodeBidirectional(IsTopNode); + SU = pickNodeBidirectional(IsTopNode, PickedPending); } } while (SU->isScheduled); + if (PickedPending) { + unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle; + SchedBoundary &Zone = IsTopNode ?
Top : Bot; + Zone.bumpCycle(ReadyCycle); + Zone.releasePending(); + } + if (SU->isTopReady()) Top.removeReady(SU); if (SU->isBottomReady()) @@ -524,6 +619,47 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const { return *std::next(CurrentStage); } +bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) const { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Bias PhysReg Defs and copies to their uses and defined respectively. + if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) + return TryCand.Reason != NoCand; + + // Avoid exceeding the target's limit. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, + RegExcess, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + // Avoid increasing the max critical pressure in the scheduled region. 
+ if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, RegCritical, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + bool SameBoundary = Zone != nullptr; + if (SameBoundary) { + TryCand.initResourceDelta(DAG, SchedModel); + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return TryCand.Reason != NoCand; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, TryCand, Cand, + ResourceDemand)) + return TryCand.Reason != NoCand; + } + + return false; +} + GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C, bool IsLegacyScheduler) : GCNSchedStrategy(C) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 94cd795bbc8f6..c78835c8d5a77 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -44,17 +44,34 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID); /// heuristics to determine excess/critical pressure sets. class GCNSchedStrategy : public GenericScheduler { protected: - SUnit *pickNodeBidirectional(bool &IsTopNode); + SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending); void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand, bool IsBottomUp); + SchedCandidate &Cand, bool &IsPending, + bool IsBottomUp); void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp); + /// Evaluates instructions in the pending queue using a subset of scheduling + /// heuristics. 
+ /// + /// Instructions that cannot be issued due to hardware constraints are placed + /// in the pending queue rather than the available queue, making them normally + /// invisible to scheduling heuristics. However, in certain scenarios (such as + /// avoiding register spilling), it may be beneficial to consider scheduling + /// these not-yet-ready instructions. + bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const; + +#ifndef NDEBUG + void printCandidateDecision(const SchedCandidate &Current, + const SchedCandidate &Preferred); +#endif + std::vector<unsigned> Pressure; std::vector<unsigned> MaxPressure; diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 668219875db72..86505107587f1 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -947,6 +947,7 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1020 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1028 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2044 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2040 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2036 @@ -1201,7 +1202,6 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1040 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1036 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1032 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1028 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1024 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1016 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1012 @@ -1466,6 +1466,7
@@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1020 +; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1028 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2044 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2040 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2036 @@ -1720,7 +1721,6 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1040 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1036 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1032 -; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1028 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1024 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1016 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:1012 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index aad6e031aa9ed..ac91dadc07995 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -6,1145 +6,1149 @@ define amdgpu_kernel void @largeInterleave() #0 { ret void } ; GCN-LABEL: largeInterleave: ; GCN: ; %bb.0: + ; GCN-NEXT: ; implicit-def: $sgpr17 + ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: ; implicit-def: $vgpr66 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $vgpr0 - ; GCN-NEXT: ; implicit-def: $vgpr2 - ; GCN-NEXT: ; implicit-def: $vgpr1 - ; GCN-NEXT: ; implicit-def: $vgpr8 + ; GCN-NEXT: ; implicit-def: $vgpr65 + ; GCN-NEXT: ; implicit-def: $vgpr72 + ; GCN-NEXT: ; implicit-def: $vgpr238 + ; 
GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 + ; GCN-NEXT: ; implicit-def: $vgpr80 + ; GCN-NEXT: ; implicit-def: $vgpr81 + ; GCN-NEXT: ; implicit-def: $vgpr82 + ; GCN-NEXT: ; implicit-def: $vgpr83 + ; GCN-NEXT: ; implicit-def: $vgpr84 + ; GCN-NEXT: ; implicit-def: $vgpr85 + ; GCN-NEXT: ; implicit-def: $vgpr86 + ; GCN-NEXT: ; implicit-def: $vgpr87 + ; GCN-NEXT: ; implicit-def: $vgpr88 + ; GCN-NEXT: ; implicit-def: $vgpr89 + ; GCN-NEXT: ; implicit-def: $vgpr90 + ; GCN-NEXT: ; implicit-def: $vgpr91 + ; GCN-NEXT: ; implicit-def: $vgpr92 + ; GCN-NEXT: ; implicit-def: $vgpr93 ; GCN-NEXT: ; implicit-def: $vgpr94 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $vgpr106 - ; GCN-NEXT: ; implicit-def: $vgpr132 - ; GCN-NEXT: ; implicit-def: $vgpr133 - ; GCN-NEXT: ; implicit-def: $vgpr139 - ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 - ; GCN-NEXT: ; iglp_opt mask(0x00000002) - ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr73 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - ; GCN-NEXT: v_readfirstlane_b32 s7, v0 + ; GCN-NEXT: v_add_u32_e32 v232, v73, v80 + ; GCN-NEXT: v_readfirstlane_b32 s17, v64 + ; GCN-NEXT: ; implicit-def: $sgpr15 ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2 - ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 - ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1 - ; GCN-NEXT: v_add_u32_e32 v93, s0, v92 - ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v234, v73, v81 + ; GCN-NEXT: v_add_u32_e32 v235, v73, v82 + ; GCN-NEXT: v_lshl_add_u32 v64, s17, 4, v66 + ; GCN-NEXT: v_mul_lo_u32 v64, v64, s6 + ; GCN-NEXT: v_add_lshl_u32 v222, v64, v65, 1 + ; GCN-NEXT: v_add_u32_e32 v95, s15, 
v222 + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v222, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v95, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: s_lshl_b32 s0, s7, 7 - ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1 - ; GCN-NEXT: v_add_u32_e32 v8, 64, v93 - ; GCN-NEXT: ; kill: killed $vgpr8 + ; GCN-NEXT: s_lshl_b32 s6, s17, 7 + ; GCN-NEXT: v_add_lshl_u32 v240, v72, s6, 1 + ; GCN-NEXT: v_add_u32_e32 v72, 64, v95 + ; GCN-NEXT: v_add_u32_e32 v206, 0x80, v95 + ; GCN-NEXT: v_add_u32_e32 v226, 0xc0, v95 + ; GCN-NEXT: v_add_u32_e32 v241, v73, v89 + ; GCN-NEXT: v_add_u32_e32 v242, v73, v83 + ; GCN-NEXT: v_add_u32_e32 v243, v73, v84 + ; GCN-NEXT: v_add_u32_e32 v244, v73, v85 + ; GCN-NEXT: v_add_u32_e32 v188, v73, v94 + ; GCN-NEXT: v_add_u32_e32 v189, v73, v86 + ; GCN-NEXT: v_add_u32_e32 v190, v73, v87 + ; GCN-NEXT: v_add_u32_e32 v191, v73, v88 + ; GCN-NEXT: v_add_u32_e32 v184, v73, v93 + ; GCN-NEXT: v_add_u32_e32 v185, v73, v90 + ; GCN-NEXT: v_add_u32_e32 v186, v73, v91 + ; GCN-NEXT: v_add_u32_e32 v187, v73, v92 + ; GCN-NEXT: ; implicit-def: $vgpr74 + ; GCN-NEXT: ; implicit-def: $sgpr16 + ; GCN-NEXT: ; implicit-def: $vgpr75 + ; GCN-NEXT: ; implicit-def: $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr77 + ; GCN-NEXT: ; implicit-def: $vgpr78 + ; GCN-NEXT: ; implicit-def: $vgpr79 + ; GCN-NEXT: v_add_u32_e32 v230, v73, v79 + ; GCN-NEXT: v_add_u32_e32 v74, s17, v74 + ; GCN-NEXT: v_and_b32_e32 v74, 0x1fffffff, v74 + ; GCN-NEXT: v_mul_lo_u32 v74, v74, s16 + ; GCN-NEXT: v_add_lshl_u32 v183, v75, v74, 1 + ; GCN-NEXT: v_lshl_add_u32 v180, v76, 1, v183 + ; GCN-NEXT: v_lshl_add_u32 v181, v77, 1, v180 + ; GCN-NEXT: v_lshl_add_u32 v182, v78, 1, v181 + ; GCN-NEXT: ; implicit-def: $vgpr239 + ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159 + ; GCN-NEXT: ; implicit-def: 
$vgpr148_vgpr149_vgpr150_vgpr151 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; kill: killed $vgpr92 - ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147 + ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 + ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $sgpr7 + ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135 + ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131 + ; GCN-NEXT: ; implicit-def: $sgpr14 + ; GCN-NEXT: ; implicit-def: $vgpr176 + ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13 + ; GCN-NEXT: ; implicit-def: $vgpr192 + ; GCN-NEXT: v_max_f32_e32 v193, v192, v192 + ; GCN-NEXT: ; implicit-def: $vgpr179 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GCN-NEXT: ; implicit-def: $vgpr178 + ; GCN-NEXT: ; implicit-def: $vgpr177 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[0:3] + ; GCN-NEXT: ds_write_b128 v240, v[64:67] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024 + ; GCN-NEXT: ds_write_b128 v240, v[68:71] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v222, s[8:11], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; 
GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[64:67], v238 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127] + ; GCN-NEXT: ds_read_b128 v[64:67], v238 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111] + ; GCN-NEXT: ds_read_b128 v[64:67], v238 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[168:171], v238 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: ds_read_b128 v[172:175], v239 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[194:197], v239 offset:512 ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[198:201], v239 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536 + ; GCN-NEXT: ds_read_b128 v[202:205], v239 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93 + ; GCN-NEXT: ds_write_b128 v240, v[160:163] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v240, v[164:167] offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v222, s[8:11], 0 offen offset:128 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v206, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ; kill: killed $vgpr72 - ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[206:209], v238 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[164:167], v238 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[168:171], v238 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[210:213], v238 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[202:203], v[156:157], v[64:79] + ; GCN-NEXT: ds_read_b128 v[214:217], v239 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], 
v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: ds_read_b128 v[218:221], v239 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[198:199], v[156:157], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[204:205], v[158:159], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[200:201], v[158:159], v[80:95] + ; GCN-NEXT: ds_read_b128 v[198:201], v239 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: ds_read_b128 v[202:205], v239 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v240, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 - ; GCN-NEXT: ; implicit-def: $vgpr64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 - ; GCN-NEXT: ; implicit-def: $vgpr73 - ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 + ; GCN-NEXT: ds_write_b128 v240, v[160:163] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) 
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[222:225], v222, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[210:211], v[148:149], v[64:79] + ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v226, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr72 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 - ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[230:231], v230, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[232:233], v232, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr74 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 - ; GCN-NEXT: ; implicit-def: $vgpr75 - ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_perm_b32 v236, v232, v230, s5 + ; GCN-NEXT: buffer_load_dwordx2 v[210:211], v234, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 - ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[148:149], v[80:95] + ; GCN-NEXT: buffer_load_dwordx2 v[234:235], v235, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: ds_read_b128 v[152:155], v238 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; kill: killed $vgpr76 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ; implicit-def: $sgpr8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 + ; GCN-NEXT: v_perm_b32 v237, v234, v210, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[150:151], v[80:95] + ; GCN-NEXT: ds_read_b128 v[168:171], v238 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024 + ; GCN-NEXT: ds_read_b128 v[160:163], v238 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[212:213], v[150:151], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[156:157], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[198:199], v[144:145], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[202:203], v[144:145], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[196:197], v[158:159], v[96:111] + ; GCN-NEXT: ds_read_b128 v[194:197], v238 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[80:95], v[200:201], v[146:147], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[204:205], v[146:147], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[160:161], v[140:141], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[194:195], v[140:141], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[206:207], v[148:149], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[162:163], v[142:143], v[80:95] + ; GCN-NEXT: ds_read_b128 v[160:163], v239 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[196:197], v[142:143], v[64:79] + ; GCN-NEXT: ds_read_b128 v[194:197], v239 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: ds_read_b128 v[172:175], v239 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: ds_read_b128 v[198:201], v239 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: ds_write_b128 v240, v[222:225] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v240, v[226:229] offset:1024 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[164:165], v[148:149], v[96:111] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[64:67], v94 + ; GCN-NEXT: ds_read_b128 v[156:159], v238 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512 + ; GCN-NEXT: ds_read_b128 v[202:205], v238 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71 - ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: v_perm_b32 v226, v232, v230, s7 + ; GCN-NEXT: v_perm_b32 v227, v234, v210, s7 + ; GCN-NEXT: v_perm_b32 v228, v233, v231, s5 + ; GCN-NEXT: v_perm_b32 v230, v233, v231, s7 + ; GCN-NEXT: v_perm_b32 v229, v235, v211, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[208:209], v[150:151], v[112:127] + ; GCN-NEXT: v_perm_b32 v231, v235, v211, s7 + ; GCN-NEXT: ds_read_b128 v[210:213], v238 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] - ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536 + ; GCN-NEXT: ds_read_b128 v[222:225], v238 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[94:97], v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[166:167], v[150:151], v[96:111] + ; GCN-NEXT: ds_read_b128 v[164:167], v239 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63] - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67 
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47] - ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[214:215], v[144:145], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[198:199], v[136:137], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[216:217], v[146:147], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[200:201], v[138:139], v[64:79] + ; GCN-NEXT: ds_read_b128 v[198:201], v239 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[146:147], v[96:111] + ; GCN-NEXT: ds_read_b128 v[218:221], v239 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[140:141], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[222:223], v[132:133], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[168:169], v[140:141], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[142:143], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[224:225], v[134:135], v[64:79] + ; GCN-NEXT: ds_read_b128 v[222:225], v239 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63] - ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5 - ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8 - ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5 - ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31] - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63] - ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8 - ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8 - ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5 - ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47] - ; GCN-NEXT: s_nop 5 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48 - ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49 - ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50 - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31] - ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53 - ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55 - ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56 - ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15] - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31] - ; GCN-NEXT: s_nop 6 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34 - ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 - ; GCN-NEXT: v_mul_f32_e32 
v85, s4, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39 - ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31] - ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16 - ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17 - ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 - ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20 - ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25 - ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26 - ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27 - ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31 - ; 
GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr68 - ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 - ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 - ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 - ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 - ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v135, v[94:95] - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 - ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 + ; GCN-NEXT: ds_write_b64 v183, v[236:237] ; 
GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[98:99] + ; GCN-NEXT: ds_write_b64 v180, v[226:227] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[102:103] + ; GCN-NEXT: ds_write_b64 v181, v[228:229] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[96:97] - ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 - ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] - ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 - ; GCN-NEXT: ; implicit-def: $vgpr65 - ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v134, v66, v64 - ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: ds_write_b64 v182, v[230:231] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[226:227], v241, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v64 - ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[228:229], v242, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 - ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[230:231], v243, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[232:233], v244, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 - ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 - ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 - ; GCN-NEXT: 
v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 - ; GCN-NEXT: v_exp_f32_e32 v163, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 - ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 - ; GCN-NEXT: v_exp_f32_e32 v164, v57 - ; GCN-NEXT: v_exp_f32_e32 v49, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 - ; GCN-NEXT: v_exp_f32_e32 v50, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66 - ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134 - ; GCN-NEXT: v_exp_f32_e32 v51, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134 - ; GCN-NEXT: v_exp_f32_e32 v52, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[170:171], v[142:143], v[96:111] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134 - ; GCN-NEXT: v_exp_f32_e32 v53, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v54, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 - ; GCN-NEXT: v_exp_f32_e32 v55, v48 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 - ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 - ; GCN-NEXT: v_exp_f32_e32 v56, v48 - ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 - ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 - ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 - ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 - ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v48, v48 - ; GCN-NEXT: v_pack_b32_f16 v161, 
v68, v58 - ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 - ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 - ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 - ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 - ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 - ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 - ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 - ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] - ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: ; implicit-def: 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 - ; GCN-NEXT: v_exp_f32_e32 v58, v58 - ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] - ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 - ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 - ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 - ; GCN-NEXT: v_exp_f32_e32 v59, v57 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] - ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 - ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 - ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, 
v63, -v134 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 - ; GCN-NEXT: ; implicit-def: $vgpr57 - ; GCN-NEXT: ds_read_b128 v[60:63], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v160, v149 - ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] - ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 - ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 - ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 - ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 - ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 - ; GCN-NEXT: v_exp_f32_e32 v162, v146 - ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 - ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 - ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] - ; GCN-NEXT: v_exp_f32_e32 v151, v33 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 - ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 - ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] - ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 - ; GCN-NEXT: v_exp_f32_e32 v153, v33 - ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 - ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 - ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], 
v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 - ; GCN-NEXT: v_exp_f32_e32 v165, v60 - ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 - ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v161, v61 - ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 - ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 - ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 - ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 + ; GCN-NEXT: v_perm_b32 v170, v228, v226, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[136:137], v[112:127] + ; GCN-NEXT: v_perm_b32 v168, v228, v226, s7 + ; GCN-NEXT: v_perm_b32 v171, v232, v230, s5 + ; GCN-NEXT: v_perm_b32 v169, v232, v230, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[136:137], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[138:139], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[136:137], v[80:95] + ; GCN-NEXT: v_perm_b32 v172, v229, v227, s5 + ; GCN-NEXT: v_perm_b32 v173, v233, v231, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[196:197], v[138:139], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[138:139], v[80:95] + ; GCN-NEXT: v_perm_b32 v174, v229, v227, s7 + ; GCN-NEXT: v_perm_b32 v175, v233, v231, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[202:203], v[132:133], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[210:211], v[132:133], v[80:95] + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[96:111], v[204:205], v[134:135], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[164:165], v[128:129], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[212:213], v[134:135], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[198:199], v[128:129], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[166:167], v[130:131], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[222:223], v[128:129], v[64:79] + ; GCN-NEXT: s_nop 7 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v116 + ; GCN-NEXT: v_mul_f32_e32 v133, s4, v117 + ; GCN-NEXT: v_mul_f32_e32 v134, s4, v118 + ; GCN-NEXT: v_mul_f32_e32 v135, s4, v119 + ; GCN-NEXT: v_mul_f32_e32 v136, s4, v120 + ; GCN-NEXT: v_mul_f32_e32 v137, s4, v121 + ; GCN-NEXT: v_mul_f32_e32 v138, s4, v122 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[218:219], v[128:129], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v128, s4, v112 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v113 + ; GCN-NEXT: v_max3_f32 v128, v128, s14, v129 + ; GCN-NEXT: v_mul_f32_e32 v139, s4, v123 + ; GCN-NEXT: v_mul_f32_e32 v140, s4, v124 + ; GCN-NEXT: v_mul_f32_e32 v141, s4, v125 + ; GCN-NEXT: v_mul_f32_e32 v142, s4, v126 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[200:201], v[130:131], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v143, s4, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[224:225], v[130:131], v[64:79] + ; GCN-NEXT: s_nop 7 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_mul_f32_e32 v151, s4, v96 + ; GCN-NEXT: v_mul_f32_e32 v198, s4, v97 + ; GCN-NEXT: v_mul_f32_e32 v199, s4, v98 + ; GCN-NEXT: v_mul_f32_e32 v200, s4, v99 + ; GCN-NEXT: v_mul_f32_e32 v201, s4, v100 + ; GCN-NEXT: v_mul_f32_e32 v206, s4, v101 + ; GCN-NEXT: v_mul_f32_e32 v207, s4, v102 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[220:221], v[130:131], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v114 + ; GCN-NEXT: v_mul_f32_e32 v131, s4, v115 + ; GCN-NEXT: v_max3_f32 v128, v128, v130, v131 + ; GCN-NEXT: v_max3_f32 v128, 
v128, v132, v133 + ; GCN-NEXT: v_max3_f32 v128, v128, v134, v135 + ; GCN-NEXT: v_max3_f32 v128, v128, v136, v137 + ; GCN-NEXT: v_max3_f32 v128, v128, v138, v139 + ; GCN-NEXT: v_max3_f32 v128, v128, v140, v141 + ; GCN-NEXT: v_max3_f32 v128, v128, v142, v143 + ; GCN-NEXT: v_max3_f32 v128, v128, v151, v198 + ; GCN-NEXT: v_max3_f32 v128, v128, v199, v200 + ; GCN-NEXT: v_mul_f32_e32 v208, s4, v103 + ; GCN-NEXT: v_max3_f32 v128, v128, v201, v206 + ; GCN-NEXT: v_mul_f32_e32 v209, s4, v104 + ; GCN-NEXT: v_mul_f32_e32 v144, s4, v105 + ; GCN-NEXT: v_max3_f32 v128, v128, v207, v208 + ; GCN-NEXT: v_mul_f32_e32 v145, s4, v106 + ; GCN-NEXT: v_mul_f32_e32 v146, s4, v107 + ; GCN-NEXT: v_max3_f32 v128, v128, v209, v144 + ; GCN-NEXT: v_mul_f32_e32 v147, s4, v108 + ; GCN-NEXT: v_mul_f32_e32 v214, s4, v109 + ; GCN-NEXT: v_max3_f32 v128, v128, v145, v146 + ; GCN-NEXT: v_mul_f32_e32 v215, s4, v110 + ; GCN-NEXT: v_mul_f32_e32 v216, s4, v111 + ; GCN-NEXT: v_max3_f32 v128, v128, v147, v214 + ; GCN-NEXT: v_mul_f32_e32 v196, s4, v80 + ; GCN-NEXT: v_mul_f32_e32 v197, s4, v81 + ; GCN-NEXT: v_max3_f32 v128, v128, v215, v216 + ; GCN-NEXT: v_mul_f32_e32 v202, s4, v82 + ; GCN-NEXT: v_mul_f32_e32 v203, s4, v83 + ; GCN-NEXT: v_max3_f32 v128, v128, v196, v197 + ; GCN-NEXT: v_mul_f32_e32 v211, s4, v84 + ; GCN-NEXT: v_mul_f32_e32 v212, s4, v85 + ; GCN-NEXT: v_max3_f32 v128, v128, v202, v203 + ; GCN-NEXT: v_mul_f32_e32 v213, s4, v86 + ; GCN-NEXT: v_mul_f32_e32 v218, s4, v87 + ; GCN-NEXT: v_max3_f32 v128, v128, v211, v212 + ; GCN-NEXT: v_mul_f32_e32 v219, s4, v88 + ; GCN-NEXT: v_mul_f32_e32 v204, s4, v89 + ; GCN-NEXT: v_max3_f32 v128, v128, v213, v218 + ; GCN-NEXT: v_mul_f32_e32 v205, s4, v90 + ; GCN-NEXT: v_mul_f32_e32 v220, s4, v91 + ; GCN-NEXT: v_max3_f32 v128, v128, v219, v204 + ; GCN-NEXT: v_mul_f32_e32 v221, s4, v92 + ; GCN-NEXT: v_mul_f32_e32 v148, s4, v93 + ; GCN-NEXT: v_max3_f32 v128, v128, v205, v220 + ; GCN-NEXT: v_mul_f32_e32 v149, s4, v94 + ; GCN-NEXT: v_mul_f32_e32 v150, s4, v95 + ; 
GCN-NEXT: v_max3_f32 v128, v128, v221, v148 + ; GCN-NEXT: v_mul_f32_e32 v222, s4, v64 + ; GCN-NEXT: v_mul_f32_e32 v223, s4, v65 + ; GCN-NEXT: v_max3_f32 v128, v128, v149, v150 + ; GCN-NEXT: v_mul_f32_e32 v224, s4, v66 + ; GCN-NEXT: v_mul_f32_e32 v225, s4, v67 + ; GCN-NEXT: v_max3_f32 v128, v128, v222, v223 + ; GCN-NEXT: v_mul_f32_e32 v226, s4, v68 + ; GCN-NEXT: v_mul_f32_e32 v227, s4, v69 + ; GCN-NEXT: v_max3_f32 v128, v128, v224, v225 + ; GCN-NEXT: v_mul_f32_e32 v228, s4, v70 + ; GCN-NEXT: v_mul_f32_e32 v229, s4, v71 + ; GCN-NEXT: v_max3_f32 v128, v128, v226, v227 + ; GCN-NEXT: v_mul_f32_e32 v230, s4, v72 + ; GCN-NEXT: v_mul_f32_e32 v231, s4, v73 + ; GCN-NEXT: v_max3_f32 v128, v128, v228, v229 + ; GCN-NEXT: v_mul_f32_e32 v232, s4, v74 + ; GCN-NEXT: v_mul_f32_e32 v233, s4, v75 + ; GCN-NEXT: v_max3_f32 v128, v128, v230, v231 + ; GCN-NEXT: v_mul_f32_e32 v234, s4, v76 + ; GCN-NEXT: v_mul_f32_e32 v194, s4, v77 + ; GCN-NEXT: v_max3_f32 v128, v128, v232, v233 + ; GCN-NEXT: v_mul_f32_e32 v195, s4, v78 + ; GCN-NEXT: v_mul_f32_e32 v210, s4, v79 + ; GCN-NEXT: v_max3_f32 v128, v128, v234, v194 + ; GCN-NEXT: v_max3_f32 v128, v128, v195, v210 + ; GCN-NEXT: ds_bpermute_b32 v129, v176, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_max_f32_e32 v129, v129, v129 + ; GCN-NEXT: v_max_f32_e32 v128, v128, v129 + ; GCN-NEXT: ds_bpermute_b32 v129, v176, v128 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13] + ; GCN-NEXT: v_max_f32_e32 v128, v128, v128 + ; GCN-NEXT: v_max_f32_e32 v128, v193, v128 + ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128 + ; GCN-NEXT: v_fma_f32 v114, s4, v114, -v128 + ; GCN-NEXT: v_mul_f32_e32 v113, 0x3fb8aa3b, v113 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_fma_f32 v114, s4, v115, -v128 + ; GCN-NEXT: v_exp_f32_e32 v112, v112 + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v114 + ; GCN-NEXT: 
v_fma_f32 v114, s4, v116, -v128 + ; GCN-NEXT: v_exp_f32_e32 v113, v113 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_fma_f32 v114, s4, v117, -v128 + ; GCN-NEXT: v_exp_f32_e32 v129, v129 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_fma_f32 v114, s4, v118, -v128 + ; GCN-NEXT: v_exp_f32_e32 v134, v134 + ; GCN-NEXT: v_mul_f32_e32 v118, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_fma_f32 v114, s4, v119, -v128 + ; GCN-NEXT: v_add_f32_e32 v138, 0, v112 + ; GCN-NEXT: v_exp_f32_e32 v140, v135 + ; GCN-NEXT: v_mul_f32_e32 v119, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_sub_f32_e32 v114, v192, v128 + ; GCN-NEXT: v_add_f32_e32 v138, v113, v138 + ; GCN-NEXT: v_exp_f32_e32 v142, v136 + ; GCN-NEXT: v_mul_f32_e32 v137, 0x3fb8aa3b, v114 + ; GCN-NEXT: ds_read_b128 v[114:117], v179 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v138, v129, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v112 + ; GCN-NEXT: v_fma_f32 v112, s4, v120, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v120, v129 + ; GCN-NEXT: v_exp_f32_e32 v129, v118 + ; GCN-NEXT: ds_read_b128 v[130:133], v179 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v138, v134, v138 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_fma_f32 v112, s4, v121, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v118, v134 + ; GCN-NEXT: v_exp_f32_e32 v144, v119 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_add_f32_e32 v112, v140, v138 + ; GCN-NEXT: v_fma_f32 v121, s4, v122, -v128 + ; GCN-NEXT: v_add_f32_e32 v112, v142, v112 + ; GCN-NEXT: v_cvt_f16_f32_e32 v113, v113 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v121 + ; GCN-NEXT: v_fma_f32 v121, s4, v123, -v128 + ; GCN-NEXT: v_add_f32_e32 v119, v129, v112 + ; GCN-NEXT: v_exp_f32_e32 v112, v137 + ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v121 + ; GCN-NEXT: v_add_f32_e32 v146, v144, v119 + ; GCN-NEXT: v_pack_b32_f16 v123, v120, v118 + ; GCN-NEXT: 
ds_read_b128 v[118:121], v179 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v122, v139, v113 + ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: 
v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[114:115], v[122:123], v[0:15] + ; GCN-NEXT: ds_read_b128 v[134:137], v179 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v113, v140 + ; GCN-NEXT: v_exp_f32_e32 v139, v141 + ; GCN-NEXT: v_cvt_f16_f32_e32 v115, v142 + ; GCN-NEXT: v_fma_f32 v114, s4, v124, -v128 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v114 + ; GCN-NEXT: v_add_f32_e32 v114, v139, v146 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[130:131], v[122:123], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v141, v143 + ; GCN-NEXT: v_pack_b32_f16 v130, v113, v115 + ; GCN-NEXT: v_fma_f32 v115, s4, v126, -v128 + ; GCN-NEXT: v_fma_f32 v124, s4, v125, -v128 + ; GCN-NEXT: v_add_f32_e32 v113, v141, v114 + ; GCN-NEXT: v_cvt_f16_f32_e32 v114, v129 + ; GCN-NEXT: v_fma_f32 v96, s4, v96, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[118:119], v[122:123], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v119, 0x3fb8aa3b, v115 + ; GCN-NEXT: v_cvt_f16_f32_e32 v115, v144 + ; GCN-NEXT: v_exp_f32_e32 v118, v138 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v139 + ; GCN-NEXT: v_pack_b32_f16 v131, v114, v115 + ; GCN-NEXT: v_add_f32_e32 v113, v118, v113 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[134:135], v[122:123], v[48:63] + ; GCN-NEXT: v_fma_f32 v122, s4, v127, -v128 + ; GCN-NEXT: v_exp_f32_e32 v127, v145 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v96 + ; GCN-NEXT: v_fma_f32 
v97, s4, v97, -v128 + ; GCN-NEXT: v_fma_f32 v98, s4, v98, -v128 + ; GCN-NEXT: v_add_f32_e32 v113, v127, v113 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[116:117], v[130:131], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v134, v140 + ; GCN-NEXT: ds_read_b128 v[114:117], v178 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v99, s4, v99, -v128 + ; GCN-NEXT: ds_read_b128 v[122:125], v178 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v96, v134, v113 + ; GCN-NEXT: v_cvt_f16_f32_e32 v113, v141 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v97 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[132:133], v[130:131], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v140, v142 + ; GCN-NEXT: v_cvt_f16_f32_e32 v97, v118 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v98 + ; GCN-NEXT: v_cvt_f16_f32_e32 v98, v127 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v113 + ; GCN-NEXT: v_add_f32_e32 v96, v140, v96 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v99 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[120:121], v[130:131], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v113, v119 + ; GCN-NEXT: v_pack_b32_f16 v127, v97, v98 + ; GCN-NEXT: v_fma_f32 v100, s4, v100, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v144, v134 + ; GCN-NEXT: v_add_f32_e32 v96, v113, v96 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v100 + ; GCN-NEXT: v_fma_f32 v101, s4, v101, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[130:131], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v129, v129 + ; GCN-NEXT: v_fma_f32 v104, s4, v104, -v128 + ; GCN-NEXT: v_fma_f32 v105, s4, v105, -v128 + ; GCN-NEXT: v_fma_f32 v109, s4, v109, -v128 + ; GCN-NEXT: v_add_f32_e32 v130, v129, v96 + ; GCN-NEXT: ds_read_b128 v[96:99], v178 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[118:121], v178 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; 
GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[114:115], v[126:127], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v145, v135 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] - ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 - ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 - ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 - ; GCN-NEXT: v_exp_f32_e32 v159, v33 - ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38 - ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 - ; GCN-NEXT: v_exp_f32_e32 v152, v38 + ; GCN-NEXT: ds_write_b64 v183, v[170:171] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[60:61] + ; GCN-NEXT: ds_write_b64 v180, v[168:169] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[32:33] - ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: ; implicit-def: $vgpr38 + ; GCN-NEXT: ds_write_b64 v181, v[172:173] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[140:141] - ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 + ; GCN-NEXT: ds_write_b64 v182, v[174:175] + ; GCN-NEXT: v_add_f32_e32 v100, v145, v130 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[114:115], v188, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v189, s[0:3], 0 offen 
sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr36 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 - ; GCN-NEXT: ; implicit-def: $vgpr37 - ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[132:133], v190, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 - ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v191, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[122:123], v[126:127], v[16:31] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 - ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 - ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v154, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 - ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 - ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v155, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 - ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 - ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 - ; GCN-NEXT: v_exp_f32_e32 v157, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 - ; GCN-NEXT: v_exp_f32_e32 v146, v32 - ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 - ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 - ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v147, v36 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v143, v36 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 - ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 - ; GCN-NEXT: v_exp_f32_e32 v156, v32 - ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 - ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 - ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v129, v36 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44 - ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 - ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: ds_read_b128 v[36:39], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v142, v40 - ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 - 
; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] - ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v63, v40 - ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 - ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 - ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 - ; GCN-NEXT: v_exp_f32_e32 v158, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 - ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v128, v17 - ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 - ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 - ; GCN-NEXT: v_exp_f32_e32 v167, v36 - ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 - ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v130, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 - ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 - ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 + ; GCN-NEXT: v_fma_f32 v80, s4, v80, -v128 + ; GCN-NEXT: v_fma_f32 v81, s4, v81, -v128 + ; GCN-NEXT: v_fma_f32 v82, s4, v82, -v128 + ; GCN-NEXT: v_fma_f32 v83, s4, v83, 
-v128 + ; GCN-NEXT: v_fma_f32 v84, s4, v84, -v128 + ; GCN-NEXT: v_fma_f32 v85, s4, v85, -v128 + ; GCN-NEXT: v_fma_f32 v88, s4, v88, -v128 + ; GCN-NEXT: v_fma_f32 v89, s4, v89, -v128 + ; GCN-NEXT: v_fma_f32 v93, s4, v93, -v128 + ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 + ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 + ; GCN-NEXT: v_fma_f32 v66, s4, v66, -v128 + ; GCN-NEXT: v_fma_f32 v67, s4, v67, -v128 + ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 + ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 + ; GCN-NEXT: v_fma_f32 v72, s4, v72, -v128 + ; GCN-NEXT: v_fma_f32 v73, s4, v73, -v128 + ; GCN-NEXT: v_fma_f32 v77, s4, v77, -v128 + ; GCN-NEXT: v_perm_b32 v136, v130, v114, s5 + ; GCN-NEXT: v_perm_b32 v138, v130, v114, s7 + ; GCN-NEXT: v_perm_b32 v137, v134, v132, s5 + ; GCN-NEXT: v_perm_b32 v139, v134, v132, s7 + ; GCN-NEXT: v_exp_f32_e32 v134, v141 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[96:97], v[126:127], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v96, v113 + ; GCN-NEXT: v_exp_f32_e32 v113, v142 + ; GCN-NEXT: v_perm_b32 v130, v131, v115, s5 + ; GCN-NEXT: v_perm_b32 v132, v131, v115, s7 + ; GCN-NEXT: v_perm_b32 v131, v135, v133, s5 + ; GCN-NEXT: v_perm_b32 v133, v135, v133, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v114, v140 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v101 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[118:119], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v101, v129 + ; GCN-NEXT: v_exp_f32_e32 v119, v143 + ; GCN-NEXT: v_add_f32_e32 v100, v134, v100 + ; GCN-NEXT: v_fma_f32 v97, s4, v102, -v128 + ; GCN-NEXT: v_add_f32_e32 v100, v113, v100 + ; GCN-NEXT: v_fma_f32 v102, s4, v103, -v128 + ; GCN-NEXT: v_pack_b32_f16 v122, v144, v114 + ; GCN-NEXT: v_mul_f32_e32 v126, 0x3fb8aa3b, v102 + ; GCN-NEXT: v_pack_b32_f16 v123, v96, v101 + ; GCN-NEXT: v_add_f32_e32 v96, v119, v100 + ; GCN-NEXT: ds_read_b128 v[100:103], v179 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[116:117], v[122:123], 
v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v127, v146 + ; GCN-NEXT: v_mul_f32_e32 v97, 0x3fb8aa3b, v97 + ; GCN-NEXT: ds_read_b128 v[114:117], v179 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v118, v145 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v104 + ; GCN-NEXT: v_cvt_f16_f32_e32 v104, v134 + ; GCN-NEXT: v_add_f32_e32 v96, v127, v96 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[124:125], v[122:123], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v124, v135 + ; GCN-NEXT: v_pack_b32_f16 v118, v118, v104 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v105 + ; GCN-NEXT: v_add_f32_e32 v96, v124, v96 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[98:99], v[122:123], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v98, v113 + ; GCN-NEXT: v_exp_f32_e32 v113, v97 + ; GCN-NEXT: v_cvt_f16_f32_e32 v97, v119 + ; GCN-NEXT: v_fma_f32 v99, s4, v106, -v128 + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v99 + ; GCN-NEXT: v_add_f32_e32 v96, v113, v96 + ; GCN-NEXT: v_fma_f32 v99, s4, v107, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[120:121], v[122:123], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v120, v126 + ; GCN-NEXT: v_mul_f32_e32 v121, 0x3fb8aa3b, v99 + ; GCN-NEXT: v_pack_b32_f16 v119, v98, v97 + ; GCN-NEXT: v_add_f32_e32 v122, v120, v96 + ; GCN-NEXT: ds_read_b128 v[96:99], v179 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[104:107], v179 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[100:101], v[118:119], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v123, v129 + ; GCN-NEXT: v_fma_f32 v101, s4, v108, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v100, v127 + ; GCN-NEXT: v_mul_f32_e32 v126, 0x3fb8aa3b, v101 + ; GCN-NEXT: v_add_f32_e32 v101, v123, v122 + ; GCN-NEXT: v_cvt_f16_f32_e32 v108, v124 + ; GCN-NEXT: v_mul_f32_e32 v124, 0x3fb8aa3b, v109 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[114:115], 
v[118:119], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v122, v125 + ; GCN-NEXT: v_pack_b32_f16 v114, v100, v108 + ; GCN-NEXT: v_add_f32_e32 v100, v122, v101 + ; GCN-NEXT: v_cvt_f16_f32_e32 v101, v120 + ; GCN-NEXT: v_mul_f32_e32 v120, 0x3fb8aa3b, v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[96:97], v[118:119], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v96, v113 + ; GCN-NEXT: v_exp_f32_e32 v113, v134 + ; GCN-NEXT: v_fma_f32 v97, s4, v110, -v128 + ; GCN-NEXT: v_mul_f32_e32 v97, 0x3fb8aa3b, v97 + ; GCN-NEXT: v_pack_b32_f16 v115, v96, v101 + ; GCN-NEXT: v_add_f32_e32 v100, v113, v100 + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v84 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[104:105], v[118:119], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v105, v121 + ; GCN-NEXT: v_fma_f32 v104, s4, v111, -v128 + ; GCN-NEXT: v_mul_f32_e32 v118, 0x3fb8aa3b, v104 + ; GCN-NEXT: v_cvt_f16_f32_e32 v104, v123 + ; GCN-NEXT: v_add_f32_e32 v96, v105, v100 + ; GCN-NEXT: v_mul_f32_e32 v123, 0x3fb8aa3b, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v81, v113 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[102:103], v[114:115], v[0:15] + ; GCN-NEXT: ds_read_b128 v[100:103], v178 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v119, v126 + ; GCN-NEXT: ds_read_b128 v[108:111], v178 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v126, 0x3fb8aa3b, v83 + ; GCN-NEXT: v_add_f32_e32 v80, v119, v96 + ; GCN-NEXT: v_cvt_f16_f32_e32 v96, v122 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[116:117], v[114:115], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v122, v124 + ; GCN-NEXT: v_mul_f32_e32 v124, 0x3fb8aa3b, v82 + ; GCN-NEXT: v_cvt_f16_f32_e32 v82, v105 + ; GCN-NEXT: v_pack_b32_f16 v104, v104, v96 + ; GCN-NEXT: v_add_f32_e32 v80, v122, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v119 + ; GCN-NEXT: v_pack_b32_f16 v105, v81, v82 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[98:99], v[114:115], v[32:47] + ; GCN-NEXT: 
v_exp_f32_e32 v113, v97 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v80, v113, v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[106:107], v[114:115], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v125, v118 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v106, v125, v80 + ; GCN-NEXT: ds_read_b128 v[80:83], v178 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[96:99], v178 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[100:101], v[104:105], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v129, v120 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[20:21] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] - ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 - ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 - ; GCN-NEXT: v_exp_f32_e32 v140, v17 - ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 + ; GCN-NEXT: ds_write_b64 v183, v[136:137] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[36:37] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 - ; GCN-NEXT: v_exp_f32_e32 v144, v22 + ; GCN-NEXT: ds_write_b64 v180, v[138:139] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[16:17] - ; GCN-NEXT: ; implicit-def: $vgpr17 - ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: ds_write_b64 v181, v[130:131] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[42:43] - ; GCN-NEXT: v_add_u32_e32 v22, v132, v22 - ; GCN-NEXT: v_add_u32_e32 v17, v132, v17 - ; GCN-NEXT: ; implicit-def: $vgpr20 - ; 
GCN-NEXT: ; implicit-def: $vgpr21 + ; GCN-NEXT: ds_write_b64 v182, v[132:133] + ; GCN-NEXT: v_add_f32_e32 v84, v129, v106 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v184, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[106:107], v185, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 - ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 - ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 - ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[114:115], v186, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[116:117], v187, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] - ; GCN-NEXT: v_exp_f32_e32 v132, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[108:109], v[104:105], v[16:31] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 - ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v62, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: 
v_cvt_f16_f32_e32 v46, v130 - ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 - ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 - ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 - ; GCN-NEXT: ; implicit-def: $sgpr0 - ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 - ; GCN-NEXT: v_exp_f32_e32 v145, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141 - ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46 - ; GCN-NEXT: v_exp_f32_e32 v35, v16 - ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24 - ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v46, v20 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v47, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34 - ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 - ; GCN-NEXT: v_exp_f32_e32 v141, v16 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 - ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 - ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 - ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] - ; GCN-NEXT: 
v_exp_f32_e32 v33, v20 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: ds_read_b128 v[20:23], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v36, v24 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 - ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] - ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v39, v24 - ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 - ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 - ; GCN-NEXT: v_exp_f32_e32 v148, v1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 - ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127] - ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 - ; GCN-NEXT: v_exp_f32_e32 v34, v1 - ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 - ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 - ; GCN-NEXT: v_exp_f32_e32 v150, v20 - ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 - ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], 
v[28:29], v[24:25], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v38, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 - ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 - ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 - ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 + ; GCN-NEXT: v_perm_b32 v118, v106, v100, s5 + ; GCN-NEXT: v_perm_b32 v120, v106, v100, s7 + ; GCN-NEXT: v_perm_b32 v119, v116, v114, s5 + ; GCN-NEXT: v_perm_b32 v121, v116, v114, s7 + ; GCN-NEXT: v_exp_f32_e32 v116, v123 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[104:105], v[32:47] + ; GCN-NEXT: v_perm_b32 v106, v107, v101, s5 + ; GCN-NEXT: v_perm_b32 v114, v107, v101, s7 + ; GCN-NEXT: v_perm_b32 v107, v117, v115, s5 + ; GCN-NEXT: v_perm_b32 v115, v117, v115, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v100, v122 + ; GCN-NEXT: v_mul_f32_e32 v117, 0x3fb8aa3b, v85 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v113 + ; GCN-NEXT: v_exp_f32_e32 v113, v124 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[104:105], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v125 + ; GCN-NEXT: v_exp_f32_e32 v97, v126 + ; GCN-NEXT: v_pack_b32_f16 v108, v127, v100 + ; GCN-NEXT: v_add_f32_e32 v84, v116, v84 + ; GCN-NEXT: v_pack_b32_f16 v109, v80, v85 + ; GCN-NEXT: v_fma_f32 v81, s4, v86, -v128 + ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v81 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[102:103], v[108:109], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v105, v134 + ; GCN-NEXT: v_add_f32_e32 v84, v113, v84 + ; GCN-NEXT: v_fma_f32 v86, s4, v87, -v128 + ; GCN-NEXT: v_mul_f32_e32 v104, 0x3fb8aa3b, v86 + ; GCN-NEXT: v_add_f32_e32 v80, v97, v84 + ; GCN-NEXT: ds_read_b128 v[84:87], v179 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[100:103], v179 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[110:111], v[108:109], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v110, v117 + ; GCN-NEXT: v_add_f32_e32 v80, v105, v80 + ; GCN-NEXT: 
v_cvt_f16_f32_e32 v96, v129 + ; GCN-NEXT: v_mul_f32_e32 v122, 0x3fb8aa3b, v88 + ; GCN-NEXT: v_add_f32_e32 v80, v110, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v88, v116 + ; GCN-NEXT: v_mul_f32_e32 v111, 0x3fb8aa3b, v89 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[108:109], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v82, v113 + ; GCN-NEXT: v_exp_f32_e32 v113, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v81, v97 + ; GCN-NEXT: v_fma_f32 v83, s4, v90, -v128 + ; GCN-NEXT: v_mul_f32_e32 v116, 0x3fb8aa3b, v83 + ; GCN-NEXT: v_add_f32_e32 v80, v113, v80 + ; GCN-NEXT: v_fma_f32 v83, s4, v91, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[98:99], v[108:109], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v99, v104 + ; GCN-NEXT: v_mul_f32_e32 v104, 0x3fb8aa3b, v83 + ; GCN-NEXT: v_pack_b32_f16 v97, v82, v81 + ; GCN-NEXT: v_pack_b32_f16 v96, v96, v88 + ; GCN-NEXT: v_add_f32_e32 v98, v99, v80 + ; GCN-NEXT: ds_read_b128 v[80:83], v179 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[88:91], v179 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[84:85], v[96:97], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v105 + ; GCN-NEXT: v_exp_f32_e32 v105, v122 + ; GCN-NEXT: v_fma_f32 v85, s4, v92, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v92, v110 + ; GCN-NEXT: v_mul_f32_e32 v108, 0x3fb8aa3b, v85 + ; GCN-NEXT: v_add_f32_e32 v85, v105, v98 + ; GCN-NEXT: v_pack_b32_f16 v98, v84, v92 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[100:101], v[96:97], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v100, v111 + ; GCN-NEXT: v_mul_f32_e32 v101, 0x3fb8aa3b, v93 + ; GCN-NEXT: v_add_f32_e32 v84, v100, v85 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v99 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[96:97], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v113 + ; GCN-NEXT: v_exp_f32_e32 v109, v116 + ; GCN-NEXT: v_fma_f32 v81, s4, v94, -v128 + ; GCN-NEXT: v_mul_f32_e32 v81, 
0x3fb8aa3b, v81 + ; GCN-NEXT: v_pack_b32_f16 v99, v80, v85 + ; GCN-NEXT: v_add_f32_e32 v84, v109, v84 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[88:89], v[96:97], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v89, v104 + ; GCN-NEXT: v_fma_f32 v88, s4, v95, -v128 + ; GCN-NEXT: v_mul_f32_e32 v104, 0x3fb8aa3b, v64 + ; GCN-NEXT: v_mul_f32_e32 v96, 0x3fb8aa3b, v88 + ; GCN-NEXT: v_add_f32_e32 v80, v89, v84 + ; GCN-NEXT: v_cvt_f16_f32_e32 v88, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[86:87], v[98:99], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v97, v108 + ; GCN-NEXT: ds_read_b128 v[84:87], v178 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[92:95], v178 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v64, v97, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v100 + ; GCN-NEXT: v_pack_b32_f16 v88, v88, v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[102:103], v[98:99], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v100, v101 + ; GCN-NEXT: v_mul_f32_e32 v101, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v109 + ; GCN-NEXT: v_mul_f32_e32 v103, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v89 + ; GCN-NEXT: v_add_f32_e32 v64, v100, v64 + ; GCN-NEXT: v_pack_b32_f16 v89, v65, v66 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[98:99], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v102, v81 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v64, v102, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[90:91], v[98:99], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v91, v96 + ; GCN-NEXT: v_mul_f32_e32 v96, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_mul_f32_e32 v98, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_add_f32_e32 v90, v91, v64 + ; GCN-NEXT: ds_read_b128 v[64:67], v178 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[80:83], v178 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[0:15], v[84:85], v[88:89], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v97 + ; GCN-NEXT: v_exp_f32_e32 v97, v104 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v100 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v135, v[4:5] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111] - ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 - ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_exp_f32_e32 v42, v1 - ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 + ; GCN-NEXT: ds_write_b64 v183, v[118:119] + ; GCN-NEXT: v_add_f32_e32 v68, v97, v90 + ; GCN-NEXT: v_pack_b32_f16 v90, v84, v85 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[92:93], v[88:89], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v92, v101 + ; GCN-NEXT: v_mul_f32_e32 v93, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v91 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v136, v[20:21] + ; GCN-NEXT: ds_write_b64 v180, v[120:121] + ; GCN-NEXT: v_add_f32_e32 v68, v92, v68 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v137, v[0:1] + ; GCN-NEXT: ds_write_b64 v181, v[106:107] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v138, v[26:27] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_write_b64 v182, v[114:115] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[88:89], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v102 + ; GCN-NEXT: v_exp_f32_e32 v99, v103 + ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28 - ; GCN-NEXT: v_fma_f32 
v24, s4, v7, -v134 - ; GCN-NEXT: v_exp_f32_e32 v25, v6 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[4:7], v139 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 - ; GCN-NEXT: v_exp_f32_e32 v26, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 - ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 - ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95] - ; GCN-NEXT: v_exp_f32_e32 v29, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41 - ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42 - ; GCN-NEXT: v_exp_f32_e32 v31, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127] - ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0 - ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134 - ; GCN-NEXT: v_exp_f32_e32 v19, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8 - ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v24, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26 - ; GCN-NEXT: v_exp_f32_e32 v27, v4 - ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95] - ; GCN-NEXT: 
v_cvt_f16_f32_e32 v20, v29 - ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134 - ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 - ; GCN-NEXT: v_exp_f32_e32 v30, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127] - ; GCN-NEXT: v_exp_f32_e32 v16, v4 - ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_exp_f32_e32 v18, v9 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21 - ; GCN-NEXT: v_exp_f32_e32 v21, v9 - ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: ds_read_b128 v[4:7], v57 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95] - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111] - ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30 - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28 - ; GCN-NEXT: v_exp_f32_e32 v2, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_exp_f32_e32 v10, v1 - ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20 - ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0 - ; GCN-NEXT: v_add_f32_e32 v3, 0, v49 - ; GCN-NEXT: v_add_f32_e32 v3, v50, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v51, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v52, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v53, v3 - ; GCN-NEXT: 
v_add_f32_e32 v3, v54, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v162, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v128, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v144, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v145, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v46, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v33, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 - ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 - ; GCN-NEXT: v_add_f32_e32 
v3, v42, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v25, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v26, v3 - ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22 - ; GCN-NEXT: v_add_f32_e32 v3, v29, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v31, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95] - ; GCN-NEXT: v_add_f32_e32 v3, v19, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v24, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v27, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v30, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v16, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v18, v3 - ; GCN-NEXT: v_add_f32_e32 v3, v21, v3 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] - ; GCN-NEXT: v_add_f32_e32 v0, v2, v3 - ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 - ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 - ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111] - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] - ; GCN-NEXT: ; implicit-def: $vgpr4 - ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 - ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v91, v64, v69 + ; GCN-NEXT: v_add_f32_e32 v68, v99, v68 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[88:89], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v80, v96 + ; GCN-NEXT: v_cvt_f16_f32_e32 v88, v97 + ; GCN-NEXT: v_mul_f32_e32 v96, 0x3fb8aa3b, v72 + ; GCN-NEXT: v_fma_f32 v70, s4, v71, -v128 + ; GCN-NEXT: v_add_f32_e32 v64, v80, v68 + ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v70 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: 
ds_read_b128 v[68:71], v179 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[86:87], v[90:91], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v89, v98 + ; GCN-NEXT: ds_read_b128 v[84:87], v179 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v72, v89, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v92 + ; GCN-NEXT: v_pack_b32_f16 v64, v88, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[94:95], v[90:91], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v92, v93 + ; GCN-NEXT: v_mul_f32_e32 v93, 0x3fb8aa3b, v73 + ; GCN-NEXT: v_fma_f32 v73, s4, v75, -v128 + ; GCN-NEXT: v_add_f32_e32 v72, v92, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[90:91], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v88, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v99 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v80 + ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128 + ; GCN-NEXT: v_add_f32_e32 v72, v88, v72 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[82:83], v[90:91], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v90, v81 + ; GCN-NEXT: v_mul_f32_e32 v91, 0x3fb8aa3b, v73 + ; GCN-NEXT: v_add_f32_e32 v66, v90, v72 + ; GCN-NEXT: ds_read_b128 v[72:75], v179 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[80:83], v179 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] + ; GCN-NEXT: v_fma_f32 v69, s4, v76, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v89 + ; GCN-NEXT: v_mul_f32_e32 v89, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v92 + ; GCN-NEXT: v_exp_f32_e32 v76, v96 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v66, v76, v66 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[64:65], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v92, v93 + ; GCN-NEXT: 
v_pack_b32_f16 v84, v68, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v88 + ; GCN-NEXT: v_fma_f32 v69, s4, v78, -v128 + ; GCN-NEXT: v_add_f32_e32 v66, v92, v66 + ; GCN-NEXT: v_mul_f32_e32 v93, 0x3fb8aa3b, v77 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[64:65], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v72, v67 + ; GCN-NEXT: v_mul_f32_e32 v73, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_add_f32_e32 v66, v72, v66 + ; GCN-NEXT: v_cvt_f16_f32_e32 v72, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[64:65], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v90 + ; GCN-NEXT: v_exp_f32_e32 v88, v91 + ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128 + ; GCN-NEXT: v_mul_f32_e32 v90, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_pack_b32_f16 v85, v68, v64 + ; GCN-NEXT: v_add_f32_e32 v91, v88, v66 + ; GCN-NEXT: ds_read_b128 v[64:67], v178 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[84:85], v[0:15] + ; GCN-NEXT: ds_read_b128 v[68:71], v178 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v70, v76 + ; GCN-NEXT: ds_read_b128 v[76:79], v178 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v89, v89 + ; GCN-NEXT: ds_read_b128 v[78:81], v178 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v92 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[84:85], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v81, v93 + ; GCN-NEXT: v_add_f32_e32 v71, v89, v91 + ; GCN-NEXT: v_pack_b32_f16 v70, v70, v80 + ; GCN-NEXT: v_add_f32_e32 v71, v81, v71 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[84:85], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v73, v73 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v74, v73, v71 + ; GCN-NEXT: v_cvt_f16_f32_e32 v71, v88 + ; GCN-NEXT: 
v_pack_b32_f16 v71, v72, v71 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[82:83], v[84:85], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v75, v90 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v72, v75, v74 + ; GCN-NEXT: ds_bpermute_b32 v74, v176, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[70:71], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[70:71], v[0:15] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_nop 7 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_add_f32_e32 v16, v72, v74 + ; GCN-NEXT: ds_bpermute_b32 v17, v176, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v75 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v73 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v89 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[12:13] + ; GCN-NEXT: v_fmac_f32_e32 v16, v177, v112 + ; GCN-NEXT: v_pack_b32_f16 v17, v20, v18 + ; GCN-NEXT: v_pack_b32_f16 v16, v21, v19 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[70:71], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[78:79], v[70:71], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[16:17], v[0:15] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0887fdf0844b0..1c570692719ac 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -6,488 +6,486 @@ define amdgpu_kernel void @smallInterleave() #0 { ret void } ; GCN-LABEL: smallInterleave: ; GCN: ; %bb.0: - ; GCN-NEXT: ; implicit-def: $vgpr2 - ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - ; GCN-NEXT: v_readfirstlane_b32 s20, v2 - ; GCN-NEXT: ; implicit-def: $sgpr4 - ; GCN-NEXT: ; implicit-def: $vgpr3 - ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 
- ; GCN-NEXT: ; implicit-def: $vgpr50 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr34 + ; GCN-NEXT: ; implicit-def: $vgpr35 + ; GCN-NEXT: ; implicit-def: $sgpr24 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33 + ; GCN-NEXT: ; implicit-def: $sgpr20_sgpr21_sgpr22_sgpr23 + ; GCN-NEXT: ; implicit-def: $vgpr38 + ; GCN-NEXT: ; implicit-def: $vgpr39 + ; GCN-NEXT: ; implicit-def: $vgpr106 ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 - ; GCN-NEXT: ; implicit-def: $vgpr49 - ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 - ; GCN-NEXT: ; implicit-def: $vgpr51 - ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr76 - ; GCN-NEXT: ; implicit-def: $vgpr77 - ; GCN-NEXT: ; implicit-def: $vgpr78 - ; GCN-NEXT: ; implicit-def: $vgpr79 - ; GCN-NEXT: ; implicit-def: $vgpr80 - ; GCN-NEXT: ; implicit-def: $vgpr91 - ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 + ; GCN-NEXT: ; implicit-def: $vgpr40 + ; GCN-NEXT: ; implicit-def: $vgpr103 + ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67 + ; GCN-NEXT: ; implicit-def: $vgpr104 + ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71 + ; GCN-NEXT: ; implicit-def: $vgpr72_vgpr73_vgpr74_vgpr75 + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ; implicit-def: $vgpr105 + ; GCN-NEXT: ; implicit-def: $vgpr93 + ; GCN-NEXT: ; implicit-def: $vgpr94 + ; GCN-NEXT: ; implicit-def: $vgpr95 + ; GCN-NEXT: ; implicit-def: $vgpr96 + ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GCN-NEXT: v_mov_b32_e32 v92, 0 + ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: ; implicit-def: $sgpr7 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $vgpr97 + ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: ; implicit-def: $sgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr98 + ; GCN-NEXT: ; 
implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr99 + ; GCN-NEXT: ; implicit-def: $vgpr100 + ; GCN-NEXT: ; implicit-def: $vgpr101 + ; GCN-NEXT: ; implicit-def: $vgpr102 ; GCN-NEXT: ; iglp_opt mask(0x00000002) + ; GCN-NEXT: v_readfirstlane_b32 s6, v34 ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3 - ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1] - ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_lshl_add_u32 v34, s6, 4, v35 + ; GCN-NEXT: v_mad_u64_u32 v[36:37], s[8:9], s24, v34, v[32:33] + ; GCN-NEXT: buffer_load_dwordx4 v[32:35], v36, s[20:23], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: s_lshl_b32 s4, s20, 7 - ; GCN-NEXT: ; implicit-def: $vgpr5 - ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1 - ; GCN-NEXT: v_add_u32_e32 v76, s20, v76 - ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76 + ; GCN-NEXT: s_lshl_b32 s8, s6, 7 + ; GCN-NEXT: v_add_lshl_u32 v107, v38, s8, 1 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v48, v[0:3] + ; GCN-NEXT: ds_write_b128 v107, v[32:35] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[80:83], v36, s[20:23], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr0 - ; GCN-NEXT: ; implicit-def: $vgpr1 - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_add_u32_e32 v0, v0, v50 - ; GCN-NEXT: v_add_u32_e32 v1, v1, v50 - ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 
sc1 + ; GCN-NEXT: v_add_u32_e32 v32, v39, v106 + ; GCN-NEXT: buffer_load_dwordx2 v[88:89], v32, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[74:75], v1, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v32, v40, v106 + ; GCN-NEXT: buffer_load_dwordx2 v[90:91], v32, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[36:39], v49 + ; GCN-NEXT: ds_read_b128 v[32:35], v103 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[44:47], v49 offset:512 + ; GCN-NEXT: ds_read_b128 v[84:87], v103 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0 - ; GCN-NEXT: ; kill: killed $vgpr1 - ; GCN-NEXT: ; kill: killed $vgpr0 - ; GCN-NEXT: v_mul_lo_u32 v76, v76, s6 - ; GCN-NEXT: v_add_lshl_u32 v76, v77, v76, 1 - ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: ; implicit-def: $sgpr3 - ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; GCN-NEXT: ds_read_b128 v[36:39], v51 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[32:33], v[64:65], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[34:35], v[66:67], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[84:85], v[64:65], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[86:87], v[66:67], v[32:47] + ; GCN-NEXT: ds_read_b128 v[64:67], v104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], 
v[0:15] - ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63] + ; GCN-NEXT: ds_read_b128 v[64:67], v104 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v48, v[32:35] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31] + ; GCN-NEXT: ds_write_b128 v107, v[80:83] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[68:69], v[32:47] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[70:71], v[32:47] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[32:35], v49 + ; GCN-NEXT: ds_read_b128 v[64:67], v103 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; GCN-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] - ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[72:73], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[74:75], v[48:63] + ; GCN-NEXT: ds_read_b128 v[64:67], v103 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v51 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[72:73], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[74:75], v[32:47] + ; GCN-NEXT: ds_read_b128 v[64:67], v104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] - ; GCN-NEXT: ; implicit-def: $vgpr32 - ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: v_add_u32_e32 v82, v32, v50 - ; GCN-NEXT: v_add_u32_e32 v83, v33, v50 - ; GCN-NEXT: ; kill: killed $vgpr82 - ; GCN-NEXT: ; kill: killed $vgpr83 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31] - ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[76:77], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[64:67], v104 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_max_f32_e32 v81, v67, v67 - ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31] - ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2 - ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3 - ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[76:77], v[32:47] + ; GCN-NEXT: s_nop 6 + ; GCN-NEXT: v_mul_f32_e32 v64, s4, v48 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v49 + ; GCN-NEXT: v_max3_f32 v64, v64, s7, v65 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v52 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v53 + ; GCN-NEXT: 
v_mul_f32_e32 v70, s4, v54 + ; GCN-NEXT: v_mul_f32_e32 v71, s4, v55 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[78:79], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v50 + ; GCN-NEXT: v_mul_f32_e32 v67, s4, v51 + ; GCN-NEXT: v_max3_f32 v64, v64, v66, v67 + ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 + ; GCN-NEXT: v_mul_f32_e32 v72, s4, v56 + ; GCN-NEXT: v_mul_f32_e32 v73, s4, v57 + ; GCN-NEXT: v_max3_f32 v64, v64, v70, v71 + ; GCN-NEXT: v_mul_f32_e32 v74, s4, v58 + ; GCN-NEXT: v_mul_f32_e32 v75, s4, v59 + ; GCN-NEXT: v_max3_f32 v64, v64, v72, v73 + ; GCN-NEXT: v_mul_f32_e32 v76, s4, v60 + ; GCN-NEXT: v_mul_f32_e32 v77, s4, v61 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v75 + ; GCN-NEXT: v_mul_f32_e32 v78, s4, v62 + ; GCN-NEXT: v_mul_f32_e32 v79, s4, v63 + ; GCN-NEXT: v_max3_f32 v64, v64, v76, v77 + ; GCN-NEXT: v_mul_f32_e32 v80, s4, v32 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v33 + ; GCN-NEXT: v_max3_f32 v64, v64, v78, v79 + ; GCN-NEXT: v_mul_f32_e32 v82, s4, v34 + ; GCN-NEXT: v_mul_f32_e32 v83, s4, v35 + ; GCN-NEXT: v_max3_f32 v64, v64, v80, v81 + ; GCN-NEXT: v_mul_f32_e32 v84, s4, v36 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v37 + ; GCN-NEXT: v_max3_f32 v64, v64, v82, v83 + ; GCN-NEXT: v_mul_f32_e32 v86, s4, v38 + ; GCN-NEXT: v_mul_f32_e32 v87, s4, v39 + ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 + ; GCN-NEXT: v_mul_f32_e32 v103, s4, v40 + ; GCN-NEXT: v_mul_f32_e32 v104, s4, v41 + ; GCN-NEXT: v_max3_f32 v64, v64, v86, v87 + ; GCN-NEXT: v_mul_f32_e32 v107, s4, v42 + ; GCN-NEXT: v_mul_f32_e32 v108, s4, v43 + ; GCN-NEXT: v_max3_f32 v64, v64, v103, v104 + ; GCN-NEXT: v_mul_f32_e32 v109, s4, v44 + ; GCN-NEXT: v_mul_f32_e32 v110, s4, v45 + ; GCN-NEXT: v_max3_f32 v64, v64, v107, v108 + ; GCN-NEXT: v_mul_f32_e32 v111, s4, v46 + ; GCN-NEXT: v_mul_f32_e32 v112, s4, v47 + ; GCN-NEXT: v_max3_f32 v64, v64, v109, v110 + ; GCN-NEXT: v_max3_f32 v64, v64, v111, v112 + ; GCN-NEXT: ds_bpermute_b32 v65, v105, v64 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_max_f32_e32 v65, 
v65, v65 + ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 + ; GCN-NEXT: ds_bpermute_b32 v65, v105, v64 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[0:1] + ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 + ; GCN-NEXT: v_max_f32_e32 v65, v93, v93 + ; GCN-NEXT: v_max_f32_e32 v64, v65, v64 + ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v64 + ; GCN-NEXT: v_fma_f32 v49, s4, v49, -v64 + ; GCN-NEXT: v_fma_f32 v50, s4, v50, -v64 + ; GCN-NEXT: v_fma_f32 v51, s4, v51, -v64 + ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v64 + ; GCN-NEXT: v_mul_f32_e32 v51, 0x3fb8aa3b, v51 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v65, v51 + ; GCN-NEXT: v_fma_f32 v51, s4, v52, -v64 + ; GCN-NEXT: v_fma_f32 v52, s4, v53, -v64 + ; GCN-NEXT: v_fma_f32 v53, s4, v54, -v64 + ; GCN-NEXT: v_fma_f32 v54, s4, v55, -v64 + ; GCN-NEXT: v_fma_f32 v55, s4, v56, -v64 + ; GCN-NEXT: v_fma_f32 v56, s4, v57, -v64 + ; GCN-NEXT: v_fma_f32 v57, s4, v58, -v64 + ; GCN-NEXT: v_fma_f32 v58, s4, v59, -v64 + ; GCN-NEXT: v_fma_f32 v59, s4, v60, -v64 + ; GCN-NEXT: v_fma_f32 v60, s4, v61, -v64 + ; GCN-NEXT: v_fma_f32 v61, s4, v62, -v64 + ; GCN-NEXT: v_fma_f32 v62, s4, v63, -v64 + ; GCN-NEXT: v_exp_f32_e32 v63, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v33, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v66, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v34, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v67, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v35, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v68, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v36, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v69, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v37, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v70, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v38, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v71, v32 + ; 
GCN-NEXT: v_fma_f32 v32, s4, v39, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v72, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v40, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v73, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v41, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v74, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v42, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v75, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v43, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v76, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v44, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v77, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v45, -v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v78, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v46, -v64 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN-NEXT: v_mul_f32_e32 v49, 0x3fb8aa3b, v49 + ; GCN-NEXT: v_mul_f32_e32 v50, 0x3fb8aa3b, v50 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v79, v32 + ; GCN-NEXT: v_fma_f32 v32, s4, v47, -v64 + ; GCN-NEXT: v_exp_f32_e32 v48, v48 + ; GCN-NEXT: v_exp_f32_e32 v49, v49 + ; GCN-NEXT: v_exp_f32_e32 v50, v50 + ; GCN-NEXT: v_mul_f32_e32 v51, 0x3fb8aa3b, v51 + ; GCN-NEXT: v_mul_f32_e32 v52, 0x3fb8aa3b, v52 + ; GCN-NEXT: v_mul_f32_e32 v53, 0x3fb8aa3b, v53 + ; GCN-NEXT: v_mul_f32_e32 v54, 0x3fb8aa3b, v54 + ; GCN-NEXT: v_mul_f32_e32 v55, 0x3fb8aa3b, v55 + ; GCN-NEXT: v_mul_f32_e32 v56, 0x3fb8aa3b, v56 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v51, v51 + ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v50 + ; GCN-NEXT: v_exp_f32_e32 v52, v52 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v49 + ; GCN-NEXT: v_exp_f32_e32 v53, v53 + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v48 + ; GCN-NEXT: v_exp_f32_e32 v54, v54 + ; 
GCN-NEXT: v_exp_f32_e32 v55, v55 + ; GCN-NEXT: v_exp_f32_e32 v56, v56 + ; GCN-NEXT: v_exp_f32_e32 v57, v57 + ; GCN-NEXT: v_exp_f32_e32 v80, v32 + ; GCN-NEXT: v_add_f32_e32 v32, 0, v48 + ; GCN-NEXT: v_add_f32_e32 v32, v49, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v50, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v65, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v51, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v52, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v53, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v54, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v55, v32 + ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v58 + ; GCN-NEXT: v_add_f32_e32 v32, v56, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v57, v32 + ; GCN-NEXT: v_exp_f32_e32 v58, v58 + ; GCN-NEXT: v_mul_f32_e32 v59, 0x3fb8aa3b, v59 + ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v60 + ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v61 + ; GCN-NEXT: v_add_f32_e32 v32, v58, v32 + ; GCN-NEXT: v_exp_f32_e32 v59, v59 + ; GCN-NEXT: v_exp_f32_e32 v60, v60 + ; GCN-NEXT: v_exp_f32_e32 v61, v61 + ; GCN-NEXT: v_mul_f32_e32 v62, 0x3fb8aa3b, v62 + ; GCN-NEXT: v_exp_f32_e32 v62, v62 + ; GCN-NEXT: v_add_f32_e32 v32, v59, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v60, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v61, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v62, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v63, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v66, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v67, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v68, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v69, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v70, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v71, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v72, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v73, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v74, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v75, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v76, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v77, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v78, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v79, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v80, v32 + ; GCN-NEXT: ds_bpermute_b32 v33, v105, v32 + ; GCN-NEXT: v_add_u32_e32 v38, s6, 
v94 + ; GCN-NEXT: v_and_b32_e32 v38, 0x1fffffff, v38 + ; GCN-NEXT: v_mul_lo_u32 v38, v38, s5 + ; GCN-NEXT: v_perm_b32 v34, v90, v88, s2 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_add_f32_e32 v32, v32, v33 + ; GCN-NEXT: ds_bpermute_b32 v33, v105, v32 + ; GCN-NEXT: v_perm_b32 v35, v90, v88, s3 + ; GCN-NEXT: v_perm_b32 v36, v91, v89, s2 + ; GCN-NEXT: v_perm_b32 v37, v91, v89, s3 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v33, v33, v32, s[0:1] + ; GCN-NEXT: v_sub_f32_e32 v32, v93, v64 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_exp_f32_e32 v32, v32 + ; GCN-NEXT: v_add_lshl_u32 v64, v95, v38, 1 + ; GCN-NEXT: v_lshl_add_u32 v81, v96, 1, v64 + ; GCN-NEXT: v_lshl_add_u32 v82, v97, 1, v81 + ; GCN-NEXT: v_lshl_add_u32 v83, v98, 1, v82 + ; GCN-NEXT: v_fmac_f32_e32 v33, v92, v32 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v70 + ; GCN-NEXT: ds_write_b32 v64, v34 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v71 + ; GCN-NEXT: ds_write_b32 v81, v35 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v72 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v17 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 - ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21 - ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v22 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25 - ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v26 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v27 - ; GCN-NEXT: v_max3_f32 v64, v64, v86, v87 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v28 
- ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v31 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v80, v84 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3 - ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v4 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5 - ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7 - ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11 - ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13 - ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 - ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 - ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 + ; GCN-NEXT: ds_write_b32 v82, v36 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v68 - ; GCN-NEXT: ; implicit-def: $vgpr84 - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v70, v64, v65 + ; GCN-NEXT: ds_write_b32 v83, v37 + ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[32:33] op_sel_hi:[1,0] + ; 
GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[32:33] op_sel_hi:[1,0] + ; GCN-NEXT: v_add_u32_e32 v32, v99, v106 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[48:49], v32, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v32, v100, v106 + ; GCN-NEXT: v_cvt_f16_f32_e32 v87, v51 + ; GCN-NEXT: buffer_load_dwordx2 v[50:51], v32, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1] - ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 - ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 - ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 - ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 - ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 - ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 - ; GCN-NEXT: v_fma_f32 v20, s4, 
v20, -v72 - ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 - ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 - ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 - ; GCN-NEXT: v_exp_f32_e32 v73, v16 - ; GCN-NEXT: v_exp_f32_e32 v74, v18 - ; GCN-NEXT: v_exp_f32_e32 v75, v19 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_exp_f32_e32 v80, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 - ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 - ; GCN-NEXT: v_exp_f32_e32 v81, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 - ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 - ; GCN-NEXT: v_exp_f32_e32 v82, v22 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 - ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 - ; GCN-NEXT: v_exp_f32_e32 v83, v23 - ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 - ; GCN-NEXT: v_exp_f32_e32 v85, v22 - ; GCN-NEXT: v_exp_f32_e32 v17, v17 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 - ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 - ; GCN-NEXT: v_exp_f32_e32 v88, v23 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 - ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 + ; GCN-NEXT: ds_read_b128 v[32:35], v101 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v16, v24 - ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 + ; GCN-NEXT: ds_read_b128 v[36:39], v101 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], 
v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 - ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 - ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 - ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 - ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v17, v18 - ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 - ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 - ; GCN-NEXT: v_exp_f32_e32 v30, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 - ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 - ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 - ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 - ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 - ; GCN-NEXT: ds_read_b128 v[26:29], v91 + ; GCN-NEXT: ds_read_b128 v[40:43], v102 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v91 offset:576 + ; GCN-NEXT: 
ds_read_b128 v[44:47], v102 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_perm_b32 v88, v50, v48, s2 + ; GCN-NEXT: v_perm_b32 v48, v50, v48, s3 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v31 - ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_exp_f32_e32 v31, v31 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86 - ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89 + ; GCN-NEXT: ds_write_b32 v64, v88 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v64 + ; GCN-NEXT: ds_write_b32 v81, v48 + ; GCN-NEXT: v_cvt_f16_f32_e32 v48, v65 + ; GCN-NEXT: v_perm_b32 v50, v51, v49, s2 + ; GCN-NEXT: v_perm_b32 v49, v51, v49, s3 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v90 + ; GCN-NEXT: ds_write_b32 v82, v50 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v65 - ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 - ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 - ; GCN-NEXT: v_exp_f32_e32 v22, v64 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 - ; GCN-NEXT: v_exp_f32_e32 v64, v65 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 - ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 - ; GCN-NEXT: v_exp_f32_e32 v23, v23 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 - ; GCN-NEXT: v_pack_b32_f16 
v1, v18, v19 - ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 - ; GCN-NEXT: v_exp_f32_e32 v25, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 - ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72 - ; GCN-NEXT: v_exp_f32_e32 v27, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 - ; GCN-NEXT: v_exp_f32_e32 v65, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 - ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 - ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 - ; GCN-NEXT: v_exp_f32_e32 v68, v2 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 + ; GCN-NEXT: ds_write_b32 v83, v49 + ; GCN-NEXT: v_pack_b32_f16 v49, v86, v48 + ; GCN-NEXT: v_pack_b32_f16 v48, v84, v85 + ; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 + ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 + ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v54 + ; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 + ; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 + ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 + ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v58 + ; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 + ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 + ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 + ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v62 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[48:49], v[16:31] + ; GCN-NEXT: v_pack_b32_f16 v51, v53, v50 + ; GCN-NEXT: v_pack_b32_f16 v50, v87, v52 + ; GCN-NEXT: v_pack_b32_f16 v53, v57, v54 + ; GCN-NEXT: v_pack_b32_f16 v52, v55, v56 + ; GCN-NEXT: v_pack_b32_f16 v55, v61, v58 + ; GCN-NEXT: v_pack_b32_f16 v54, v59, v60 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[50:51], v[16:31] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[40:41], v[52:53], v[16:31] + ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[42:43], v[54:55], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v43, v63 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[36:37], v[48:49], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v78 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[38:39], v[50:51], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v39, v66 + ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v68 + ; GCN-NEXT: v_pack_b32_f16 v41, v40, v38 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[52:53], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v79 + ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v77 + ; GCN-NEXT: v_pack_b32_f16 v40, v43, v39 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[54:55], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v48, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v70 + ; GCN-NEXT: v_cvt_f16_f32_e32 v42, v71 + ; GCN-NEXT: v_cvt_f16_f32_e32 v35, v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v75 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v76 + ; GCN-NEXT: v_pack_b32_f16 v43, v42, v35 + ; GCN-NEXT: v_pack_b32_f16 v42, v48, v37 + ; GCN-NEXT: v_pack_b32_f16 v35, v46, v33 + ; GCN-NEXT: v_pack_b32_f16 v33, v44, v32 + ; GCN-NEXT: v_pack_b32_f16 v32, v45, v36 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v84 + ; GCN-NEXT: ds_read_b128 v[36:39], v101 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v19 - ; GCN-NEXT: v_exp_f32_e32 v24, v24 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] + ; GCN-NEXT: ds_read_b128 v[36:39], v101 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] - ; 
GCN-NEXT: v_add_f32_e32 v17, v82, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27 - ; GCN-NEXT: v_exp_f32_e32 v26, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 - ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 - ; GCN-NEXT: v_exp_f32_e32 v67, v67 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 - ; GCN-NEXT: v_exp_f32_e32 v6, v6 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 - ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 - ; GCN-NEXT: v_exp_f32_e32 v7, v7 - ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v0, v85, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 - ; GCN-NEXT: v_exp_f32_e32 v10, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 - ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v64, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 - ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 - ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 - ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 - ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 - ; 
GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 - ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 - ; GCN-NEXT: v_exp_f32_e32 v8, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 - ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 - ; GCN-NEXT: v_exp_f32_e32 v9, v9 - ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 - ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 - ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 - ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 - ; GCN-NEXT: v_exp_f32_e32 v11, v11 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 - ; GCN-NEXT: v_exp_f32_e32 v12, v3 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 - ; GCN-NEXT: v_exp_f32_e32 v17, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 - ; GCN-NEXT: v_add_f32_e32 v0, v7, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 - ; GCN-NEXT: v_exp_f32_e32 v15, v3 - ; GCN-NEXT: v_exp_f32_e32 v18, v1 - ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v91 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[36:37], v[40:41], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[38:39], v[42:43], v[0:15] + ; GCN-NEXT: ds_read_b128 v[36:39], v102 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 - ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 - ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13 - ; GCN-NEXT: v_add_f32_e32 v6, v10, v6 - ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 - ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v12 - ; GCN-NEXT: v_add_f32_e32 v6, v11, v6 - ; GCN-NEXT: v_add_f32_e32 v6, v12, v6 - ; GCN-NEXT: v_add_f32_e32 v1, v15, v6 - ; GCN-NEXT: v_add_f32_e32 v11, v17, v1 - ; GCN-NEXT: v_pack_b32_f16 v1, v0, v7 - ; 
GCN-NEXT: v_pack_b32_f16 v0, v4, v10 - ; GCN-NEXT: ds_read_b128 v[4:7], v91 offset:576 + ; GCN-NEXT: v_cvt_f16_f32_e32 v47, v73 + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v74 + ; GCN-NEXT: v_pack_b32_f16 v34, v47, v34 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[34:35], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[32:33], v[16:31] + ; GCN-NEXT: s_nop 7 + ; GCN-NEXT: s_nop 2 + ; GCN-NEXT: ds_read_b128 v[16:19], v102 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[34:35], v[0:15] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mov_b32_e32 v4, 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] - ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[18:19], v[32:33], v[0:15] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 565ad295ebbb3..7f10ed31b3651 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -17,100 +17,100 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_mov_b32_e32 v3, 2.0 -; GCN-NEXT: ; iglp_opt mask(0x00000000) 
-; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 -; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:49152 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] +; GCN-NEXT: ds_read_b128 a[28:31], v4 
offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:24576 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(4) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 +; GCN-NEXT: ; iglp_opt mask(0x00000000) +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[64:67], v3 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, 
v1, a[64:95] +; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[32:35], v3 offset:8192 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:24608 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[128:131] +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16 +; 
GCN-NEXT: ds_write_b128 v0, a[64:67] ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; 
GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 ; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16400 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 ; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 @@ -156,62 +156,62 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 -; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v1 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96 +; 
GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v3 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[32:35], v3 offset:8192 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; GCN-NEXT: ds_read_b128 
a[92:95], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, 
v1, a[96:127] +; GCN-NEXT: ds_read_b128 a[156:159], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[152:155], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[148:151], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[144:147], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[128:131], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[132:135], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[136:139], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[140:143], v4 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 @@ -221,38 +221,38 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[80:83] 
offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592 +; GCN-NEXT: 
ds_write_b128 v0, a[152:155] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 ; GCN-NEXT: s_endpgm entry: call void @llvm.amdgcn.iglp.opt(i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir index e93595b9ef273..f079d0b8c392a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir @@ -135,17 +135,17 @@ body: | ; GCN-NEXT: [[DEF34:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_10]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in22, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[DEF35:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_11]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in23, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_9]].sub2_sub3, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_12]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7) - ; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_13]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[V_ADD_U32_e32_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF22]], implicit $exec ; GCN-NEXT: [[V_ADD_U32_e32_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF23]], implicit $exec + ; 
GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_12]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_13]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[DEF38:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_14]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in26, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[DEF39:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_15]], [[DEF47]], 0, 0, 0, 0, implicit $exec ; GCN-NEXT: [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_10]].sub0_sub1, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF2]], 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[V_ADD_U32_e32_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF45]], implicit $exec - ; GCN-NEXT: [[V_ADD_U32_e32_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF46]], implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF2]], 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DEF2:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_16]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in26, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_ADD_U32_e32_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF46]], implicit $exec ; GCN-NEXT: [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_10]].sub2_sub3, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF3]], 2064, 0, implicit $exec :: (store (s128) into %ir.in3, !alias.scope !0, addrspace 3) ; GCN-NEXT: 
[[DEF3:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF45]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in27, !alias.scope !0, addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir index a85478df10eb2..905cd0eaf52d3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir @@ -59,9 +59,10 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 0, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF1]] ; GCN-NEXT: [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 2064, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) - ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF1]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:areg_512_align2 = COPY [[DEF]] ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF33]], implicit $exec @@ -74,7 +75,6 @@ body: | ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF4]], [[DEF16]], 0, 0, implicit $exec :: (store (s128) into 
%ir.in6, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DEF16:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF6]], [[DEF7]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7) ; GCN-NEXT: dead [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[COPY1:%[0-9]+]]:areg_512_align2 = COPY [[DEF]] ; GCN-NEXT: undef [[DEF17:%[0-9]+]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF30]], implicit $exec ; GCN-NEXT: [[DEF17:%[0-9]+]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF30]], implicit $exec ; GCN-NEXT: [[DEF17:%[0-9]+]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF30]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 67ae05eb6f0b8..9d6f18dad3366 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -3231,47 +3231,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1.0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 -; 
NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s4 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s7 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s10 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s13 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 
v1, 2.0 -; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; NOLIT-SRCC-NEXT: s_nop 0 +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] +; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 @@ -3299,47 +3299,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1.0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s4 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 +; LIT-SRCC-NEXT: v_mov_b32_e32 
v2, s7 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s10 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s13 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 -; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; LIT-SRCC-NEXT: s_nop 0 +; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] +; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 @@ -3367,8 +3367,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX90A-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 
2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -3389,8 +3389,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 @@ -3403,8 +3403,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX942-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -3425,8 +3425,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] -; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v0, a[0:15] +; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v0, a[0:15] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 73586b1243376..0ca07a0a07dab 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -623,63 +623,63 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[128:131], v1 -; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; GCN-NEXT: ds_read_b128 
a[72:75], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_add_u32_e32 v3, s0, v0 +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v3 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; GCN-NEXT: 
ds_read_b128 a[88:91], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(14) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] ; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 4 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -730,63 +730,63 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216 -; 
EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v3, s0, v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], 
v3 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576 +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) -; EXACTCUTOFF-NEXT: 
v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] ; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 4 +; EXACTCUTOFF-NEXT: s_nop 3 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -1199,144 +1199,144 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 -; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b +; GCN-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f +; GCN-NEXT: v_mov_b32_e32 v6, 0x32a5705f ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, s0, v3 -; GCN-NEXT: v_rndne_f32_e32 v5, v4 -; GCN-NEXT: v_sub_f32_e32 v6, v4, v5 -; GCN-NEXT: v_fma_f32 v4, s0, v3, -v4 -; GCN-NEXT: v_fmac_f32_e32 v4, s0, v7 -; GCN-NEXT: v_add_f32_e32 v4, v6, v4 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GCN-NEXT: v_mul_f32_e32 v3, s0, v2 +; GCN-NEXT: v_rndne_f32_e32 v4, v3 +; GCN-NEXT: v_sub_f32_e32 v5, v3, v4 +; GCN-NEXT: v_fma_f32 v3, s0, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s0, v6 +; GCN-NEXT: v_add_f32_e32 v3, v5, v3 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_add_u32_e32 v1, s6, v0 -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:64 -; GCN-NEXT: 
ds_read_b128 a[96:99], v1 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:48 -; GCN-NEXT: v_mov_b32_e32 v9, 1.0 -; GCN-NEXT: v_ldexp_f32 v4, v4, v5 -; GCN-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; GCN-NEXT: v_mul_f32_e32 v10, s1, v3 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 -; GCN-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; GCN-NEXT: v_rndne_f32_e32 v11, v10 -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 +; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v1 +; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: v_mov_b32_e32 v5, 1.0 +; GCN-NEXT: v_ldexp_f32 v3, v3, v4 +; GCN-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 +; GCN-NEXT: v_mov_b32_e32 v7, 0x42b17218 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v7 ; GCN-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 -; GCN-NEXT: v_fma_f32 v10, s1, v3, -v10 -; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_fmac_f32_e32 v10, s1, v7 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v4, a[96:127] -; GCN-NEXT: v_add_f32_e32 v4, v12, v10 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: 
ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: v_ldexp_f32 v4, v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 -; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_mul_f32_e32 v10, s2, v3 -; GCN-NEXT: v_rndne_f32_e32 v11, v10 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] -; GCN-NEXT: v_fma_f32 v4, s2, v3, -v10 -; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 -; GCN-NEXT: v_fmac_f32_e32 v4, s2, v7 -; GCN-NEXT: v_add_f32_e32 v4, v12, v4 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; GCN-NEXT: v_ldexp_f32 v1, v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; GCN-NEXT: v_mul_f32_e32 v4, s3, v3 -; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_rndne_f32_e32 v10, v4 -; GCN-NEXT: s_load_dword s8, s[4:5], 0x54 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] -; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 
-; GCN-NEXT: v_fma_f32 v4, s3, v3, -v4 -; GCN-NEXT: v_fmac_f32_e32 v4, s3, v7 -; GCN-NEXT: v_add_f32_e32 v1, v1, v4 -; GCN-NEXT: v_exp_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; GCN-NEXT: ds_read_b128 a[156:159], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[152:155], v2 offset:57440 -; GCN-NEXT: v_ldexp_f32 v1, v1, v4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v4, s8, v3 -; GCN-NEXT: v_fma_f32 v3, s8, v3, -v4 -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63] -; GCN-NEXT: v_rndne_f32_e32 v1, v4 -; GCN-NEXT: v_sub_f32_e32 v10, v4, v1 -; GCN-NEXT: v_fmac_f32_e32 v3, s8, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v3, a[0:31] +; GCN-NEXT: v_mul_f32_e32 v3, s1, v2 +; GCN-NEXT: v_rndne_f32_e32 v9, v3 +; GCN-NEXT: v_sub_f32_e32 v10, v3, v9 +; GCN-NEXT: v_fma_f32 v3, s1, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s1, v6 ; GCN-NEXT: v_add_f32_e32 v3, v10, v3 ; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: ds_read_b128 a[148:151], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[144:147], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[128:131], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[132:135], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 -; GCN-NEXT: v_ldexp_f32 v1, v3, v1 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5 +; GCN-NEXT: v_cvt_i32_f32_e32 v9, v9 +; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 +; GCN-NEXT: ds_read_b128 
a[132:135], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; GCN-NEXT: v_ldexp_f32 v3, v3, v9 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v5, v3, a[128:159] +; GCN-NEXT: v_mul_f32_e32 v3, s2, v2 +; GCN-NEXT: v_rndne_f32_e32 v9, v3 +; GCN-NEXT: v_sub_f32_e32 v10, v3, v9 +; GCN-NEXT: v_fma_f32 v3, s2, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s2, v6 +; GCN-NEXT: v_add_f32_e32 v3, v10, v3 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v9, v9 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: v_ldexp_f32 v3, v3, v9 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 +; GCN-NEXT: s_waitcnt lgkmcnt(2) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v5, v3, a[96:127] +; GCN-NEXT: v_mul_f32_e32 v3, s3, v2 +; GCN-NEXT: v_rndne_f32_e32 v9, v3 +; GCN-NEXT: v_sub_f32_e32 v10, v3, v9 +; GCN-NEXT: v_fma_f32 v3, s3, v2, -v3 +; GCN-NEXT: v_fmac_f32_e32 v3, s3, v6 +; GCN-NEXT: v_add_f32_e32 v3, v10, v3 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v9, v9 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v1 
offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x54 +; GCN-NEXT: v_ldexp_f32 v3, v3, v9 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v5, v3, a[64:95] +; GCN-NEXT: v_mul_f32_e32 v3, s0, v2 +; GCN-NEXT: v_rndne_f32_e32 v9, v3 +; GCN-NEXT: v_fma_f32 v2, s0, v2, -v3 +; GCN-NEXT: v_sub_f32_e32 v10, v3, v9 +; GCN-NEXT: v_fmac_f32_e32 v2, s0, v6 +; GCN-NEXT: v_add_f32_e32 v2, v10, v2 +; GCN-NEXT: v_exp_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v9 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: v_ldexp_f32 v1, v2, v3 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v7 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_add_u32_e32 v0, s7, v0 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:48 -; GCN-NEXT: 
ds_write_b128 v0, a[104:107] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[96:99] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v5, v1, a[32:63] +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[0:3] ; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1347,181 +1347,181 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[56:59] 
offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[76:79] 
offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784 ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v6, 0x32a5705f ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s0, v3 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v5, v4 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v6, v4, v5 -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s0, v3, -v4 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s0, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s0, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v4, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v5, v3, v4 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s0, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s0, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v5, v3 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 
a[120:123], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:48 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v9, 1.0 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v5 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s1, v3 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 1.0 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v3, v3, v4 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x42b17218 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v7 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 -; EXACTCUTOFF-NEXT: v_fma_f32 v10, s1, v3, -v10 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v10, s1, v7 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, 
v4, a[96:127] -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v10 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s2, v3 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s2, v3, -v10 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s2, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v4 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 
offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s3, v3 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 -; EXACTCUTOFF-NEXT: s_load_dword s8, s[4:5], 0x54 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s3, v3, -v4 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s3, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v1, v4 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v2 offset:57440 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v4 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s8, v3 -; EXACTCUTOFF-NEXT: v_fma_f32 v3, s8, v3, -v4 -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63] -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v1, v4 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v4, v1 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s8, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:8288 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) 
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v5, v3, a[0:31] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s1, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v3, v9 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s1, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s1, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v10, v3 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v9 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 offset:8192 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v3, v3, v9 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v4 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v5, v3, a[128:159] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s2, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v3, v9 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s2, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s2, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v10, v3 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v9 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592 +; EXACTCUTOFF-NEXT: 
ds_read_b128 a[96:99], v1 offset:24576 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v3, v3, v9 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v4 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v5, v3, a[96:127] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s3, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v3, v9 +; EXACTCUTOFF-NEXT: v_fma_f32 v3, s3, v2, -v3 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s3, v6 ; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v10, v3 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v1, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v3, v1 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v9 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152 +; EXACTCUTOFF-NEXT: s_load_dword s0, s[4:5], 0x54 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v3, v3, v9 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v4 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; EXACTCUTOFF-NEXT: 
v_cmp_ngt_f32_e32 vcc, s3, v7 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:57456 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v5, v3, a[64:95] +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v3, s0, v2 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v3 +; EXACTCUTOFF-NEXT: v_fma_f32 v2, s0, v2, -v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v3, v9 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v2, s0, v6 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v2, v10, v2 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v2, v2 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v3, v9 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v2, v3 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v7 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s7, v0 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:32 -; EXACTCUTOFF-NEXT: 
ds_write_b128 v0, a[100:103] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v5, v1, a[32:63] +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s7 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1532,38 +1532,38 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; 
EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:8208 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:16416 +; 
EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:16400 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:24592 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:32784 ; EXACTCUTOFF-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir index 23412aaeb2e23..1eea49843d168 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir @@ -1988,8 +1988,8 @@ body: | ; GFX908: bb.0: ; GFX908-NEXT: successors: %bb.1(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode + ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode ; GFX908-NEXT: [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX908-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX908-NEXT: [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF @@ -2274,7 +2274,7 @@ body: | ; GFX908-NEXT: S_NOP 0, implicit [[DEF91]], implicit [[DEF92]], implicit [[DEF93]], implicit [[DEF94]], implicit [[DEF95]], implicit [[DEF96]], implicit [[DEF97]], implicit [[DEF98]], implicit [[DEF99]], implicit [[DEF100]] ; GFX908-NEXT: S_NOP 0, implicit [[DEF101]], implicit [[DEF102]], implicit [[DEF103]], implicit [[DEF104]], implicit [[DEF105]], implicit [[DEF106]], implicit [[DEF107]], implicit [[DEF108]], implicit [[DEF109]], implicit [[DEF110]] ; GFX908-NEXT: S_NOP 0, implicit [[DEF111]], implicit [[DEF112]], implicit [[DEF113]], implicit [[DEF114]], implicit [[DEF115]], implicit [[DEF116]], implicit [[DEF117]], implicit [[DEF118]], implicit [[DEF119]], implicit [[DEF120]] - ; GFX908-NEXT: S_NOP 0, implicit [[DEF121]], implicit [[DEF122]], implicit [[DEF123]], implicit [[DEF124]], implicit [[DEF125]], implicit [[DEF126]], implicit [[DEF127]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] + ; GFX908-NEXT: S_NOP 0, implicit [[DEF121]], implicit [[DEF122]], implicit [[DEF123]], implicit [[DEF124]], implicit [[DEF125]], implicit [[DEF126]], implicit [[DEF127]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: reduce_spill_agpr_above_addressable_limit diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 9cc42ac448067..03b42479f4b4a 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -2391,660 +2391,656 @@ define void 
@memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB3_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x34 -; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:232 -; 
ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen 
offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v24, v1, 
s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 
offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: s_clause 0xa -; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen 
offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:94 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, 
s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 
; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen 
offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:300 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 
0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen 
offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen 
offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: 
buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: 
buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: 
buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:237 
-; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:209 
-; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: 
buffer_store_byte v23, v0, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: 
buffer_store_byte v67, v0, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: 
buffer_store_byte v4, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte 
v32, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v84, v0, 
s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v56, v0, 
s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v108, v0, 
s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:129 @@ -10118,683 +10114,679 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 ; ALIGNED-NEXT: .LBB8_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253 +; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen 
offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen 
offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: s_clause 0x3a -; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v49, 
v1, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 
0 offen offset:137 -; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:252 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:27 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, 
s[0:3], 0 offen offset:44 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 +; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen 
offset:67 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:400 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, 
s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 
0 offen offset:23 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded 
Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen 
offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen 
offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: 
buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: 
buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: 
buffer_store_byte v5, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: 
buffer_store_byte v33, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: 
buffer_store_byte v85, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: 
buffer_store_byte v57, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: 
buffer_store_byte v109, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v2, v0, 
s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:227 -; 
ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:199 -; 
ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:171 -; 
ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:143 -; 
ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:121 @@ -11173,827 +11165,823 @@ define void 
@memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:252 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:27 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:205 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v65, 
v1, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 
0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: s_clause 0xa -; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x34 -; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122 -; 
ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:120 -; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:112 -; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:108 -; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:104 -; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:96 -; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:94 -; 
ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:92 -; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:88 -; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, 
s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:604 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen 
offset:115 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:208 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:139 +; 
ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:167 +; 
ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v85, 
v1, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 
0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen 
offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0xffffff00, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen 
offset:237 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:209 
+; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:181 +; 
ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:153 +; 
ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:125 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v2, v0, 
s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:118 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:115 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) 
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:110 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:109 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:106 ; 
ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:102 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:99 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_byte v2, v0, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:94 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:85 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen 
offset:208 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:79 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:77 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: 
buffer_store_byte v65, v0, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: 
buffer_store_byte v21, v0, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_store_byte v122, v0, 
s[0:3], 0 offen offset:120 -; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:112 -; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:108 -; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:104 -; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:96 -; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 
0 offen offset:92 -; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:88 -; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:73 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 21af2dde2c4bf..25b9f6e06eaf9 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -635,47 +635,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY908: ; %bb.0: ; 
%bb ; GREEDY908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0 +; GREEDY908-NEXT: v_mov_b32_e32 v1, 1.0 +; GREEDY908-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: v_mov_b32_e32 v5, s15 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s14 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s13 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s14 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s13 ; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s12 -; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s11 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s10 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s11 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s10 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s9 -; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s8 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s7 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s8 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s7 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s6 -; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s5 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s4 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s5 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s4 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s3 -; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 -; 
GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 -; GREEDY908-NEXT: v_mov_b32_e32 v1, s1 +; GREEDY908-NEXT: v_mov_b32_e32 v3, s2 +; GREEDY908-NEXT: v_mov_b32_e32 v2, s1 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s0 -; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 -; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 +; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v3 +; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5 -; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 -; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] +; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v1, v0, a[18:33] +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v1, v0, a[18:33] ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 @@ -719,8 +719,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; 
GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -742,14 +742,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v1, v0, a[18:33] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v1, v0, a[18:33] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] +; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 2 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 @@ -761,8 +761,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY942-NEXT: v_mov_b32_e32 v1, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -784,14 +784,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1 ; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v1, v0, a[18:33] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v1, v0, a[18:33] 
; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 0 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] +; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v0, a[0:15] ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 @@ -923,8 +923,8 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-LABEL: test_mfma_f32_4x4x1f32: ; GREEDY908: ; %bb.0: ; %bb ; GREEDY908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY908-NEXT: v_mov_b32_e32 v1, 1.0 +; GREEDY908-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 @@ -938,10 +938,10 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v3 ; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v5 ; GREEDY908-NEXT: s_nop 0 -; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] -; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3] +; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] +; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v1, v0, a[0:3] ; GREEDY908-NEXT: s_nop 1 -; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] ; GREEDY908-NEXT: s_nop 3 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 @@ -954,8 +954,8 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-LABEL: test_mfma_f32_4x4x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY90A-NEXT: 
v_mov_b32_e32 v1, 1.0 +; GREEDY90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 @@ -965,10 +965,10 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] -; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3] +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v1, v0, a[0:3] ; GREEDY90A-NEXT: s_nop 1 -; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] +; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] ; GREEDY90A-NEXT: s_nop 4 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] ; GREEDY90A-NEXT: s_endpgm @@ -976,8 +976,8 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-LABEL: test_mfma_f32_4x4x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 -; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 +; GREEDY942-NEXT: v_mov_b32_e32 v1, 1.0 +; GREEDY942-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 @@ -987,11 +987,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s2 ; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s3 ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v0, a[0:3] ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[2:5], v0, v1, a[0:3] +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[2:5], v1, v0, a[0:3] ; GREEDY942-NEXT: s_nop 1 -; GREEDY942-NEXT: 
v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v0, a[0:3] ; GREEDY942-NEXT: s_nop 3 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] ; GREEDY942-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 0c6339e4f5121..9b5bf35884bac 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -7,52 +7,52 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp ; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 7, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112 -; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16 +; CHECK-NEXT: global_load_dwordx4 v[30:33], v2, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[26:29], v2, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[22:25], v2, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[18:21], v2, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[14:17], v2, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[10:13], v2, s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 v[6:9], v2, s[0:1] offset:16 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] +; CHECK-NEXT: global_load_dwordx4 v[2:5], v2, s[0:1] ; 
CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v2 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v3 -; CHECK-NEXT: v_accvgpr_write_b32 a4, v4 -; CHECK-NEXT: v_accvgpr_write_b32 a5, v5 -; CHECK-NEXT: v_accvgpr_write_b32 a6, v6 -; CHECK-NEXT: v_accvgpr_write_b32 a7, v7 -; CHECK-NEXT: v_accvgpr_write_b32 a8, v8 -; CHECK-NEXT: v_accvgpr_write_b32 a9, v9 -; CHECK-NEXT: v_accvgpr_write_b32 a10, v10 -; CHECK-NEXT: v_accvgpr_write_b32 a11, v11 -; CHECK-NEXT: v_accvgpr_write_b32 a12, v12 -; CHECK-NEXT: v_accvgpr_write_b32 a13, v13 -; CHECK-NEXT: v_accvgpr_write_b32 a14, v14 -; CHECK-NEXT: v_accvgpr_write_b32 a15, v15 -; CHECK-NEXT: v_accvgpr_write_b32 a16, v16 -; CHECK-NEXT: v_accvgpr_write_b32 a17, v17 -; CHECK-NEXT: v_accvgpr_write_b32 a18, v18 -; CHECK-NEXT: v_accvgpr_write_b32 a19, v19 -; CHECK-NEXT: v_accvgpr_write_b32 a20, v20 -; CHECK-NEXT: v_accvgpr_write_b32 a21, v21 -; CHECK-NEXT: v_accvgpr_write_b32 a22, v22 -; CHECK-NEXT: v_accvgpr_write_b32 a23, v23 -; CHECK-NEXT: v_accvgpr_write_b32 a24, v24 -; CHECK-NEXT: v_accvgpr_write_b32 a25, v25 -; CHECK-NEXT: v_accvgpr_write_b32 a26, v26 -; CHECK-NEXT: v_accvgpr_write_b32 a27, v27 -; CHECK-NEXT: v_accvgpr_write_b32 a28, v28 -; CHECK-NEXT: v_accvgpr_write_b32 a29, v29 -; CHECK-NEXT: v_accvgpr_write_b32 a30, v30 -; CHECK-NEXT: v_accvgpr_write_b32 a31, v31 -; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 -; CHECK-NEXT: v_mov_b32_e32 v1, 2.0 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a3, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a4, v6 +; CHECK-NEXT: v_accvgpr_write_b32 a5, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a6, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a7, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a8, v10 +; CHECK-NEXT: v_accvgpr_write_b32 a9, v11 +; CHECK-NEXT: v_accvgpr_write_b32 a10, v12 +; CHECK-NEXT: v_accvgpr_write_b32 a11, 
v13 +; CHECK-NEXT: v_accvgpr_write_b32 a12, v14 +; CHECK-NEXT: v_accvgpr_write_b32 a13, v15 +; CHECK-NEXT: v_accvgpr_write_b32 a14, v16 +; CHECK-NEXT: v_accvgpr_write_b32 a15, v17 +; CHECK-NEXT: v_accvgpr_write_b32 a16, v18 +; CHECK-NEXT: v_accvgpr_write_b32 a17, v19 +; CHECK-NEXT: v_accvgpr_write_b32 a18, v20 +; CHECK-NEXT: v_accvgpr_write_b32 a19, v21 +; CHECK-NEXT: v_accvgpr_write_b32 a20, v22 +; CHECK-NEXT: v_accvgpr_write_b32 a21, v23 +; CHECK-NEXT: v_accvgpr_write_b32 a22, v24 +; CHECK-NEXT: v_accvgpr_write_b32 a23, v25 +; CHECK-NEXT: v_accvgpr_write_b32 a24, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a25, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a26, v28 +; CHECK-NEXT: v_accvgpr_write_b32 a27, v29 +; CHECK-NEXT: v_accvgpr_write_b32 a28, v30 +; CHECK-NEXT: v_accvgpr_write_b32 a29, v31 +; CHECK-NEXT: v_accvgpr_write_b32 a30, v32 +; CHECK-NEXT: v_accvgpr_write_b32 a31, v33 ; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] @@ -147,7 +147,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle( ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; CHECK-NEXT: v_mov_b32_e32 v1, 2.0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1.0 +; CHECK-NEXT: v_mov_b32_e32 v2, 2.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96 @@ -157,16 +158,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle( ; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32 ; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16 ; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] -; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; CHECK-NEXT: v_mfma_f32_32x32x1f32 
a[0:31], v0, v1, a[0:31] -; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] +; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; CHECK-NEXT: s_nop 7 ; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: s_nop 2 ; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir index 7295506213c4b..7eed89967adc0 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir @@ -161,16 +161,16 @@ body: | ; CHECK-LABEL: name: sched_barrier_mask_4 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: 
[[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: SCHED_BARRIER 4 @@ -209,19 +209,19 @@ body: | ; CHECK-LABEL: name: sched_barrier_mask_8 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit 
$exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias 
!0, addrspace 1) ; CHECK-NEXT: S_NOP 0 - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: SCHED_BARRIER 8 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -451,21 +451,21 @@ body: | ; CHECK-LABEL: name: sched_barrier_masks_8_12 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: 
[[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: SCHED_BARRIER 12 - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: SCHED_BARRIER 8 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, 
addrspace 1) - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir index d6774bb39dca7..d226f36928391 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir @@ -20,12 +20,12 @@ body: | ; GREEDY-LABEL: name: sched_group_barrier_2_VMEM_10_ALU_5_MFMA_2_VMEM_WRITE ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; GREEDY-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; GREEDY-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; GREEDY-NEXT: S_NOP 0 ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, 
!alias.scope !0, addrspace 1) - ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec @@ -44,12 +44,12 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_2_VMEM_10_ALU_5_MFMA_2_VMEM_WRITE ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; EXACT-NEXT: S_NOP 0 ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec @@ -132,9 +132,9 @@ body: | ; EXACT-LABEL: name: 
sched_group_barrier_MFMA_VALU_and_SALU_alternating ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec @@ -212,8 +212,8 @@ body: | ; GREEDY-LABEL: name: sched_group_barrier_2_separate_pipes ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; GREEDY-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = 
V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec @@ -223,8 +223,8 @@ body: | ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; GREEDY-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; GREEDY-NEXT: SCHED_GROUP_BARRIER 16, 2, 0 @@ -238,8 +238,8 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_2_separate_pipes ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, 
implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec @@ -300,12 +300,12 @@ body: | ; GREEDY-LABEL: name: sched_group_barrier_3_separate_pipes ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; GREEDY-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; GREEDY-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; GREEDY-NEXT: S_NOP 0 ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: 
[[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec @@ -330,8 +330,8 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_3_separate_pipes ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir index 4f844762b24e3..90a79f8ecbe74 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir @@ -20,18 +20,18 @@ body: | ; CHECK-LABEL: name: no_sched_group_barrier ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, 
implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 
= nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -40,18 +40,18 @@ body: | ; EXACT-LABEL: name: no_sched_group_barrier ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], 
[[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; EXACT-NEXT: S_NOP 0 ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec - ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit 
$mode, implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -84,8 +84,8 @@ body: | ; CHECK-LABEL: name: sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec @@ -110,8 +110,8 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; EXACT-NEXT: 
[[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec @@ -172,18 +172,18 @@ body: | ; CHECK-LABEL: name: sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; CHECK-NEXT: 
[[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -196,18 +196,18 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = 
IMPLICIT_DEF + ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - ; EXACT-NEXT: S_NOP 0 ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; EXACT-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; EXACT-NEXT: S_NOP 0 + ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, 
implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; EXACT-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; EXACT-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec ; EXACT-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) @@ -252,12 +252,12 @@ body: | ; CHECK-LABEL: name: sched_group_barrier_MFMA_VALU_and_SALU_alternating ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], 
[[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec @@ -284,9 +284,9 @@ body: | ; EXACT-LABEL: name: sched_group_barrier_MFMA_VALU_and_SALU_alternating ; EXACT: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; EXACT-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) ; EXACT-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; EXACT-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF ; EXACT-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec ; EXACT-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec