From fb9754d32ca18236960bc5014d6515c0099466da Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes
Date: Mon, 7 Jul 2025 13:32:33 -0700
Subject: [PATCH 1/2] [InlineSpiller] Do not fold undef copies

Change-Id: I134bc03e7eeb7f5324c1ce7005b96b28d1b01643
---
 llvm/lib/CodeGen/InlineSpiller.cpp            |  3 +
 .../AMDGPU/regalloc-undef-copy-fold.mir       | 80 +++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir

diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index ee08dc42dce57..6b8b8731e6b56 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -946,6 +946,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
     if (MO.isUse() && !MO.readsReg() && !MO.isTied())
       continue;
 
+    if (MI->isCopy() && MI->getOperand(1).isUndef())
+      continue;
+
     if (MO.isImplicit()) {
       ImpReg = MO.getReg();
       continue;
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir b/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir
new file mode 100644
index 0000000000000..440753fc88277
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir
@@ -0,0 +1,80 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs --start-before=greedy,2 --stop-after=greedy,2 %s -o - | FileCheck %s
+
+# Make sure there's no machine verifier error
+
+# If RA is unable to find a register to allocate, then cleanupFailedVReg will do ad-hoc rewriting and will insert undefs to make the LiveRanges workable.
+# %30:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 is an example of such a rewrite / undef. If we want to spill %30, we should not insert
+# actual spill code, as the source operand is undef.
+# Check that there are no verifier issues with the LiveRange of $vgpr0_vgpr1_vgpr2_vgpr3 / that we do not insert spill code for %30.
+
+
+--- |
+  define void @foo() #0 {
+    ret void
+  }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="8,8" }
+
+...
+ +--- +name: foo +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 32, alignment: 4 } +machineFunctionInfo: + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledVGPRs: true + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + bb.0: + ; CHECK-LABEL: name: foo + ; CHECK: INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 10 /* regdef */, def %10, 10 /* regdef */, def %1, 10 /* regdef */, def %2, 10 /* regdef */, def $vgpr0_vgpr1_vgpr2_vgpr3, 10 /* regdef */, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: SI_SPILL_AV128_SAVE [[COPY]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV160_SAVE %2, %stack.1, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV256_SAVE %1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_512 = COPY %10 + ; CHECK-NEXT: SI_SPILL_V512_SAVE [[COPY1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_512 = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK-NEXT: SI_SPILL_V512_SAVE [[COPY2]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; CHECK-NEXT: [[SI_SPILL_V512_RESTORE:%[0-9]+]]:vreg_512 = SI_SPILL_V512_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = COPY [[SI_SPILL_V512_RESTORE]] + ; CHECK-NEXT: [[SI_SPILL_V512_RESTORE1:%[0-9]+]]:vreg_512 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V512_SAVE [[SI_SPILL_V512_RESTORE1]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV256_RESTORE:%[0-9]+]]:vreg_256 = SI_SPILL_AV256_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V256_SAVE [[SI_SPILL_AV256_RESTORE]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV160_RESTORE:%[0-9]+]]:vreg_160 = SI_SPILL_AV160_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; 
CHECK-NEXT: [[SI_SPILL_AV512_RESTORE:%[0-9]+]]:av_512 = SI_SPILL_AV512_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V256_RESTORE:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: INLINEASM &"; use $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 9 /* reguse */, [[SI_SPILL_AV512_RESTORE]], 9 /* reguse */, [[SI_SPILL_V256_RESTORE]], 9 /* reguse */, [[SI_SPILL_AV160_RESTORE]], 9 /* reguse */, undef $vgpr0_vgpr1_vgpr2_vgpr3, 9 /* reguse */, $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 + ; CHECK-NEXT: SI_RETURN + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 10, def %22:vreg_512, 10, def %25:vreg_256, 10, def %28:vreg_160, 10, def $vgpr0_vgpr1_vgpr2_vgpr3, 10, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %30:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 + %27:av_160 = COPY %28:vreg_160 + %24:av_256 = COPY %25:vreg_256 + SI_SPILL_V512_SAVE %22:vreg_512, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) + %18:vreg_512 = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 10, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = COPY %18:vreg_512 + %23:vreg_512 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) + %26:vreg_256 = COPY %24:av_256 + %29:vreg_160 = COPY %27:av_160 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %30:av_128 + INLINEASM &"; use $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 9, %23:vreg_512, 9, %26:vreg_256, 9, %29:vreg_160, 9, undef $vgpr0_vgpr1_vgpr2_vgpr3, 9, $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 + SI_RETURN + +... 
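
The guard added above sits in the per-operand loop of
InlineSpiller::foldMemoryOperand(). For a plain COPY, operand 0 is the
destination and operand 1 is the source, so the check skips folding whenever
the source is already flagged undef: there is no value to store, and emitting
spill code for it would only create live-range and verifier trouble around the
failed physical register. A rough sketch of the surrounding loop, abbreviated
and with names matching the existing code (not the literal function body):

    // Sketch: the operand walk inside InlineSpiller::foldMemoryOperand().
    for (const auto &OpPair : Ops) {
      MachineInstr *MI = OpPair.first;
      MachineOperand &MO = MI->getOperand(OpPair.second);

      // Untied undef reads never need a restore.
      if (MO.isUse() && !MO.readsReg() && !MO.isTied())
        continue;

      // A COPY whose source operand is undef (e.g. one produced by
      // cleanupFailedVReg after an allocation failure) carries no value,
      // so do not fold it into a stack store.
      if (MI->isCopy() && MI->getOperand(1).isUndef())
        continue;

      // ... remaining folding logic ...
    }
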
From adff47f01b90bf498bbcf6b3255085891d201047 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes
Date: Tue, 8 Jul 2025 16:20:53 -0700
Subject: [PATCH 2/2] Use IMPLICIT_DEF instead of undef COPYs

Change-Id: Icfd4d568221b9d2425ebd7568c06b48c6a8ecdc9
---
 llvm/lib/CodeGen/InlineSpiller.cpp            |   3 -
 llvm/lib/CodeGen/RegAllocBase.cpp             |  20 ++-
 llvm/lib/CodeGen/RegAllocBase.h               |   1 +
 ...-reg-class-snippet-copy-use-after-free.mir |   8 +-
 .../AMDGPU/regalloc-undef-copy-fold.mir       | 155 +++++++++++-------
 5 files changed, 119 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 6b8b8731e6b56..ee08dc42dce57 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -946,9 +946,6 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
     if (MO.isUse() && !MO.readsReg() && !MO.isTied())
       continue;
 
-    if (MI->isCopy() && MI->getOperand(1).isUndef())
-      continue;
-
     if (MO.isImplicit()) {
       ImpReg = MO.getReg();
       continue;
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
index 69b92917399fd..c1517d26d7942 100644
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Spiller.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -60,6 +61,7 @@ void RegAllocBase::init(VirtRegMap &vrm, LiveIntervals &lis,
                         LiveRegMatrix &mat) {
   TRI = &vrm.getTargetRegInfo();
   MRI = &vrm.getRegInfo();
+  TII = vrm.getMachineFunction().getSubtarget().getInstrInfo();
   VRM = &vrm;
   LIS = &lis;
   Matrix = &mat;
@@ -167,9 +169,15 @@ void RegAllocBase::cleanupFailedVReg(Register FailedReg, MCRegister PhysReg,
   // We still should produce valid IR. Kill all the uses and reduce the live
   // ranges so that we don't think it's possible to introduce kill flags later
   // which will fail the verifier.
+
+  SmallVector<MachineInstr *> UndefCopies;
+
   for (MachineOperand &MO : MRI->reg_operands(FailedReg)) {
-    if (MO.readsReg())
+    if (MO.readsReg()) {
       MO.setIsUndef(true);
+      if (MO.getParent()->isCopy() && MO.isUse())
+        UndefCopies.push_back(MO.getParent());
+    }
   }
 
   if (!MRI->isReserved(PhysReg)) {
@@ -180,12 +188,22 @@
       for (MachineOperand &MO : MRI->reg_operands(*Aliases)) {
         if (MO.readsReg()) {
           MO.setIsUndef(true);
+          if (MO.getParent()->isCopy() && MO.isUse())
+            UndefCopies.push_back(MO.getParent());
           LIS->removeAllRegUnitsForPhysReg(MO.getReg());
         }
       }
     }
   }
 
+  // If we have produced an undef copy, convert to IMPLICIT_DEF.
+  for (MachineInstr *UndefCopy : UndefCopies) {
+    assert(UndefCopy->isCopy() && UndefCopy->getNumOperands() == 2);
+    const MCInstrDesc &Desc = TII->get(TargetOpcode::IMPLICIT_DEF);
+    UndefCopy->removeOperand(1);
+    UndefCopy->setDesc(Desc);
+  }
+
   // Directly perform the rewrite, and do not leave it to VirtRegRewriter as
   // usual. This avoids trying to manage illegal overlapping assignments in
   // LiveRegMatrix.
diff --git a/llvm/lib/CodeGen/RegAllocBase.h b/llvm/lib/CodeGen/RegAllocBase.h index f1b5af8cd4d74..58743283458a6 100644 --- a/llvm/lib/CodeGen/RegAllocBase.h +++ b/llvm/lib/CodeGen/RegAllocBase.h @@ -65,6 +65,7 @@ class RegAllocBase { protected: const TargetRegisterInfo *TRI = nullptr; + const TargetInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; VirtRegMap *VRM = nullptr; LiveIntervals *LIS = nullptr; diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir index c1e0d0716acae..d12f5d7ef2315 100644 --- a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir +++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir @@ -29,8 +29,8 @@ # CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) # CHECK-NEXT: [[RESTORE0:%[0-9]+]]:vreg_512_align2 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) # CHECK-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[RESTORE0]], 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec -# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 { -# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0 +# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = IMPLICIT_DEF { +# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = IMPLICIT_DEF # CHECK-NEXT: } # CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 { # CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0 @@ -120,8 +120,8 @@ body: | # CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) # CHECK-NEXT: [[RESTORE_0:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) # CHECK-NEXT: S_NOP 0, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit [[RESTORE_0]].sub0_sub1_sub2_sub3, implicit [[RESTORE_0]].sub4_sub5_sub6_sub7 -# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 { -# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0 +# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = IMPLICIT_DEF { +# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = IMPLICIT_DEF # CHECK-NEXT: } # CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 { # CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0 diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir b/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir index 440753fc88277..ce1bb1f6f4b16 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir @@ -1,80 +1,115 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs --start-before=greedy,2 --stop-after=greedy,2 %s -o - | FileCheck %s +# RUN: not llc -mtriple=amdgcn 
-mcpu=gfx908 -verify-machineinstrs -start-before=greedy,1 -stop-after=virtregrewriter,2 -o - -verify-regalloc %s 2> %t.err | FileCheck %s +# RUN: FileCheck -check-prefix=ERR %s < %t.err -# Make sure there's no machine verifier error +# Make sure there's no machine verifier error after failure. -# If RA is unable to find a register to allocate, then cleanupFailedVReg will do ad-hoc rewriting and will insert undefs to make the LiveRanges workable. -# %30:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 is an example of such a rewrite / undef. If we were to want to spill %30, we should not be inserting -# actual spill code, as the source operand is undef. -# Check that there are no verfier issues with the LiveRange of $vgpr0_vgpr1_vgpr2_vgpr3 / that we do not insert spill code for %30. +# ERR: error: inline assembly requires more registers than available +# ERR: error: inline assembly requires more registers than available +# ERR: error: inline assembly requires more registers than available +# ERR: error: inline assembly requires more registers than available +# This testcase cannot be compiled with the enforced register +# budget. Previously, tryLastChanceRecoloring would assert here. It +# was attempting to recolor a superregister with an overlapping +# subregister over the same range. --- | - define void @foo() #0 { + define void @dead_copy() #0 { ret void } - attributes #0 = { "amdgpu-waves-per-eu"="8,8" } + define void @copy_kill() #0 { + ret void + } + + define void @copy_subreg() #0 { + ret void + } + + define void @copy_subreg2() #0 { + ret void + } + + attributes #0 = { "amdgpu-num-vgpr"="6" } + +... + +# CHECK-LABEL: name: dead_copy +# CHECK: renamable $agpr0_agpr1_agpr2 = IMPLICIT_DEF +# CHECK: dead renamable $vgpr0_vgpr1_vgpr2 = COPY renamable $agpr0_agpr1_agpr2 + +--- +name: dead_copy +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 11534346 /* regdef:VReg_512 */, def %0:vreg_512, 10158090 /* regdef:VReg_256 */, def %1:vreg_256, 4784138 /* regdef:VReg_128 */, def %2:vreg_128, 3670026 /* regdef:VReg_96 */, def %3:vreg_96, 3670026 /* regdef:VReg_96 */, def %4:vreg_96 + %5:vreg_96 = COPY %3 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %3 + SI_RETURN + +... + + +# CHECK-LABEL: name: copy_kill +# CHECK: renamable $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF +--- +name: copy_kill +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 11534346 /* regdef:VReg_512 */, def %0:vreg_512, 10158090 /* regdef:VReg_256 */, def %1:vreg_256, 4784138 /* regdef:VReg_128 */, def %2:vreg_128, 3670026 /* regdef:VReg_96 */, def %3:vreg_96, 3670026 /* regdef:VReg_96 */, def %4:vreg_96 + %5:vreg_96 = COPY %3 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %5 + SI_RETURN ... 
+# CHECK-LABEL: name: copy_subreg +# CHECK: renamable $vgpr1_vgpr2 = IMPLICIT_DEF +# CHECK: renamable $vgpr0 = COPY renamable $vgpr1 --- -name: foo +name: copy_subreg tracksRegLiveness: true -stack: - - { id: 0, type: spill-slot, size: 32, alignment: 4 } machineFunctionInfo: - maxKernArgAlign: 4 - isEntryFunction: true - waveLimiter: true - scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledVGPRs: true - argumentInfo: - privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } - dispatchPtr: { reg: '$sgpr4_sgpr5' } - kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } - workGroupIDX: { reg: '$sgpr8' } - privateSegmentWaveByteOffset: { reg: '$sgpr9' } body: | bb.0: - ; CHECK-LABEL: name: foo - ; CHECK: INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 10 /* regdef */, def %10, 10 /* regdef */, def %1, 10 /* regdef */, def %2, 10 /* regdef */, def $vgpr0_vgpr1_vgpr2_vgpr3, 10 /* regdef */, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK-NEXT: SI_SPILL_AV128_SAVE [[COPY]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_AV160_SAVE %2, %stack.1, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_AV256_SAVE %1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_512 = COPY %10 - ; CHECK-NEXT: SI_SPILL_V512_SAVE [[COPY1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_512 = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; CHECK-NEXT: SI_SPILL_V512_SAVE [[COPY2]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.6, align 4, addrspace 5) - ; CHECK-NEXT: INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - ; CHECK-NEXT: [[SI_SPILL_V512_RESTORE:%[0-9]+]]:vreg_512 = SI_SPILL_V512_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.6, align 4, addrspace 5) - ; CHECK-NEXT: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = COPY [[SI_SPILL_V512_RESTORE]] - ; CHECK-NEXT: [[SI_SPILL_V512_RESTORE1:%[0-9]+]]:vreg_512 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V512_SAVE [[SI_SPILL_V512_RESTORE1]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_AV256_RESTORE:%[0-9]+]]:vreg_256 = SI_SPILL_AV256_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V256_SAVE [[SI_SPILL_AV256_RESTORE]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.5, align 4, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_AV160_RESTORE:%[0-9]+]]:vreg_160 = SI_SPILL_AV160_RESTORE %stack.1, $sgpr32, 0, implicit $exec 
:: (load (s160) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_AV512_RESTORE:%[0-9]+]]:av_512 = SI_SPILL_AV512_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V256_RESTORE:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.5, align 4, addrspace 5) - ; CHECK-NEXT: INLINEASM &"; use $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 9 /* reguse */, [[SI_SPILL_AV512_RESTORE]], 9 /* reguse */, [[SI_SPILL_V256_RESTORE]], 9 /* reguse */, [[SI_SPILL_AV160_RESTORE]], 9 /* reguse */, undef $vgpr0_vgpr1_vgpr2_vgpr3, 9 /* reguse */, $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 - ; CHECK-NEXT: SI_RETURN - INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 10, def %22:vreg_512, 10, def %25:vreg_256, 10, def %28:vreg_160, 10, def $vgpr0_vgpr1_vgpr2_vgpr3, 10, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - %30:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 - %27:av_160 = COPY %28:vreg_160 - %24:av_256 = COPY %25:vreg_256 - SI_SPILL_V512_SAVE %22:vreg_512, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) - %18:vreg_512 = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 10, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = COPY %18:vreg_512 - %23:vreg_512 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) - %26:vreg_256 = COPY %24:av_256 - %29:vreg_160 = COPY %27:av_160 - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %30:av_128 - INLINEASM &"; use $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 9, %23:vreg_512, 9, %26:vreg_256, 9, %29:vreg_160, 9, undef $vgpr0_vgpr1_vgpr2_vgpr3, 9, $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 + + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 11534346 /* regdef:VReg_512 */, def %0:vreg_512, 10158090 /* regdef:VReg_256 */, def %1:vreg_256, 4784138 /* regdef:VReg_128 */, def %2:vreg_128, 3670026 /* regdef:VReg_96 */, def %3:vreg_96, 3670026 /* regdef:VReg_96 */, def %4:vreg_96 + %3.sub0 = COPY %3.sub1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %3 SI_RETURN +... 
+ +# CHECK-LABEL: name: copy_subreg2 +# CHECK: renamable $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF +--- +name: copy_subreg2 +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 11534346 /* regdef:VReg_512 */, def %0:vreg_512, 10158090 /* regdef:VReg_256 */, def %1:vreg_256, 4784138 /* regdef:VReg_128 */, def %2:vreg_128, 3670026 /* regdef:VReg_96 */, def %3:vreg_96, 3670026 /* regdef:VReg_96 */, def %4:vreg_96 + undef %5.sub0:vreg_96 = COPY %3.sub0 + %5.sub1_sub2:vreg_96 = COPY %3.sub1_sub2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %5 + SI_RETURN ...
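
With this second patch the undef copies created by cleanupFailedVReg() are
rewritten into IMPLICIT_DEF right where they are produced, so the spiller
never sees a COPY with an undef source and the InlineSpiller guard from the
first patch is dropped again. The rewrite keeps the def operand and only
removes the undef source, e.g. turning
"%30:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3" into
"%30:av_128 = IMPLICIT_DEF". Expressed as a free-standing helper (hypothetical
name, shown only for illustration), the in-place conversion looks roughly
like:

    // Sketch: convert an undef COPY into an IMPLICIT_DEF in place.
    static void convertUndefCopyToImplicitDef(MachineInstr &MI,
                                              const TargetInstrInfo &TII) {
      assert(MI.isCopy() && MI.getOperand(1).isUndef() &&
             "expected a COPY with an undef source");
      MI.removeOperand(1);                             // drop the undef source
      MI.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); // keep only the def
    }

Because the resulting IMPLICIT_DEF no longer reads the unallocatable physical
register, the ordinary spilling and rewriting paths can handle it without any
copy-specific special cases.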