Skip to content

Commit 59a7ae9

Browse files
arsenmmemfrob
authored and committed
AMDGPU: Fix redundant FP spilling/assert in some functions
If a function has stack objects, and a call, we require an FP. If we did not initially have any stack objects, and only introduced them during PrologEpilogInserter for CSR VGPR spills, SILowerSGPRSpills would end up spilling the FP register as if it were a normal register. This would result in an assert in a debug build, or redundant handling of the FP register in a release build. Try to predict that we will have an FP later, although this is ugly.
1 parent 784d83b commit 59a7ae9

File tree

2 files changed

+126
-0
lines changed

2 files changed

+126
-0
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1333,7 +1333,21 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
13331333

13341334
// The SP is specifically managed and we don't want extra spills of it.
13351335
SavedRegs.reset(MFI->getStackPtrOffsetReg());
1336+
1337+
const BitVector AllSavedRegs = SavedRegs;
13361338
SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
1339+
1340+
// If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
1341+
const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
1342+
1343+
// We have to anticipate introducing CSR VGPR spills if we don't have any
1344+
// stack objects already, since we require an FP if there is a call and stack.
1345+
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1346+
const bool WillHaveFP = FrameInfo.hasCalls() && HaveAnyCSRVGPR;
1347+
1348+
// FP will be specially managed like SP.
1349+
if (WillHaveFP || hasFP(MF))
1350+
SavedRegs.reset(MFI->getFrameOffsetReg());
13371351
}
13381352

13391353
bool SIFrameLowering::assignCalleeSavedSpillSlots(
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
3+
4+
; FP is in CSR range, modified.
define hidden fastcc void @callee_has_fp() #1 {
; CHECK-LABEL: callee_has_fp:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_add_u32 s32, s32, 0x200
; CHECK-NEXT:    v_mov_b32_e32 v0, 1
; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_sub_u32 s32, s32, 0x200
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca i32, addrspace(5)
  store volatile i32 1, i32 addrspace(5)* %alloca
  ret void
}
22+
23+
; Has no stack objects, but introduces them due to the CSR spill. We
; see the FP modified in the callee with IPRA. We should not have
; redundant spills of s33 or assert.
define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_callee:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s8, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_add_u32 s32, s32, 0x400
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 s[6:7], s[30:31]
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT:    ;;#ASMSTART
; CHECK-NEXT:    ; clobber csr v40
; CHECK-NEXT:    ;;#ASMEND
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    s_sub_u32 s32, s32, 0x400
; CHECK-NEXT:    s_mov_b32 s33, s8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[6:7]
bb:
  call fastcc void @callee_has_fp()
  call void asm sideeffect "; clobber csr v40", "~{v40}"()
  ret void
}
52+
53+
define amdgpu_kernel void @kernel_call() {
; CHECK-LABEL: kernel_call:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; CHECK-NEXT:    s_add_u32 s0, s0, s7
; CHECK-NEXT:    s_addc_u32 s1, s1, 0
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, csr_vgpr_spill_fp_callee@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, csr_vgpr_spill_fp_callee@rel32@hi+12
; CHECK-NEXT:    s_mov_b32 s32, 0
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT:    s_endpgm
bb:
  tail call fastcc void @csr_vgpr_spill_fp_callee()
  ret void
}
70+
71+
; Same, except with a tail call.
define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT:    ;;#ASMSTART
; CHECK-NEXT:    ; clobber csr v40
; CHECK-NEXT:    ;;#ASMEND
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT:    v_writelane_b32 v1, s33, 0
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
; CHECK-NEXT:    v_readlane_b32 s33, v1, 0
; CHECK-NEXT:    s_setpc_b64 s[4:5]
bb:
  call void asm sideeffect "; clobber csr v40", "~{v40}"()
  tail call fastcc void @callee_has_fp()
  ret void
}
92+
93+
define amdgpu_kernel void @kernel_tailcall() {
; CHECK-LABEL: kernel_tailcall:
; CHECK:       ; %bb.0: ; %bb
; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; CHECK-NEXT:    s_add_u32 s0, s0, s7
; CHECK-NEXT:    s_addc_u32 s1, s1, 0
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
; CHECK-NEXT:    s_mov_b32 s32, 0
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT:    s_endpgm
bb:
  tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()
  ret void
}
110+
111+
attributes #0 = { "frame-pointer"="none" noinline }
attributes #1 = { "frame-pointer"="all" noinline }

0 commit comments

Comments
 (0)