Commit 611a648
[AMDGPU] Add llvm.amdgcn.dead intrinsic (#123190)
Shaders that use the llvm.amdgcn.init.whole.wave intrinsic need to explicitly preserve the inactive lanes of VGPRs of interest by adding them as dummy arguments. The code usually looks something like this:

```llvm
define amdgpu_cs_chain void @f(active vgpr args..., i32 %inactive.vgpr1, ..., i32 %inactive.vgprN) {
entry:
  %c = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %c, label %shader, label %tail

shader:
  [...]

tail:
  %inactive.vgpr.arg1 = phi i32 [ %inactive.vgpr1, %entry ], [ poison, %shader ]
  [...]
  ; the %inactive.vgpr* values then get passed into a llvm.amdgcn.cs.chain call
}
```

Unfortunately, this kind of phi node will get optimized away and the backend won't be able to figure out that it's OK to use the active lanes of `%inactive.vgpr*` inside `shader`.

This patch fixes the issue by introducing an llvm.amdgcn.dead intrinsic, whose result can be used as a PHI operand instead of poison. It is selected to an IMPLICIT_DEF, which the backend can work with.

At the moment, the llvm.amdgcn.dead intrinsic works only on i32 values. Support for other types can be added later if needed.
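With the intrinsic, the phi's incoming value from the shader block becomes the intrinsic's result rather than poison, so after selection the backend sees an IMPLICIT_DEF and is free to reuse the register's active lanes inside the shader block. A minimal sketch of the fixed pattern (argument names and block structure are illustrative, modeled on the tests added in this commit):

```llvm
declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()

define amdgpu_cs_chain void @f(i32 %active.vgpr, i32 %inactive.vgpr) {
entry:
  ; Explicitly dead value to stand in for %inactive.vgpr on the %shader path.
  %dead = call i32 @llvm.amdgcn.dead.i32()
  %c = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %c, label %shader, label %tail

shader:
  ; ... computations may freely clobber the active lanes of %inactive.vgpr ...
  br label %tail

tail:
  %inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ %dead, %shader ]
  ; %inactive.vgpr.arg would then be passed on via llvm.amdgcn.cs.chain
  unreachable
}
```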
1 parent 44dc572 commit 611a648

File tree

8 files changed: +231, -1 lines changed


llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 0 deletions

@@ -3463,4 +3463,11 @@ def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
   [llvm_anyptr_ty], [llvm_anyptr_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
+
+/// Make it clear to the backend that this value is really dead. For instance,
+/// when used as an input to a phi node, it will make it possible for the
+/// backend to allocate the dead lanes for operations within the corresponding
+/// incoming block.
+def int_amdgcn_dead: DefaultAttrsIntrinsic<[llvm_any_ty], [],
+    [IntrNoMem, IntrWillReturn, IntrNoCallback]>;
 }
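Because the definition uses llvm_any_ty, the intrinsic is overloaded and call sites mangle the result type into the name; the only instance exercised by this patch is i32, i.e. llvm.amdgcn.dead.i32. A minimal usage sketch (function name illustrative):

```llvm
declare i32 @llvm.amdgcn.dead.i32()

define i32 @dead_example() {
  ; Produces a value the backend treats as undefined; per this patch it is
  ; selected to an IMPLICIT_DEF constrained to a 32-bit VGPR.
  %v = call i32 @llvm.amdgcn.dead.i32()
  ret i32 %v
}
```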

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 6 additions & 0 deletions

@@ -1190,6 +1190,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
   case Intrinsic::amdgcn_permlane16_swap:
   case Intrinsic::amdgcn_permlane32_swap:
     return selectPermlaneSwapIntrin(I, IntrinsicID);
+  case Intrinsic::amdgcn_dead: {
+    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+    I.removeOperand(1); // drop intrinsic ID
+    return RBI.constrainGenericRegister(I.getOperand(0).getReg(),
+                                        AMDGPU::VGPR_32RegClass, *MRI);
+  }
   default:
     return selectImpl(I, *CoverageInfo);
   }

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions

@@ -4676,6 +4676,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case Intrinsic::amdgcn_set_inactive_chain_arg:
   case Intrinsic::amdgcn_permlane64:
   case Intrinsic::amdgcn_ds_bpermute_fi_b32:
+  case Intrinsic::amdgcn_dead:
     return getDefaultMappingAllVGPR(MI);
   case Intrinsic::amdgcn_cvt_pkrtz:
     if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 2 additions & 0 deletions

@@ -362,6 +362,8 @@ def : SourceOfDivergence<int_amdgcn_inverse_ballot>;
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;
 
+def : SourceOfDivergence<int_amdgcn_dead>;
+
 class AlwaysUniform<Intrinsic intr> {
   Intrinsic Intr = intr;
 }

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 6 additions & 0 deletions

@@ -4276,3 +4276,9 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
   let hasSideEffects = 1;
   let SubtargetPredicate = isGFX10Plus;
 }
+
+// FIXME: Would be nice if we could set the register class for the destination
+// register too.
+def IMP_DEF_FROM_INTRINSIC: Pat<
+  (i32 (int_amdgcn_dead)), (IMPLICIT_DEF)>;
+

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 8 additions & 1 deletion

@@ -520,7 +520,12 @@ define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i
   ret void
 }
 
-
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.dead.i32()
+define amdgpu_cs_chain void @dead(ptr addrspace(1) %out) {
+  %v = call i32 @llvm.amdgcn.dead.i32()
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
 
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
@@ -558,5 +563,7 @@ declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
 declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
 declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1))
 
+declare i32 @llvm.amdgcn.dead.i32()
+
 attributes #0 = { nounwind convergent }
 attributes #1 = { nounwind readnone convergent }
New file — Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-DAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-GISEL %s
+
+; Test that we can use v0 for temporaries in the if.then block.
+define i32 @dead(i1 %cond, i32 %x, ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) #0 {
+; ASM-DAG-LABEL: dead:
+; ASM-DAG: ; %bb.0: ; %entry
+; ASM-DAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; ASM-DAG-NEXT: s_wait_expcnt 0x0
+; ASM-DAG-NEXT: s_wait_samplecnt 0x0
+; ASM-DAG-NEXT: s_wait_bvhcnt 0x0
+; ASM-DAG-NEXT: s_wait_kmcnt 0x0
+; ASM-DAG-NEXT: v_mov_b32_e32 v4, v0
+; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1
+; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo
+; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; ASM-DAG-NEXT: v_and_b32_e32 v1, 1, v4
+; ASM-DAG-NEXT: v_cmpx_eq_u32_e32 1, v1
+; ASM-DAG-NEXT: s_cbranch_execz .LBB0_2
+; ASM-DAG-NEXT: ; %bb.1: ; %if.then
+; ASM-DAG-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; ASM-DAG-NEXT: global_store_b32 v[2:3], v0, off
+; ASM-DAG-NEXT: ; implicit-def: $vgpr0
+; ASM-DAG-NEXT: .LBB0_2: ; %if.end
+; ASM-DAG-NEXT: s_wait_alu 0xfffe
+; ASM-DAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; ASM-DAG-NEXT: s_setpc_b64 s[30:31]
+;
+; ASM-GISEL-LABEL: dead:
+; ASM-GISEL: ; %bb.0: ; %entry
+; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; ASM-GISEL-NEXT: s_wait_expcnt 0x0
+; ASM-GISEL-NEXT: s_wait_samplecnt 0x0
+; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0
+; ASM-GISEL-NEXT: s_wait_kmcnt 0x0
+; ASM-GISEL-NEXT: v_mov_b32_e32 v4, v0
+; ASM-GISEL-NEXT: v_mov_b32_e32 v0, v1
+; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; ASM-GISEL-NEXT: v_and_b32_e32 v1, 1, v4
+; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v1
+; ASM-GISEL-NEXT: s_cbranch_execz .LBB0_2
+; ASM-GISEL-NEXT: ; %bb.1: ; %if.then
+; ASM-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; ASM-GISEL-NEXT: global_store_b32 v[2:3], v0, off
+; ASM-GISEL-NEXT: ; implicit-def: $vgpr0
+; ASM-GISEL-NEXT: .LBB0_2: ; %if.end
+; ASM-GISEL-NEXT: s_wait_alu 0xfffe
+; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; ASM-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %dead = call i32 @llvm.amdgcn.dead.i32()
+  br i1 %cond, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+  %temp = add i32 %x, 1
+  store i32 %temp, ptr addrspace(1) %ptr1
+  br label %if.end
+
+if.end:
+  %res = phi i32 [ %x, %entry ], [ %dead, %if.then ]
+  ret i32 %res
+}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll

Lines changed: 137 additions & 0 deletions

@@ -1115,4 +1115,141 @@ tail:
   unreachable
 }
 
+; Since functions that contain amdgcn.init.whole.wave do not preserve the inactive
+; lanes of any VGPRs, the middle end will explicitly preserve them if needed by adding
+; dummy VGPR arguments. Since only the inactive lanes are important, we need to make
+; it clear to the backend that it's safe to allocate v9's active lanes inside
+; shader. This is achieved by using the llvm.amdgcn.dead intrinsic.
+define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, i32 %active.vgpr, i32 %inactive.vgpr) {
+; GISEL12-LABEL: with_inactive_vgprs:
+; GISEL12: ; %bb.0: ; %entry
+; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT: s_wait_expcnt 0x0
+; GISEL12-NEXT: s_wait_samplecnt 0x0
+; GISEL12-NEXT: s_wait_bvhcnt 0x0
+; GISEL12-NEXT: s_wait_kmcnt 0x0
+; GISEL12-NEXT: s_or_saveexec_b32 s6, -1
+; GISEL12-NEXT: s_mov_b32 s4, s0
+; GISEL12-NEXT: s_mov_b32 s5, s1
+; GISEL12-NEXT: s_mov_b32 s0, s3
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_and_saveexec_b32 s1, s6
+; GISEL12-NEXT: s_cbranch_execz .LBB6_2
+; GISEL12-NEXT: ; %bb.1: ; %shader
+; GISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; GISEL12-NEXT: flat_load_b32 v11, v[9:10]
+; GISEL12-NEXT: ;;#ASMSTART
+; GISEL12-NEXT: ; use v0-7
+; GISEL12-NEXT: ;;#ASMEND
+; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GISEL12-NEXT: flat_store_b32 v[9:10], v11
+; GISEL12-NEXT: ; implicit-def: $vgpr9
+; GISEL12-NEXT: .LBB6_2: ; %tail.block
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL12-NEXT: s_mov_b32 exec_lo, s2
+; GISEL12-NEXT: s_setpc_b64 s[4:5]
+;
+; DAGISEL12-LABEL: with_inactive_vgprs:
+; DAGISEL12: ; %bb.0: ; %entry
+; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT: s_wait_expcnt 0x0
+; DAGISEL12-NEXT: s_wait_samplecnt 0x0
+; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL12-NEXT: s_wait_kmcnt 0x0
+; DAGISEL12-NEXT: s_or_saveexec_b32 s6, -1
+; DAGISEL12-NEXT: s_mov_b32 s5, s1
+; DAGISEL12-NEXT: s_mov_b32 s4, s0
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_and_saveexec_b32 s0, s6
+; DAGISEL12-NEXT: s_cbranch_execz .LBB6_2
+; DAGISEL12-NEXT: ; %bb.1: ; %shader
+; DAGISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; DAGISEL12-NEXT: flat_load_b32 v11, v[9:10]
+; DAGISEL12-NEXT: ;;#ASMSTART
+; DAGISEL12-NEXT: ; use v0-7
+; DAGISEL12-NEXT: ;;#ASMEND
+; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; DAGISEL12-NEXT: flat_store_b32 v[9:10], v11
+; DAGISEL12-NEXT: ; implicit-def: $vgpr9
+; DAGISEL12-NEXT: .LBB6_2: ; %tail.block
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; DAGISEL12-NEXT: s_mov_b32 s0, s3
+; DAGISEL12-NEXT: s_mov_b32 exec_lo, s2
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_setpc_b64 s[4:5]
+;
+; GISEL10-LABEL: with_inactive_vgprs:
+; GISEL10: ; %bb.0: ; %entry
+; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: s_or_saveexec_b32 s6, -1
+; GISEL10-NEXT: s_mov_b32 s4, s0
+; GISEL10-NEXT: s_mov_b32 s5, s1
+; GISEL10-NEXT: s_mov_b32 s0, s3
+; GISEL10-NEXT: s_and_saveexec_b32 s1, s6
+; GISEL10-NEXT: s_cbranch_execz .LBB6_2
+; GISEL10-NEXT: ; %bb.1: ; %shader
+; GISEL10-NEXT: v_mov_b32_e32 v10, s5
+; GISEL10-NEXT: v_mov_b32_e32 v9, s4
+; GISEL10-NEXT: flat_load_dword v11, v[9:10]
+; GISEL10-NEXT: ;;#ASMSTART
+; GISEL10-NEXT: ; use v0-7
+; GISEL10-NEXT: ;;#ASMEND
+; GISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GISEL10-NEXT: flat_store_dword v[9:10], v11
+; GISEL10-NEXT: ; implicit-def: $vgpr9
+; GISEL10-NEXT: .LBB6_2: ; %tail.block
+; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL10-NEXT: s_mov_b32 exec_lo, s2
+; GISEL10-NEXT: s_setpc_b64 s[4:5]
+;
+; DAGISEL10-LABEL: with_inactive_vgprs:
+; DAGISEL10: ; %bb.0: ; %entry
+; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT: s_or_saveexec_b32 s6, -1
+; DAGISEL10-NEXT: s_mov_b32 s5, s1
+; DAGISEL10-NEXT: s_mov_b32 s4, s0
+; DAGISEL10-NEXT: s_and_saveexec_b32 s0, s6
+; DAGISEL10-NEXT: s_cbranch_execz .LBB6_2
+; DAGISEL10-NEXT: ; %bb.1: ; %shader
+; DAGISEL10-NEXT: v_mov_b32_e32 v10, s5
+; DAGISEL10-NEXT: v_mov_b32_e32 v9, s4
+; DAGISEL10-NEXT: flat_load_dword v11, v[9:10]
+; DAGISEL10-NEXT: ;;#ASMSTART
+; DAGISEL10-NEXT: ; use v0-7
+; DAGISEL10-NEXT: ;;#ASMEND
+; DAGISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; DAGISEL10-NEXT: flat_store_dword v[9:10], v11
+; DAGISEL10-NEXT: ; implicit-def: $vgpr9
+; DAGISEL10-NEXT: .LBB6_2: ; %tail.block
+; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; DAGISEL10-NEXT: s_mov_b32 s0, s3
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s2
+; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
+entry:
+  %imp.def = call i32 @llvm.amdgcn.dead()
+  %initial.exec = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %initial.exec, label %shader, label %tail.block
+
+shader: ; preds = %entry
+  %use.another.vgpr = load i32, ptr %callee ; smth that won't be moved past the inline asm
+  call void asm sideeffect "; use v0-7", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"()
+  store i32 %use.another.vgpr, ptr %callee
+  %active.vgpr.new = add i32 %active.vgpr, %use.another.vgpr
+  br label %tail.block
+
+tail.block: ; preds = %.exit27, %.exit49, %244, %243, %entry
+  %active.vgpr.arg = phi i32 [ %active.vgpr, %entry ], [ %active.vgpr.new, %shader ]
+  %inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ %imp.def, %shader ]
+  %vgprs.0 = insertvalue { i32, i32 } poison, i32 %active.vgpr.arg, 0
+  %vgprs = insertvalue { i32, i32 } %vgprs.0, i32 %inactive.vgpr.arg, 1
+  call void (ptr, i32, i32, { i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.sl_i32i32(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, { i32, i32} %vgprs, i32 0)
+  unreachable
+}
+
 declare amdgpu_gfx <16 x i32> @write_v0_v15(<16 x i32>)
