Skip to content

Commit 2661513

Browse files
committed
[DAG] Fold (setcc ((x | x >> c0 | ...) & mask)) sequences
Fold sequences where we extract a bunch of contiguous bits from a value, merge them into the low bits, and then check whether those low bits are zero or not. It seems like a strange sequence at first, but it's an idiom used by the device libs to check workitem IDs for AMDGPU. I put this in DAGCombiner instead of the target combiner because it is a generic, valid transform that's also fairly niche, so I think there isn't much risk of a combine loop. See #136727
1 parent 2b9f0b5 commit 2661513

File tree

2 files changed

+91
-29
lines changed

2 files changed

+91
-29
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28912,13 +28912,97 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
2891228912
return SDValue();
2891328913
}
2891428914

28915+
/// Try to rewrite the operand of an "is zero / is not zero" comparison.
///
/// Matches a pattern such as:
///   (X | (X >> C0) | (X >> C1) | ...) & Mask
/// which ORs several bit-fields of X together and keeps the low bits selected
/// by Mask. The masked value is zero exactly when every extracted field of X
/// is zero, so the caller can test
///   X & (Mask | (Mask << C0) | (Mask << C1) | ...)
/// instead, eliminating the shifts and ORs.
///
/// NOTE: the returned node is NOT equal to \p Root in general; it is only
/// zero/non-zero under the same conditions. It must therefore only be used as
/// the operand of a SETEQ/SETNE comparison against zero.
///
/// \param Root The candidate AND node.
/// \param DAG  The DAG used to create the replacement node.
/// \param TLI  Currently unused; kept so the signature stays stable.
/// \returns The replacement AND node, or an empty SDValue if \p Root does not
///          match the pattern.
static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
                              const TargetLowering &TLI) {
  EVT VT = Root.getValueType();

  // Only scalar integers are handled; the mask arithmetic below works on a
  // single APInt of the value's width.
  if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND)
    return SDValue();

  SDValue N0 = Root.getOperand(0);
  SDValue N1 = Root.getOperand(1);

  if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1))
    return SDValue();

  // Restrict the match to low-bit masks (0...01...1).
  const APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal();
  if (!RootMask.isMask())
    return SDValue();

  const unsigned BitWidth = RootMask.getBitWidth();

  // The single value X that every leaf of the OR tree must be derived from.
  // The first candidate encountered becomes the source; any later candidate
  // must be the same value.
  SDValue Src;
  const auto IsSrc = [&Src](SDValue V) {
    if (!Src) {
      Src = V;
      return true;
    }
    return Src == V;
  };

  // Union of the bits of X that end up under RootMask after each shift.
  APInt PartsMask(BitWidth, 0);

  SmallVector<SDValue> Worklist = {N0};
  while (!Worklist.empty()) {
    SDValue V = Worklist.pop_back_val();

    if (V.getOpcode() == ISD::OR) {
      // Intermediate nodes must have a single use, otherwise they stay alive
      // after the fold and nothing is saved.
      if (!V.hasOneUse())
        return SDValue();

      Worklist.push_back(V.getOperand(0));
      Worklist.push_back(V.getOperand(1));
      continue;
    }

    if (V.getOpcode() == ISD::SRL) {
      if (!V.hasOneUse())
        return SDValue();

      SDValue ShiftSrc = V.getOperand(0);
      SDValue ShiftAmt = V.getOperand(1);

      if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt))
        return SDValue();

      // A shift by at least the bit width has no defined result, and an
      // oversized amount would also trip the assertion in APInt's shift
      // operator. Compare as APInt so arbitrarily wide constants are safe.
      const APInt &ShiftAmtVal =
          cast<ConstantSDNode>(ShiftAmt)->getAPIntValue();
      if (ShiftAmtVal.uge(BitWidth))
        return SDValue();

      PartsMask |= RootMask << static_cast<unsigned>(ShiftAmtVal.getZExtValue());
      continue;
    }

    // Any other node must be the unshifted source itself. The source is not
    // removed by the fold, so it may have any number of uses; accepting it
    // here regardless of use count keeps the match independent of the order
    // in which the OR operands are visited.
    if (!IsSrc(V))
      return SDValue();

    PartsMask |= RootMask;
  }

  if (!Src)
    return SDValue();

  SDLoc DL(Root);
  return DAG.getNode(ISD::AND, DL, VT,
                     {Src, DAG.getConstant(PartsMask, DL, VT)});
}
28987+
2891528988
/// This is a stub for TargetLowering::SimplifySetCC.
2891628989
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
2891728990
ISD::CondCode Cond, const SDLoc &DL,
2891828991
bool foldBooleans) {
2891928992
TargetLowering::DAGCombinerInfo
2892028993
DagCombineInfo(DAG, Level, false, this);
28921-
return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
28994+
if (SDValue C =
28995+
TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL))
28996+
return C;
28997+
28998+
if ((Cond == ISD::SETNE || Cond == ISD::SETEQ) &&
28999+
N0.getOpcode() == ISD::AND && isNullConstant(N1)) {
29000+
29001+
if (SDValue Res = matchMergedBFX(N0, DAG, TLI))
29002+
return DAG.getSetCC(DL, VT, Res, N1, Cond);
29003+
}
29004+
29005+
return SDValue();
2892229006
}
2892329007

2892429008
/// Given an ISD::SDIV node expressing a divide by constant, return

llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,15 @@ define i1 @workitem_zero() {
1212
; DAGISEL-GFX8-LABEL: workitem_zero:
1313
; DAGISEL-GFX8: ; %bb.0: ; %entry
1414
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31
16-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31
17-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1
18-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0
19-
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0
15+
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
2016
; DAGISEL-GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2117
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2218
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
2319
;
2420
; DAGISEL-GFX942-LABEL: workitem_zero:
2521
; DAGISEL-GFX942: ; %bb.0: ; %entry
2622
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
28-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
29-
; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
30-
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
23+
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
3124
; DAGISEL-GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3225
; DAGISEL-GFX942-NEXT: s_nop 1
3326
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -40,11 +33,7 @@ define i1 @workitem_zero() {
4033
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
4134
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
4235
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
43-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
44-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
45-
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
46-
; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
47-
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
36+
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
4837
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
4938
; DAGISEL-GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
5039
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd
@@ -106,22 +95,15 @@ define i1 @workitem_nonzero() {
10695
; DAGISEL-GFX8-LABEL: workitem_nonzero:
10796
; DAGISEL-GFX8: ; %bb.0: ; %entry
10897
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31
110-
; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31
111-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1
112-
; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0
113-
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0
98+
; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
11499
; DAGISEL-GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
115100
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
116101
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
117102
;
118103
; DAGISEL-GFX942-LABEL: workitem_nonzero:
119104
; DAGISEL-GFX942: ; %bb.0: ; %entry
120105
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
122-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
123-
; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
124-
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
106+
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
125107
; DAGISEL-GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
126108
; DAGISEL-GFX942-NEXT: s_nop 1
127109
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -134,11 +116,7 @@ define i1 @workitem_nonzero() {
134116
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
135117
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
136118
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
137-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
138-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
139-
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
140-
; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
141-
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
119+
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
142120
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
143121
; DAGISEL-GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
144122
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd

0 commit comments

Comments
 (0)