Skip to content

Commit 7ff0806

Browse files
committed
[DAG] Fold (setcc ((x | x >> c0 | ...) & mask)) sequences
Fold sequences where we extract a bunch of contiguous bits from a value, merge them into the low bits and then check if the low bits are zero or not. It seems like a strange sequence at first, but it's an idiom used by the AMDGPU device libs to check workitem IDs. I put this in DAGCombiner instead of the target combiner because it is a generic, valid transform that's also fairly niche, so I don't think there is much risk of a combine loop. See #136727
1 parent 3f62ab3 commit 7ff0806

File tree

2 files changed

+91
-29
lines changed

2 files changed

+91
-29
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28909,13 +28909,97 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
2890928909
return SDValue();
2891028910
}
2891128911

28912+
/// Try to match a "merged bit-field extract" rooted at \p Root:
///   (X | (X >> C0) | (X >> C1) | ...) & Mask
/// where Mask is a low-bit mask, and on success return (X & PartsMask), where
/// PartsMask is the union of Mask shifted left by every matched shift amount
/// (plus Mask itself when X appears unshifted).
///
/// NOTE: the returned node is not equal to \p Root in general; it is zero
/// exactly when \p Root is zero. It must therefore only be used as the operand
/// of an equality comparison against zero.
///
/// \returns the replacement AND node, or an empty SDValue if the pattern does
/// not match. \p TLI is currently unused.
static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
                              const TargetLowering &TLI) {
  EVT VT = Root.getValueType();

  // Only scalar integers are handled: the mask arithmetic below is performed
  // on a single APInt of the value's width.
  if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND)
    return SDValue();

  SDValue N0 = Root.getOperand(0);
  SDValue N1 = Root.getOperand(1);

  if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1))
    return SDValue();

  const APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal();
  if (!RootMask.isMask())
    return SDValue();

  const unsigned BitWidth = RootMask.getBitWidth();

  // The common value X. It is latched by the first leaf we encounter; every
  // later leaf must agree with it.
  SDValue Src;
  const auto IsSrc = [&](SDValue V) {
    if (!Src) {
      Src = V;
      return true;
    }

    return Src == V;
  };

  SmallVector<SDValue> Worklist = {N0};
  APInt PartsMask(BitWidth, 0);
  while (!Worklist.empty()) {
    SDValue V = Worklist.pop_back_val();

    // Intermediate ORs/shifts should have a single use so that they die after
    // the fold. The source itself is multi-use by construction, so it is
    // exempt. While Src is still unknown we cannot tell whether V is the
    // source, so do not reject it yet; otherwise the match would depend on
    // the order in which the OR operands happen to be visited.
    if (!V.hasOneUse() && Src && Src != V)
      return SDValue();

    if (V.getOpcode() == ISD::OR) {
      Worklist.push_back(V.getOperand(0));
      Worklist.push_back(V.getOperand(1));
      continue;
    }

    if (V.getOpcode() == ISD::SRL) {
      SDValue ShiftSrc = V.getOperand(0);
      SDValue ShiftAmt = V.getOperand(1);

      if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt))
        return SDValue();

      // An out-of-range shift has no defined result, and APInt's shift-left
      // asserts when the amount exceeds the bit width, so give up instead of
      // folding it into the mask.
      const APInt &ShiftAmtVal =
          cast<ConstantSDNode>(ShiftAmt)->getAPIntValue();
      if (ShiftAmtVal.uge(BitWidth))
        return SDValue();

      PartsMask |=
          RootMask << static_cast<unsigned>(ShiftAmtVal.getZExtValue());
      continue;
    }

    if (IsSrc(V)) {
      PartsMask |= RootMask;
      continue;
    }

    return SDValue();
  }

  if (!Src)
    return SDValue();

  SDLoc DL(Root);
  return DAG.getNode(ISD::AND, DL, VT,
                     {Src, DAG.getConstant(PartsMask, DL, VT)});
}
28984+
2891228985
/// This is a stub for TargetLowering::SimplifySetCC.
2891328986
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
2891428987
ISD::CondCode Cond, const SDLoc &DL,
2891528988
bool foldBooleans) {
2891628989
TargetLowering::DAGCombinerInfo
2891728990
DagCombineInfo(DAG, Level, false, this);
28918-
return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
28991+
if (SDValue C =
28992+
TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL))
28993+
return C;
28994+
28995+
if ((Cond == ISD::SETNE || Cond == ISD::SETEQ) &&
28996+
N0.getOpcode() == ISD::AND && isNullConstant(N1)) {
28997+
28998+
if (SDValue Res = matchMergedBFX(N0, DAG, TLI))
28999+
return DAG.getSetCC(DL, VT, Res, N1, Cond);
29000+
}
29001+
29002+
return SDValue();
2891929003
}
2892029004

2892129005
/// Given an ISD::SDIV node expressing a divide by constant, return

llvm/test/CodeGen/AMDGPU/workitems-intrinsics-opts.ll

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,15 @@ define i1 @workitem_zero() {
1212
; DAGISEL-GFX9-LABEL: workitem_zero:
1313
; DAGISEL-GFX9: ; %bb.0: ; %entry
1414
; DAGISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15-
; DAGISEL-GFX9-NEXT: v_lshrrev_b32_e32 v1, 10, v31
16-
; DAGISEL-GFX9-NEXT: v_lshrrev_b32_e32 v0, 20, v31
17-
; DAGISEL-GFX9-NEXT: v_or_b32_e32 v1, v31, v1
18-
; DAGISEL-GFX9-NEXT: v_or_b32_e32 v0, v1, v0
19-
; DAGISEL-GFX9-NEXT: v_and_b32_e32 v0, 0x3ff, v0
15+
; DAGISEL-GFX9-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
2016
; DAGISEL-GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2117
; DAGISEL-GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2218
; DAGISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
2319
;
2420
; DAGISEL-GFX942-LABEL: workitem_zero:
2521
; DAGISEL-GFX942: ; %bb.0: ; %entry
2622
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
28-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
29-
; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
30-
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
23+
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
3124
; DAGISEL-GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3225
; DAGISEL-GFX942-NEXT: s_nop 1
3326
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -40,11 +33,7 @@ define i1 @workitem_zero() {
4033
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
4134
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
4235
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
43-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
44-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
45-
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
46-
; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
47-
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
36+
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
4837
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
4938
; DAGISEL-GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
5039
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd
@@ -106,22 +95,15 @@ define i1 @workitem_nonzero() {
10695
; DAGISEL-GFX9-LABEL: workitem_nonzero:
10796
; DAGISEL-GFX9: ; %bb.0: ; %entry
10897
; DAGISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109-
; DAGISEL-GFX9-NEXT: v_lshrrev_b32_e32 v1, 10, v31
110-
; DAGISEL-GFX9-NEXT: v_lshrrev_b32_e32 v0, 20, v31
111-
; DAGISEL-GFX9-NEXT: v_or_b32_e32 v1, v31, v1
112-
; DAGISEL-GFX9-NEXT: v_or_b32_e32 v0, v1, v0
113-
; DAGISEL-GFX9-NEXT: v_and_b32_e32 v0, 0x3ff, v0
98+
; DAGISEL-GFX9-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
11499
; DAGISEL-GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
115100
; DAGISEL-GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
116101
; DAGISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
117102
;
118103
; DAGISEL-GFX942-LABEL: workitem_nonzero:
119104
; DAGISEL-GFX942: ; %bb.0: ; %entry
120105
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
122-
; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
123-
; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
124-
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
106+
; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
125107
; DAGISEL-GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
126108
; DAGISEL-GFX942-NEXT: s_nop 1
127109
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -134,11 +116,7 @@ define i1 @workitem_nonzero() {
134116
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
135117
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
136118
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
137-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
138-
; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
139-
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
140-
; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
141-
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
119+
; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
142120
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
143121
; DAGISEL-GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
144122
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd

0 commit comments

Comments
 (0)