Skip to content

Commit 5157f98

Browse files
committed
[AMDGPU] Enable divergence-driven XNOR selection
Currently the not (xor_one_use) pattern is always selected to S_XNOR irrespective of the node divergence. This relies on a further custom selection pass which converts to VALU if necessary and replaces with V_NOT_B32 (V_XOR_B32) on those targets which have no V_XNOR. The current change enables the patterns which explicitly select the not (xor_one_use) to the appropriate form. We assume that xor (not) is already turned into the not (xor) by the combiner. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D116270
1 parent bfd5696 commit 5157f98

File tree

12 files changed

+149
-28
lines changed

12 files changed

+149
-28
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3292,6 +3292,17 @@ class TargetLowering : public TargetLoweringBase {
32923292
return false;
32933293
}
32943294

3295+
// Lets the target control the following reassociation of operands: (op (op x,
3296+
// c1), y) -> (op (op x, y), c1) where N0 is (op x, c1) and N1 is y. By
3297+
// default consider profitable any case where N0 has single use. This
3298+
// behavior reflects the condition replaced by this target hook call in the
3299+
// DAGCombiner. Any particular target can implement its own heuristic to
3300+
// restrict common combiner.
3301+
virtual bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
3302+
SDValue N1) const {
3303+
return N0.hasOneUse();
3304+
}
3305+
32953306
virtual bool isSDNodeAlwaysUniform(const SDNode * N) const {
32963307
return false;
32973308
}

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1070,7 +1070,7 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
10701070
return DAG.getNode(Opc, DL, VT, N00, OpNode);
10711071
return SDValue();
10721072
}
1073-
if (N0.hasOneUse()) {
1073+
if (TLI.isReassocProfitable(DAG, N0, N1)) {
10741074
// Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
10751075
// iff (op x, c1) has one use
10761076
if (SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1))

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9639,6 +9639,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
96399639

96409640
SDValue SITargetLowering::performXorCombine(SDNode *N,
96419641
DAGCombinerInfo &DCI) const {
9642+
if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
9643+
return RV;
9644+
96429645
EVT VT = N->getValueType(0);
96439646
if (VT != MVT::i64)
96449647
return SDValue();
@@ -10551,6 +10554,9 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
1055110554
if (VT != MVT::i32 && VT != MVT::i64)
1055210555
return SDValue();
1055310556

10557+
if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
10558+
return SDValue();
10559+
1055410560
unsigned Opc = N->getOpcode();
1055510561
SDValue Op0 = N->getOperand(0);
1055610562
SDValue Op1 = N->getOperand(1);
@@ -10572,12 +10578,6 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
1057210578
if (Op1->isDivergent())
1057310579
std::swap(Op1, Op2);
1057410580

10575-
// If either operand is constant this will conflict with
10576-
// DAGCombiner::ReassociateOps().
10577-
if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
10578-
DAG.isConstantIntBuildVectorOrConstantInt(Op1))
10579-
return SDValue();
10580-
1058110581
SDLoc SL(N);
1058210582
SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
1058310583
return DAG.getNode(Opc, SL, VT, Add1, Op2);
@@ -12578,3 +12578,27 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
1257812578
Cost.first += (Size + 255) / 256;
1257912579
return Cost;
1258012580
}
12581+
12582+
bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
12583+
SDNode::use_iterator I = N->use_begin(), E = N->use_end();
12584+
for (; I != E; ++I) {
12585+
if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
12586+
if (getBasePtrIndex(M) == I.getOperandNo())
12587+
return true;
12588+
}
12589+
}
12590+
return false;
12591+
}
12592+
12593+
bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
12594+
SDValue N1) const {
12595+
if (!N0.hasOneUse())
12596+
return false;
12597+
// Take care of the opportunity to keep N0 uniform
12598+
if (N0->isDivergent() || !N1->isDivergent())
12599+
return true;
12600+
// Check if we have a good chance to form the memory access pattern with the
12601+
// base and offset
12602+
return (DAG.isBaseWithConstantOffset(N0) &&
12603+
hasMemSDNodeUser(*N0->use_begin()));
12604+
}

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
449449
bool isSDNodeSourceOfDivergence(const SDNode *N,
450450
FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;
451451

452+
bool hasMemSDNodeUser(SDNode *N) const;
453+
454+
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
455+
SDValue N1) const override;
456+
452457
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
453458
unsigned MaxDepth = 5) const;
454459
bool isCanonicalized(Register Reg, MachineFunction &MF,

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -550,11 +550,11 @@ def S_XOR_B64 : SOP2_64 <"s_xor_b64",
550550
>;
551551

552552
def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
553-
[(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))]
553+
[(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))]
554554
>;
555555

556556
def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
557-
[(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
557+
[(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))]
558558
>;
559559

560560
def S_NAND_B32 : SOP2_32 <"s_nand_b32",

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -637,9 +637,9 @@ class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
637637
)
638638
>;
639639

640-
def : divergent_i64_BinOp <and, V_AND_B32_e32>;
641-
def : divergent_i64_BinOp <or, V_OR_B32_e32>;
642-
def : divergent_i64_BinOp <xor, V_XOR_B32_e32>;
640+
def : divergent_i64_BinOp <and, V_AND_B32_e64>;
641+
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
642+
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
643643

644644
let SubtargetPredicate = Has16BitInsts in {
645645

@@ -688,6 +688,36 @@ let SubtargetPredicate = HasDLInsts in {
688688
let isReMaterializable = 1 in
689689
defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>;
690690

691+
def : GCNPat<
692+
(i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))),
693+
(i32 (V_XNOR_B32_e64 $src0, $src1))
694+
>;
695+
696+
def : GCNPat<
697+
(i32 (DivergentBinFrag<xor_oneuse> (not i32:$src0), i32:$src1)),
698+
(i32 (V_XNOR_B32_e64 $src0, $src1))
699+
>;
700+
701+
def : GCNPat<
702+
(i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))),
703+
(REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
704+
(i32 (EXTRACT_SUBREG $src0, sub0)),
705+
(i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
706+
(i32 (V_XNOR_B32_e64
707+
(i32 (EXTRACT_SUBREG $src0, sub1)),
708+
(i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
709+
>;
710+
711+
def : GCNPat<
712+
(i64 (DivergentBinFrag<xor_oneuse> (not i64:$src0), i64:$src1)),
713+
(REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
714+
(i32 (EXTRACT_SUBREG $src0, sub0)),
715+
(i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
716+
(i32 (V_XNOR_B32_e64
717+
(i32 (EXTRACT_SUBREG $src0, sub1)),
718+
(i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
719+
>;
720+
691721
let Constraints = "$vdst = $src2",
692722
DisableEncoding = "$src2",
693723
isConvertibleToThreeAddress = 1,
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
2+
; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN_DL %s
3+
4+
; GCN-LABEL: name: uniform_xnor_i64
5+
; GCN: S_XNOR_B64
6+
define amdgpu_kernel void @uniform_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
7+
%xor = xor i64 %a, %b
8+
%res = xor i64 %xor, -1
9+
store i64 %res, i64 addrspace(1)* %out
10+
ret void
11+
}
12+
; GCN-LABEL: name: divergent_xnor_i64
13+
; GCN: V_XOR_B32_e64
14+
; GCN: V_XOR_B32_e64
15+
; GCN: V_NOT_B32_e32
16+
; GCN: V_NOT_B32_e32
17+
; GCN_DL: V_XNOR_B32_e64
18+
; GCN_DL: V_XNOR_B32_e64
19+
define i64 @divergent_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
20+
%xor = xor i64 %a, %b
21+
%res = xor i64 %xor, -1
22+
ret i64 %res
23+
}
24+
25+
; GCN-LABEL: name: uniform_xnor_i32
26+
; GCN: S_XNOR_B32
27+
define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
28+
%xor = xor i32 %a, %b
29+
%res = xor i32 %xor, -1
30+
store i32 %res, i32 addrspace(1)* %out
31+
ret void
32+
}
33+
34+
; GCN-LABEL: name: divergent_xnor_i32
35+
; GCN: V_XOR_B32_e64
36+
; GCN: V_NOT_B32_e32
37+
; GCN_DL: V_XNOR_B32_e64
38+
define i32 @divergent_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
39+
%xor = xor i32 %a, %b
40+
%res = xor i32 %xor, -1
41+
ret i32 %res
42+
}
43+
44+
declare i32 @llvm.amdgcn.workitem.id.x() #0

llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,8 @@ define amdgpu_kernel void @divergent_xor3_b64(<3 x i64> addrspace(1)* %arg) {
163163
; GCN-NEXT: v_xor_b32_e32 v1, v3, v1
164164
; GCN-NEXT: v_xor_b32_e32 v0, v2, v0
165165
; GCN-NEXT: s_waitcnt vmcnt(0)
166-
; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4
167166
; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5
167+
; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4
168168
; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
169169
; GCN-NEXT: s_endpgm
170170
bb:

llvm/test/CodeGen/AMDGPU/permute.ll

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,11 @@ bb:
106106
}
107107

108108
; GCN-LABEL: {{^}}and_or_or_and:
109-
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
110-
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
109+
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff00
110+
; GCN: s_or_b32 [[SREG:s[0-9]+]], s{{[0-9]+}}, 0xffff0000
111+
; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 0xff00ff, v{{[0-9]+}}
112+
; GCN: v_or_b32_e32 v{{[0-9]+}}, [[SREG]], [[VREG]]
113+
; FIXME here should have been "v_perm_b32" with 0xffff0500 mask.
111114
define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
112115
bb:
113116
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -153,10 +156,14 @@ bb:
153156
}
154157

155158
; GCN-LABEL: {{^}}known_ffff0500:
156-
; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
157-
; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004
158-
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
159+
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004
160+
; GCN: s_and_b32 [[SREG:s[0-9]+]], [[SREG]], 0xff00
161+
; GCN: s_or_b32 [[SREG]], [[SREG]], 0xffff0000
162+
; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 0xff00ff, [[VREG]]
163+
; GCN: v_or_b32_e32 [[VREG]], [[SREG]], [[VREG]]
164+
; GCN: store_dword v[{{[0-9:]+}}], [[VREG]]{{$}}
159165
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
166+
; FIXME here should have been "v_perm_b32" with 0xffff0500 mask.
160167
define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
161168
bb:
162169
%id = tail call i32 @llvm.amdgcn.workitem.id.x()

llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -472,10 +472,10 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
472472
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
473473
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
474474
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
475-
; GFX9-O0-NEXT: v_or_b32_e32 v0, v0, v3
475+
; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v3
476476
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
477477
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
478-
; GFX9-O0-NEXT: v_or_b32_e32 v6, v1, v2
478+
; GFX9-O0-NEXT: v_or_b32_e64 v6, v1, v2
479479
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
480480
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
481481
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6

0 commit comments

Comments
 (0)