Skip to content

Commit 5157f98

Browse files
committed
[AMDGPU] Enable divergence-driven XNOR selection
Currently the not (xor_one_use) pattern is always selected to S_XNOR irrespective of the node divergence. This relies on a further custom selection pass which converts to VALU if necessary and replaces with V_NOT_B32 (V_XOR_B32) on those targets which have no V_XNOR. The current change enables the patterns which explicitly select the not (xor_one_use) to the appropriate form. We assume that xor (not) is already turned into the not (xor) by the combiner. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D116270
1 parent bfd5696 commit 5157f98

File tree

12 files changed

+149
-28
lines changed

12 files changed

+149
-28
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3292,6 +3292,17 @@ class TargetLowering : public TargetLoweringBase {
32923292
return false;
32933293
}
32943294

3295+
// Lets the target control the following reassociation of operands: (op (op x,
3296+
// c1), y) -> (op (op x, y), c1) where N0 is (op x, c1) and N1 is y. By
3297+
// default consider profitable any case where N0 has single use. This
3298+
// behavior reflects the condition replaced by this target hook call in the
3299+
// DAGCombiner. Any particular target can implement its own heuristic to
3300+
// restrict common combiner.
3301+
virtual bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
3302+
SDValue N1) const {
3303+
return N0.hasOneUse();
3304+
}
3305+
32953306
virtual bool isSDNodeAlwaysUniform(const SDNode * N) const {
32963307
return false;
32973308
}

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1070,7 +1070,7 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
10701070
return DAG.getNode(Opc, DL, VT, N00, OpNode);
10711071
return SDValue();
10721072
}
1073-
if (N0.hasOneUse()) {
1073+
if (TLI.isReassocProfitable(DAG, N0, N1)) {
10741074
// Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
10751075
// iff (op x, c1) has one use
10761076
if (SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1))

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9639,6 +9639,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
96399639

96409640
SDValue SITargetLowering::performXorCombine(SDNode *N,
96419641
DAGCombinerInfo &DCI) const {
9642+
if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
9643+
return RV;
9644+
96429645
EVT VT = N->getValueType(0);
96439646
if (VT != MVT::i64)
96449647
return SDValue();
@@ -10551,6 +10554,9 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
1055110554
if (VT != MVT::i32 && VT != MVT::i64)
1055210555
return SDValue();
1055310556

10557+
if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
10558+
return SDValue();
10559+
1055410560
unsigned Opc = N->getOpcode();
1055510561
SDValue Op0 = N->getOperand(0);
1055610562
SDValue Op1 = N->getOperand(1);
@@ -10572,12 +10578,6 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
1057210578
if (Op1->isDivergent())
1057310579
std::swap(Op1, Op2);
1057410580

10575-
// If either operand is constant this will conflict with
10576-
// DAGCombiner::ReassociateOps().
10577-
if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
10578-
DAG.isConstantIntBuildVectorOrConstantInt(Op1))
10579-
return SDValue();
10580-
1058110581
SDLoc SL(N);
1058210582
SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
1058310583
return DAG.getNode(Opc, SL, VT, Add1, Op2);
@@ -12578,3 +12578,27 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
1257812578
Cost.first += (Size + 255) / 256;
1257912579
return Cost;
1258012580
}
12581+
12582+
bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
12583+
SDNode::use_iterator I = N->use_begin(), E = N->use_end();
12584+
for (; I != E; ++I) {
12585+
if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
12586+
if (getBasePtrIndex(M) == I.getOperandNo())
12587+
return true;
12588+
}
12589+
}
12590+
return false;
12591+
}
12592+
12593+
bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
12594+
SDValue N1) const {
12595+
if (!N0.hasOneUse())
12596+
return false;
12597+
// Take care of the opportunity to keep N0 uniform
12598+
if (N0->isDivergent() || !N1->isDivergent())
12599+
return true;
12600+
// Check if we have a good chance to form the memory access pattern with the
12601+
// base and offset
12602+
return (DAG.isBaseWithConstantOffset(N0) &&
12603+
hasMemSDNodeUser(*N0->use_begin()));
12604+
}

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
449449
bool isSDNodeSourceOfDivergence(const SDNode *N,
450450
FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;
451451

452+
bool hasMemSDNodeUser(SDNode *N) const;
453+
454+
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
455+
SDValue N1) const override;
456+
452457
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
453458
unsigned MaxDepth = 5) const;
454459
bool isCanonicalized(Register Reg, MachineFunction &MF,

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -550,11 +550,11 @@ def S_XOR_B64 : SOP2_64 <"s_xor_b64",
550550
>;
551551

552552
def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
553-
[(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))]
553+
[(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))]
554554
>;
555555

556556
def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
557-
[(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
557+
[(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))]
558558
>;
559559

560560
def S_NAND_B32 : SOP2_32 <"s_nand_b32",

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -637,9 +637,9 @@ class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
637637
)
638638
>;
639639

640-
def : divergent_i64_BinOp <and, V_AND_B32_e32>;
641-
def : divergent_i64_BinOp <or, V_OR_B32_e32>;
642-
def : divergent_i64_BinOp <xor, V_XOR_B32_e32>;
640+
def : divergent_i64_BinOp <and, V_AND_B32_e64>;
641+
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
642+
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
643643

644644
let SubtargetPredicate = Has16BitInsts in {
645645

@@ -688,6 +688,36 @@ let SubtargetPredicate = HasDLInsts in {
688688
let isReMaterializable = 1 in
689689
defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>;
690690

691+
def : GCNPat<
692+
(i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))),
693+
(i32 (V_XNOR_B32_e64 $src0, $src1))
694+
>;
695+
696+
def : GCNPat<
697+
(i32 (DivergentBinFrag<xor_oneuse> (not i32:$src0), i32:$src1)),
698+
(i32 (V_XNOR_B32_e64 $src0, $src1))
699+
>;
700+
701+
def : GCNPat<
702+
(i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))),
703+
(REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
704+
(i32 (EXTRACT_SUBREG $src0, sub0)),
705+
(i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
706+
(i32 (V_XNOR_B32_e64
707+
(i32 (EXTRACT_SUBREG $src0, sub1)),
708+
(i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
709+
>;
710+
711+
def : GCNPat<
712+
(i64 (DivergentBinFrag<xor_oneuse> (not i64:$src0), i64:$src1)),
713+
(REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
714+
(i32 (EXTRACT_SUBREG $src0, sub0)),
715+
(i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
716+
(i32 (V_XNOR_B32_e64
717+
(i32 (EXTRACT_SUBREG $src0, sub1)),
718+
(i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
719+
>;
720+
691721
let Constraints = "$vdst = $src2",
692722
DisableEncoding = "$src2",
693723
isConvertibleToThreeAddress = 1,
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
2+
; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN_DL %s
3+
4+
; GCN-LABEL: name: uniform_xnor_i64
5+
; GCN: S_XNOR_B64
6+
define amdgpu_kernel void @uniform_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
7+
%xor = xor i64 %a, %b
8+
%res = xor i64 %xor, -1
9+
store i64 %res, i64 addrspace(1)* %out
10+
ret void
11+
}
12+
; GCN-LABEL: name: divergent_xnor_i64
13+
; GCN: V_XOR_B32_e64
14+
; GCN: V_XOR_B32_e64
15+
; GCN: V_NOT_B32_e32
16+
; GCN: V_NOT_B32_e32
17+
; GCN_DL: V_XNOR_B32_e64
18+
; GCN_DL: V_XNOR_B32_e64
19+
define i64 @divergent_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
20+
%xor = xor i64 %a, %b
21+
%res = xor i64 %xor, -1
22+
ret i64 %res
23+
}
24+
25+
; GCN-LABEL: name: uniform_xnor_i32
26+
; GCN: S_XNOR_B32
27+
define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
28+
%xor = xor i32 %a, %b
29+
%res = xor i32 %xor, -1
30+
store i32 %res, i32 addrspace(1)* %out
31+
ret void
32+
}
33+
34+
; GCN-LABEL: name: divergent_xnor_i32
35+
; GCN: V_XOR_B32_e64
36+
; GCN: V_NOT_B32_e32
37+
; GCN_DL: V_XNOR_B32_e64
38+
define i32 @divergent_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
39+
%xor = xor i32 %a, %b
40+
%res = xor i32 %xor, -1
41+
ret i32 %res
42+
}
43+
44+
declare i32 @llvm.amdgcn.workitem.id.x() #0

llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,8 @@ define amdgpu_kernel void @divergent_xor3_b64(<3 x i64> addrspace(1)* %arg) {
163163
; GCN-NEXT: v_xor_b32_e32 v1, v3, v1
164164
; GCN-NEXT: v_xor_b32_e32 v0, v2, v0
165165
; GCN-NEXT: s_waitcnt vmcnt(0)
166-
; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4
167166
; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5
167+
; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4
168168
; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
169169
; GCN-NEXT: s_endpgm
170170
bb:

llvm/test/CodeGen/AMDGPU/permute.ll

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,11 @@ bb:
106106
}
107107

108108
; GCN-LABEL: {{^}}and_or_or_and:
109-
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
110-
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
109+
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff00
110+
; GCN: s_or_b32 [[SREG:s[0-9]+]], s{{[0-9]+}}, 0xffff0000
111+
; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 0xff00ff, v{{[0-9]+}}
112+
; GCN: v_or_b32_e32 v{{[0-9]+}}, [[SREG]], [[VREG]]
113+
; FIXME here should have been "v_perm_b32" with 0xffff0500 mask.
111114
define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
112115
bb:
113116
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -153,10 +156,14 @@ bb:
153156
}
154157

155158
; GCN-LABEL: {{^}}known_ffff0500:
156-
; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
157-
; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004
158-
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
159+
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004
160+
; GCN: s_and_b32 [[SREG:s[0-9]+]], [[SREG]], 0xff00
161+
; GCN: s_or_b32 [[SREG]], [[SREG]], 0xffff0000
162+
; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 0xff00ff, [[VREG]]
163+
; GCN: v_or_b32_e32 [[VREG]], [[SREG]], [[VREG]]
164+
; GCN: store_dword v[{{[0-9:]+}}], [[VREG]]{{$}}
159165
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
166+
; FIXME here should have been "v_perm_b32" with 0xffff0500 mask.
160167
define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
161168
bb:
162169
%id = tail call i32 @llvm.amdgcn.workitem.id.x()

llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -472,10 +472,10 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
472472
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
473473
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
474474
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
475-
; GFX9-O0-NEXT: v_or_b32_e32 v0, v0, v3
475+
; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v3
476476
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
477477
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
478-
; GFX9-O0-NEXT: v_or_b32_e32 v6, v1, v2
478+
; GFX9-O0-NEXT: v_or_b32_e64 v6, v1, v2
479479
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
480480
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
481481
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6

0 commit comments

Comments
 (0)