Commit 0b53293

[NVPTX] lower VECREDUCE intrinsics to tree reduction
Also adds support for the sm_100+ fmax3/fmin3 instructions introduced in PTX 8.8.

This method of tree reduction has a few benefits over the default in DAGTypeLegalizer:
- Produces the optimal number of operations supported by the target. Instead of progressively splitting the vector operand top-down, first scalarize it and then build the tree bottom-up. This uses larger operations when available and leaves smaller ones for the remaining elements.
- Faster compile time. Happens in one pass over the intrinsic, rather than O(N) passes if iteratively splitting the vector operands.
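As a rough illustration of the bottom-up grouping described above, here is a standalone C++ sketch (not part of the patch; the input values and the {max3, max2} operator list are made up, with plain floats standing in for SDNodes). For 8 elements it emits 4 operations (max3, max3, max3, max2) instead of the 7 pairwise operations a top-down split would produce.

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Stand-in for an N-input max instruction (max3 / max2).
static float applyMax(const std::vector<float> &Args) {
  float R = Args[0];
  for (size_t I = 1; I < Args.size(); ++I)
    R = std::max(R, Args[I]);
  return R;
}

int main() {
  // The "scalarized" vector operand.
  std::vector<float> Level = {1, 7, 3, 9, 2, 8, 4, 6};
  // Operators tried largest-first: 3-input max, then 2-input max.
  std::vector<std::pair<const char *, size_t>> Ops = {{"max3", 3}, {"max2", 2}};

  for (size_t OpIdx = 0; Level.size() > 1 && OpIdx < Ops.size();) {
    const auto [Name, GroupSize] = Ops[OpIdx];
    std::vector<float> Reduced;
    size_t I = 0, E = Level.size();
    for (; I + GroupSize <= E; I += GroupSize) {
      // Reduce a full group with the largest usable operator.
      Reduced.push_back(applyMax(
          std::vector<float>(Level.begin() + I, Level.begin() + I + GroupSize)));
      std::printf("%s\n", Name);
    }
    if (I < E) {
      if (Reduced.empty()) {
        ++OpIdx; // too few operands left for this operator: fall back to a smaller one
        continue;
      }
      // Push the remainder up to the next level unchanged.
      Reduced.insert(Reduced.end(), Level.begin() + I, Level.end());
    }
    Level = std::move(Reduced);
  }
  std::printf("result = %g\n", Level.front()); // prints 9
  return 0;
}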
1 parent 9aff19e commit 0b53293

File tree: 6 files changed (+892, -2 lines)


llvm/lib/Target/NVPTX/NVPTX.td

Lines changed: 8 additions & 2 deletions
@@ -36,17 +36,19 @@ class FeaturePTX<int version>:
 
 foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
               60, 61, 62, 70, 72, 75, 80, 86, 87,
-              89, 90, 100, 101, 120] in
+              89, 90, 100, 101, 103, 120, 121] in
   def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
 
 def SM90a: FeatureSM<"90a", 901>;
 def SM100a: FeatureSM<"100a", 1001>;
 def SM101a: FeatureSM<"101a", 1011>;
+def SM103a: FeatureSM<"103a", 1031>;
 def SM120a: FeatureSM<"120a", 1201>;
+def SM121a: FeatureSM<"121a", 1211>;
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
                    70, 71, 72, 73, 74, 75, 76, 77, 78,
-                   80, 81, 82, 83, 84, 85, 86, 87] in
+                   80, 81, 82, 83, 84, 85, 86, 87, 88] in
   def PTX#version: FeaturePTX<version>;
 
 //===----------------------------------------------------------------------===//
@@ -81,8 +83,12 @@ def : Proc<"sm_100", [SM100, PTX86]>;
 def : Proc<"sm_100a", [SM100a, PTX86]>;
 def : Proc<"sm_101", [SM101, PTX86]>;
 def : Proc<"sm_101a", [SM101a, PTX86]>;
+def : Proc<"sm_103", [SM103, PTX88]>;
+def : Proc<"sm_103a", [SM103a, PTX88]>;
 def : Proc<"sm_120", [SM120, PTX87]>;
 def : Proc<"sm_120a", [SM120a, PTX87]>;
+def : Proc<"sm_121", [SM121, PTX88]>;
+def : Proc<"sm_121a", [SM121a, PTX88]>;
 
 def NVPTXInstrInfo : InstrInfo {
 }

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 128 additions & 0 deletions
@@ -85,6 +85,12 @@ static cl::opt<unsigned> FMAContractLevelOpt(
              " 1: do it 2: do it aggressively"),
     cl::init(2));
 
+static cl::opt<bool> DisableFOpTreeReduce(
+    "nvptx-disable-fop-tree-reduce", cl::Hidden,
+    cl::desc("NVPTX Specific: don't emit tree reduction for floating-point "
+             "reduction operations"),
+    cl::init(false));
+
 static cl::opt<int> UsePrecDivF32(
     "nvptx-prec-divf32", cl::Hidden,
     cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
@@ -831,6 +837,15 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   if (STI.allowFP16Math() || STI.hasBF16Math())
     setTargetDAGCombine(ISD::SETCC);
 
+  // Vector reduction operations. These are transformed into a tree evaluation
+  // of nodes which may or may not be legal.
+  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+    setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL,
+                        ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
+                        ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
+                       VT, Custom);
+  }
+
   // Promote fp16 arithmetic if fp16 hardware isn't available or the
   // user passed --nvptx-no-fp16-math. The flag is useful because,
   // although sm_53+ GPUs have some sort of FP16 support in
@@ -1082,6 +1097,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(NVPTXISD::BFI)
     MAKE_CASE(NVPTXISD::PRMT)
     MAKE_CASE(NVPTXISD::FCOPYSIGN)
+    MAKE_CASE(NVPTXISD::FMAXNUM3)
+    MAKE_CASE(NVPTXISD::FMINNUM3)
+    MAKE_CASE(NVPTXISD::FMAXIMUM3)
+    MAKE_CASE(NVPTXISD::FMINIMUM3)
     MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
     MAKE_CASE(NVPTXISD::STACKRESTORE)
     MAKE_CASE(NVPTXISD::STACKSAVE)
@@ -2136,6 +2155,108 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
 }
 
+/// A generic routine for constructing a tree reduction for a vector operand.
+/// This method differs from iterative splitting in DAGTypeLegalizer by
+/// first scalarizing the vector and then progressively grouping elements
+/// bottom-up. This allows easily building the optimal (minimum) number of nodes
+/// with different numbers of operands (eg. max3 vs max2).
+static SDValue BuildTreeReduction(
+    const SDValue &VectorOp,
+    ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
+    const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
+  EVT VectorTy = VectorOp.getValueType();
+  EVT EltTy = VectorTy.getVectorElementType();
+  const unsigned NumElts = VectorTy.getVectorNumElements();
+
+  // scalarize vector
+  SmallVector<SDValue> Elements(NumElts);
+  for (unsigned I = 0, E = NumElts; I != E; ++I) {
+    Elements[I] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorOp,
+                              DAG.getConstant(I, DL, MVT::i64));
+  }
+
+  // now build the computation graph in place at each level
+  SmallVector<SDValue> Level = Elements;
+  for (unsigned OpIdx = 0; Level.size() > 1 && OpIdx < Ops.size();) {
+    const auto [DefaultScalarOp, DefaultGroupSize] = Ops[OpIdx];
+
+    // partially reduce all elements in level
+    SmallVector<SDValue> ReducedLevel;
+    unsigned I = 0, E = Level.size();
+    for (; I + DefaultGroupSize <= E; I += DefaultGroupSize) {
+      // Reduce elements in groups of [DefaultGroupSize], as much as possible.
+      ReducedLevel.push_back(DAG.getNode(
+          DefaultScalarOp, DL, EltTy,
+          ArrayRef<SDValue>(Level).slice(I, DefaultGroupSize), Flags));
+    }
+
+    if (I < E) {
+      if (ReducedLevel.empty()) {
+        // The current operator requires more inputs than there are operands at
+        // this level. Pick a smaller operator and retry.
+        ++OpIdx;
+        assert(OpIdx < Ops.size() && "no smaller operators for reduction");
+        continue;
+      }
+
+      // Otherwise, we just have a remainder, which we push to the next level.
+      for (; I < E; ++I)
+        ReducedLevel.push_back(Level[I]);
+    }
+    Level = ReducedLevel;
+  }
+
+  return *Level.begin();
+}
+
+/// Lower fadd/fmul vector reductions. Builds a computation graph (tree) and
+/// serializes it.
+SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // If we can't reorder sub-operations, let DAGTypeLegalizer lower this op.
+  if (DisableFOpTreeReduce || !Op->getFlags().hasAllowReassociation())
+    return SDValue();
+
+  EVT EltTy = Op.getOperand(0).getValueType().getVectorElementType();
+  const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
+                             STI.getPTXVersion() >= 88;
+  SDLoc DL(Op);
+  SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> Operators;
+  switch (Op->getOpcode()) {
+  case ISD::VECREDUCE_FADD:
+    Operators = {{ISD::FADD, 2}};
+    break;
+  case ISD::VECREDUCE_FMUL:
+    Operators = {{ISD::FMUL, 2}};
+    break;
+  case ISD::VECREDUCE_FMAX:
+    if (CanUseMinMax3)
+      Operators.push_back({NVPTXISD::FMAXNUM3, 3});
+    Operators.push_back({ISD::FMAXNUM, 2});
+    break;
+  case ISD::VECREDUCE_FMIN:
+    if (CanUseMinMax3)
+      Operators.push_back({NVPTXISD::FMINNUM3, 3});
+    Operators.push_back({ISD::FMINNUM, 2});
+    break;
+  case ISD::VECREDUCE_FMAXIMUM:
+    if (CanUseMinMax3)
+      Operators.push_back({NVPTXISD::FMAXIMUM3, 3});
+    Operators.push_back({ISD::FMAXIMUM, 2});
+    break;
+  case ISD::VECREDUCE_FMINIMUM:
+    if (CanUseMinMax3)
+      Operators.push_back({NVPTXISD::FMINIMUM3, 3});
+    Operators.push_back({ISD::FMINIMUM, 2});
+    break;
+  default:
+    llvm_unreachable("unhandled vecreduce operation");
+  }
+
+  return BuildTreeReduction(Op.getOperand(0), Operators, DL, Op->getFlags(),
+                            DAG);
+}
+
 SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
   // Handle bitcasting from v2i8 without hitting the default promotion
   // strategy which goes through stack memory.
@@ -2879,6 +3000,13 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::VECREDUCE_FADD:
+  case ISD::VECREDUCE_FMUL:
+  case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMIN:
+  case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VECREDUCE_FMINIMUM:
+    return LowerVECREDUCE(Op, DAG);
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
   case ISD::LOAD:
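Note that LowerVECREDUCE above bails out (returns an empty SDValue) unless the node carries the reassociation fast-math flag, leaving the default DAGTypeLegalizer expansion in place. As a minimal sketch of what feeds this path (not part of the patch, and assuming the usual IRBuilder reduction helpers; the function name is hypothetical), a frontend would tag the reduce intrinsic as reassociable:

#include "llvm/IR/FMF.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Build an fmax reduction over a float vector (e.g. <8 x float>) and mark it
// reassoc so the NVPTX backend is allowed to reorder the partial maxes.
static Value *emitReassocFMaxReduce(IRBuilder<> &B, Value *Vec) {
  FastMathFlags FMF;
  FMF.setAllowReassoc();                  // the flag LowerVECREDUCE checks
  CallInst *R = B.CreateFPMaxReduce(Vec); // llvm.vector.reduce.fmax
  R->setFastMathFlags(FMF);               // without this, the default expansion runs
  return R;
}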

llvm/lib/Target/NVPTX/NVPTXISelLowering.h

Lines changed: 6 additions & 0 deletions
@@ -73,6 +73,11 @@ enum NodeType : unsigned {
   UNPACK_VECTOR,
 
   FCOPYSIGN,
+  FMAXNUM3,
+  FMINNUM3,
+  FMAXIMUM3,
+  FMINIMUM3,
+
   DYNAMIC_STACKALLOC,
   STACKRESTORE,
   STACKSAVE,
@@ -296,6 +301,7 @@ class NVPTXTargetLowering : public TargetLowering {
 
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 54 additions & 0 deletions
@@ -368,6 +368,46 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
               Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
 }
 
+// 3-input min/max (sm_100+) for f32 only
+multiclass FMINIMUMMAXIMUM3<string OpcStr, SDNode OpNode> {
+  def f32rrr_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"),
+              [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>,
+              Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>;
+  def f32rri_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"),
+              [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>,
+              Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>;
+  def f32rii_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"),
+              [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>,
+              Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>;
+  def f32rrr :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+              !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"),
+              [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>,
+              Requires<[hasPTX<88>, hasSM<100>]>;
+  def f32rri :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+              !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"),
+              [(set f32:$dst, (OpNode f32:$a, Float32Regs:$b, fpimm:$c))]>,
+              Requires<[hasPTX<88>, hasSM<100>]>;
+  def f32rii :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+              !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"),
+              [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>,
+              Requires<[hasPTX<88>, hasSM<100>]>;
+}
+
 // Template for instructions which take three FP args. The
 // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
 //
@@ -1101,6 +1141,20 @@ defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
 defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>;
 defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>;
 
+def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp,
+                            [SDNPCommutative, SDNPAssociative]>;
+def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp,
+                            [SDNPCommutative, SDNPAssociative]>;
+def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp,
+                            [SDNPCommutative, SDNPAssociative]>;
+def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp,
+                            [SDNPCommutative, SDNPAssociative]>;
+
+defm FMIN3 : FMINIMUMMAXIMUM3<"min", nvptx_fminnum3>;
+defm FMAX3 : FMINIMUMMAXIMUM3<"max", nvptx_fmaxnum3>;
+defm FMINNAN3 : FMINIMUMMAXIMUM3<"min.NaN", nvptx_fminimum3>;
+defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max.NaN", nvptx_fmaximum3>;
+
 defm FABS : F2<"abs", fabs>;
 defm FNEG : F2<"neg", fneg>;
 defm FABS_H: F2_Support_Half<"abs", fabs>;

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
@@ -83,6 +83,8 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
   }
   unsigned getMinVectorRegisterBitWidth() const { return 32; }
 
+  bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }
+
   // We don't want to prevent inlining because of target-cpu and -features
   // attributes that were added to newer versions of LLVM/Clang: There are
   // no incompatible functions in PTX, ptxas will throw errors in such cases.