Handle VECREDUCE intrinsics in NVPTX backend #136253

296 changes: 296 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -850,6 +850,27 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
if (STI.allowFP16Math() || STI.hasBF16Math())
setTargetDAGCombine(ISD::SETCC);

// Vector reduction operations. These may be turned into sequential, shuffle,
// or tree reductions depending on what instructions are available for each
// type.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
MVT EltVT = VT.getVectorElementType();
if (EltVT == MVT::f16 || EltVT == MVT::bf16 || EltVT == MVT::f32 ||
EltVT == MVT::f64) {
setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL,
ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_SEQ_FMUL,
ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
VT, Custom);
} else if (EltVT.isScalarInteger()) {
setOperationAction(
{ISD::VECREDUCE_ADD, ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND,
ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX,
ISD::VECREDUCE_SMIN, ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN},
VT, Custom);
}
}

// Promote fp16 arithmetic if fp16 hardware isn't available or the
// user passed --nvptx-no-fp16-math. The flag is useful because,
// although sm_53+ GPUs have some sort of FP16 support in
@@ -1083,6 +1104,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::BFI)
MAKE_CASE(NVPTXISD::PRMT)
MAKE_CASE(NVPTXISD::FCOPYSIGN)
MAKE_CASE(NVPTXISD::FMAXNUM3)
MAKE_CASE(NVPTXISD::FMINNUM3)
MAKE_CASE(NVPTXISD::FMAXIMUM3)
MAKE_CASE(NVPTXISD::FMINIMUM3)
MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
MAKE_CASE(NVPTXISD::STACKRESTORE)
MAKE_CASE(NVPTXISD::STACKSAVE)
@@ -2038,6 +2063,259 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}

/// A generic routine for constructing a tree reduction on a vector operand.
/// This method groups elements bottom-up, progressively building each level.
/// Unlike the shuffle reduction used in DAGTypeLegalizer and ExpandReductions,
/// adjacent elements are combined first, leading to shorter live ranges. This
/// approach makes the most sense if the shuffle reduction would use the same
/// number of registers.
///
/// The flags on the original reduction operation will be propagated to
/// each scalar operation.
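///
/// For example, with Ops = {{FMAXNUM3, 3}, {FMAXNUM, 2}} and a four-element
/// vector {a, b, c, d}, the first level yields {fmaxnum3(a, b, c), d}; the two
/// remaining values cannot fill a 3-input group, so the final level falls back
/// to the 2-input operator and produces fmaxnum(fmaxnum3(a, b, c), d).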
static SDValue BuildTreeReduction(
const SmallVector<SDValue> &Elements, EVT EltTy,
ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
// Build the reduction tree at each level, starting with all the elements.
SmallVector<SDValue> Level = Elements;

unsigned OpIdx = 0;
while (Level.size() > 1) {
// Try to reduce this level using the current operator.
const auto [DefaultScalarOp, DefaultGroupSize] = Ops[OpIdx];

// Build the next level by partially reducing all elements.
SmallVector<SDValue> ReducedLevel;
unsigned I = 0, E = Level.size();
for (; I + DefaultGroupSize <= E; I += DefaultGroupSize) {
// Reduce elements in groups of [DefaultGroupSize], as much as possible.
ReducedLevel.push_back(DAG.getNode(
DefaultScalarOp, DL, EltTy,
ArrayRef<SDValue>(Level).slice(I, DefaultGroupSize), Flags));
}

if (I < E) {
// Handle leftover elements.

if (ReducedLevel.empty()) {
// We didn't reduce anything at this level. We need to pick a smaller
// operator.
++OpIdx;
assert(OpIdx < Ops.size() && "no smaller operators for reduction");
continue;
}

// We reduced some elements, but a few are left over because the operator's
// arity does not evenly divide this level's size. Carry the remaining
// elements over to the next level.
for (; I < E; ++I)
ReducedLevel.push_back(Level[I]);
}

// Process the next level.
Level = ReducedLevel;
}

return *Level.begin();
}

/// Lower reductions to either a sequence of operations or a tree if
/// reassociations are allowed. This method will use larger operations like
/// max3/min3 when the target supports them.
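///
/// For example, an @llvm.vector.reduce.fmax over v8f32 carrying the 'reassoc'
/// flag is built as a small tree of max3/max nodes on sm_100+ with PTX 8.8,
/// while without the flag the elements are combined strictly in order, one
/// group at a time.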
SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
const SDNodeFlags Flags = Op->getFlags();
SDValue Vector;
SDValue Accumulator;

if (Op->getOpcode() == ISD::VECREDUCE_SEQ_FADD ||
Op->getOpcode() == ISD::VECREDUCE_SEQ_FMUL) {
// Special case: the accumulator is passed as the first operand.
Accumulator = Op.getOperand(0);
Vector = Op.getOperand(1);
} else {
// Default case: the vector being reduced is the only operand.
Vector = Op.getOperand(0);
}

EVT EltTy = Vector.getValueType().getVectorElementType();
const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
STI.getPTXVersion() >= 88;

// A list of SDNode opcodes with equivalent semantics, sorted descending by
// number of inputs they take.
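// E.g. for an fmax reduction on a target with max3 support this is
// {{NVPTXISD::FMAXNUM3, 3}, {ISD::FMAXNUM, 2}}.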
SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;

// Whether we can lower to scalar operations in an arbitrary order.
bool IsAssociative = allowUnsafeFPMath(DAG.getMachineFunction());

// Whether the data type and operation can be represented with fewer ops and
// registers in a shuffle reduction.
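// E.g. for a v2f16 reduction, the generic shuffle expansion can keep the
// operands packed and use a single f16x2 operation, rather than extracting
// both halves and combining them as scalars.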
bool PrefersShuffle;

switch (Op->getOpcode()) {
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_SEQ_FADD:
ScalarOps = {{ISD::FADD, 2}};
IsAssociative |= Op->getOpcode() == ISD::VECREDUCE_FADD;
// Prefer add.{,b}f16x2 for v2{,b}f16
PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
break;
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_SEQ_FMUL:
ScalarOps = {{ISD::FMUL, 2}};
IsAssociative |= Op->getOpcode() == ISD::VECREDUCE_FMUL;
// Prefer mul.{,b}f16x2 for v2{,b}f16
PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
break;
case ISD::VECREDUCE_FMAX:
if (CanUseMinMax3)
ScalarOps.push_back({NVPTXISD::FMAXNUM3, 3});
ScalarOps.push_back({ISD::FMAXNUM, 2});
// IEEE 754-2008 maxNum is non-associative due to its handling of sNaN
// inputs. Allow overriding with fast-math or the 'reassoc' flag.
IsAssociative |= Flags.hasAllowReassociation();
PrefersShuffle = false;
break;
case ISD::VECREDUCE_FMIN:
if (CanUseMinMax3)
ScalarOps.push_back({NVPTXISD::FMINNUM3, 3});
ScalarOps.push_back({ISD::FMINNUM, 2});
// IEEE 754-2008 minNum is non-associative due to its handling of sNaN
// inputs. Allow overriding with fast-math or the 'reassoc' flag.
IsAssociative |= Flags.hasAllowReassociation();
PrefersShuffle = false;
break;
case ISD::VECREDUCE_FMAXIMUM:
if (CanUseMinMax3) {
ScalarOps.push_back({NVPTXISD::FMAXIMUM3, 3});
// Can't use fmax3 in shuffle reduction
PrefersShuffle = false;
} else {
// Prefer max.{,b}f16x2 for v2{,b}f16
PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
}
ScalarOps.push_back({ISD::FMAXIMUM, 2});
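// ISD::FMAXIMUM (llvm.maximum) propagates NaNs and orders -0.0 below +0.0,
// so the reduction is associative regardless of fast-math flags.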
IsAssociative = true;
break;
case ISD::VECREDUCE_FMINIMUM:
if (CanUseMinMax3) {
ScalarOps.push_back({NVPTXISD::FMINIMUM3, 3});
// Can't use fmin3 in shuffle reduction
PrefersShuffle = false;
} else {
// Prefer min.{,b}f16x2 for v2{,b}f16
PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
}
ScalarOps.push_back({ISD::FMINIMUM, 2});
IsAssociative = true;
break;
case ISD::VECREDUCE_ADD:
ScalarOps = {{ISD::ADD, 2}};
IsAssociative = true;
// Prefer add.{s,u}16x2 for v2i16
PrefersShuffle = EltTy == MVT::i16;
break;
case ISD::VECREDUCE_MUL:
ScalarOps = {{ISD::MUL, 2}};
IsAssociative = true;
// Integer multiply doesn't support packed types
PrefersShuffle = false;
break;
case ISD::VECREDUCE_UMAX:
ScalarOps = {{ISD::UMAX, 2}};
IsAssociative = true;
// Prefer max.u16x2 for v2i16
PrefersShuffle = EltTy == MVT::i16;
break;
case ISD::VECREDUCE_UMIN:
ScalarOps = {{ISD::UMIN, 2}};
IsAssociative = true;
// Prefer min.u16x2 for v2i16
PrefersShuffle = EltTy == MVT::i16;
break;
case ISD::VECREDUCE_SMAX:
ScalarOps = {{ISD::SMAX, 2}};
IsAssociative = true;
// Prefer max.s16x2 for v2i16
PrefersShuffle = EltTy == MVT::i16;
break;
case ISD::VECREDUCE_SMIN:
ScalarOps = {{ISD::SMIN, 2}};
IsAssociative = true;
// Prefer min.s16x2 for v2i16
PrefersShuffle = EltTy == MVT::i16;
break;
case ISD::VECREDUCE_AND:
ScalarOps = {{ISD::AND, 2}};
IsAssociative = true;
// Prefer and.b32 for v2i16.
PrefersShuffle = EltTy == MVT::i16;
break;
case ISD::VECREDUCE_OR:
ScalarOps = {{ISD::OR, 2}};
IsAssociative = true;
// Prefer or.b32 for v2i16.
PrefersShuffle = EltTy == MVT::i16;
break;
case ISD::VECREDUCE_XOR:
ScalarOps = {{ISD::XOR, 2}};
IsAssociative = true;
// Prefer xor.b32 for v2i16.
PrefersShuffle = EltTy == MVT::i16;
break;
default:
llvm_unreachable("unhandled vecreduce operation");
}

// We don't expect an accumulator for reassociative vector reduction ops.
assert((!IsAssociative || !Accumulator) && "unexpected accumulator");

// If shuffle reduction is preferred, leave it to SelectionDAG.
if (IsAssociative && PrefersShuffle)
return SDValue();

// Otherwise, handle the reduction here.
SmallVector<SDValue> Elements;
DAG.ExtractVectorElements(Vector, Elements);

// Lower to tree reduction.
if (IsAssociative)
return BuildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);

// Lower to sequential reduction.
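// E.g. a VECREDUCE_SEQ_FADD of v4f32 with start value A becomes
// fadd(fadd(fadd(fadd(A, e0), e1), e2), e3), preserving the in-order
// semantics of the intrinsic.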
EVT VectorTy = Vector.getValueType();
const unsigned NumElts = VectorTy.getVectorNumElements();
for (unsigned OpIdx = 0, I = 0; I < NumElts; ++OpIdx) {
// Try to reduce the remaining sequence as much as possible using the
// current operator.
assert(OpIdx < ScalarOps.size() && "no smaller operators for reduction");
const auto [DefaultScalarOp, DefaultGroupSize] = ScalarOps[OpIdx];

if (!Accumulator) {
// Try to initialize the accumulator using the current operator.
if (I + DefaultGroupSize <= NumElts) {
Accumulator = DAG.getNode(
DefaultScalarOp, DL, EltTy,
ArrayRef(Elements).slice(I, DefaultGroupSize), Flags);
I += DefaultGroupSize;
}
}

if (Accumulator) {
for (; I + (DefaultGroupSize - 1) <= NumElts; I += DefaultGroupSize - 1) {
SmallVector<SDValue> Operands = {Accumulator};
for (unsigned K = 0; K < DefaultGroupSize - 1; ++K)
Operands.push_back(Elements[I + K]);
Accumulator = DAG.getNode(DefaultScalarOp, DL, EltTy, Operands, Flags);
}
}
}

return Accumulator;
}

SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
@@ -2869,6 +3147,24 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::CONCAT_VECTORS:
return LowerCONCAT_VECTORS(Op, DAG);
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_SEQ_FADD:
case ISD::VECREDUCE_SEQ_FMUL:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAXIMUM:
case ISD::VECREDUCE_FMINIMUM:
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
return LowerVECREDUCE(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::LOAD:
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -60,6 +60,11 @@ enum NodeType : unsigned {
UNPACK_VECTOR,

FCOPYSIGN,
FMAXNUM3,
FMINNUM3,
FMAXIMUM3,
FMINIMUM3,

DYNAMIC_STACKALLOC,
STACKRESTORE,
STACKSAVE,
@@ -279,6 +284,7 @@ class NVPTXTargetLowering : public TargetLowering {

SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;