From 7a4de918c44337259ca65ac0bce0649ecdf06392 Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Wed, 16 Apr 2025 17:17:47 -0700 Subject: [PATCH] [NVPTX] lower VECREDUCE intrinsics to tree reduction Also adds support for sm_100+ fmax3/fmin3 instructions, introduced in PTX 8.8. This method of tree reduction has a few benefits over the default in DAGTypeLegalizer: - The default shuffle reduction progressively halves and partially reduces the vector down until we reach a single element. This produces a sequence of operations that combine disparate elements of the vector. For example, `vecreduce_fadd <4 x f32>` will give `(a + c) + (b + d)`, whereas the tree reduction produces (a + b) + (c + d) by grouping nearby elements together first. Both use the same number of registers, but the shuffle reduction has longer live ranges. The same example is graphed below. Note we hold onto 3 registers for 2 cycles in the shuffle reduction and 1 cycle in tree reduction. (shuffle reduction) PTX: %r1 = add.f32 a, c %r2 = add.f32 b, d %r3 = add.f32 %r1, %r2 Pipeline: cycles ----> | 1 | 2 | 3 | 4 | 5 | 6 | | a = load.f32 | b = load.f32 | c = load.f32 | d = load.f32 | | | | | | | %r1 = add.f32 a, c | %r2 = add.f32 b, d | %r3 = add.f32 %r1, %r2 | live regs ----> | a [1R] | a b [2R] | a b c [3R] | b d %r1 [3R] | %r1 %r2 [2R] | %r3 [1R] | (tree reduction) PTX: %r1 = add.f32 a, b %r2 = add.f32 c, d %r3 = add.f32 %r1, %r2 Pipeline: cycles ----> | 1 | 2 | 3 | 4 | 5 | 6 | | a = load.f32 | b = load.f32 | c = load.f32 | d = load.f32 | | | | | | %r1 = add.f32 a, b | | %r2 = add.f32 c, d | %r3 = add.f32 %r1, %r2 | live regs ----> | a [1R] | a b [2R] | c %r1 [2R] | c %r1 d [3R] | %r1 %r2 [2R] | %r3 [1R] | - The shuffle reduction cannot easily support fmax3/fmin3 because it progressively halves the input vector. - Faster compile time. Happens in one pass over the intrinsic, rather than O(N) passes if iteratively splitting the vector operands. 
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 296 ++++ llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 6 + llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 54 + .../Target/NVPTX/NVPTXTargetTransformInfo.h | 4 + .../CodeGen/NVPTX/reduction-intrinsics.ll | 1431 +++++++++-------- 5 files changed, 1103 insertions(+), 688 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d817a3c6a8777..c185dd672ada1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -850,6 +850,27 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, if (STI.allowFP16Math() || STI.hasBF16Math()) setTargetDAGCombine(ISD::SETCC); + // Vector reduction operations. These may be turned into sequential, shuffle, + // or tree reductions depending on what instructions are available for each + // type. + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + MVT EltVT = VT.getVectorElementType(); + if (EltVT == MVT::f16 || EltVT == MVT::bf16 || EltVT == MVT::f32 || + EltVT == MVT::f64) { + setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL, + ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_SEQ_FMUL, + ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN, + ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM}, + VT, Custom); + } else if (EltVT.isScalarInteger()) { + setOperationAction( + {ISD::VECREDUCE_ADD, ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND, + ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX, + ISD::VECREDUCE_SMIN, ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN}, + VT, Custom); + } + } + // Promote fp16 arithmetic if fp16 hardware isn't available or the // user passed --nvptx-no-fp16-math. 
The flag is useful because, // although sm_53+ GPUs have some sort of FP16 support in @@ -1083,6 +1104,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::BFI) MAKE_CASE(NVPTXISD::PRMT) MAKE_CASE(NVPTXISD::FCOPYSIGN) + MAKE_CASE(NVPTXISD::FMAXNUM3) + MAKE_CASE(NVPTXISD::FMINNUM3) + MAKE_CASE(NVPTXISD::FMAXIMUM3) + MAKE_CASE(NVPTXISD::FMINIMUM3) MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC) MAKE_CASE(NVPTXISD::STACKRESTORE) MAKE_CASE(NVPTXISD::STACKSAVE) @@ -2038,6 +2063,259 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } +/// A generic routine for constructing a tree reduction on a vector operand. +/// This method groups elements bottom-up, progressively building each level. +/// Unlike the shuffle reduction used in DAGTypeLegalizer and ExpandReductions, +/// adjacent elements are combined first, leading to shorter live ranges. This +/// approach makes the most sense if the shuffle reduction would use the same +/// amount of registers. +/// +/// The flags on the original reduction operation will be propagated to +/// each scalar operation. +static SDValue BuildTreeReduction( + const SmallVector &Elements, EVT EltTy, + ArrayRef> Ops, + const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) { + // Build the reduction tree at each level, starting with all the elements. + SmallVector Level = Elements; + + unsigned OpIdx = 0; + while (Level.size() > 1) { + // Try to reduce this level using the current operator. + const auto [DefaultScalarOp, DefaultGroupSize] = Ops[OpIdx]; + + // Build the next level by partially reducing all elements. + SmallVector ReducedLevel; + unsigned I = 0, E = Level.size(); + for (; I + DefaultGroupSize <= E; I += DefaultGroupSize) { + // Reduce elements in groups of [DefaultGroupSize], as much as possible. 
+ ReducedLevel.push_back(DAG.getNode( + DefaultScalarOp, DL, EltTy, + ArrayRef(Level).slice(I, DefaultGroupSize), Flags)); + } + + if (I < E) { + // Handle leftover elements. + + if (ReducedLevel.empty()) { + // We didn't reduce anything at this level. We need to pick a smaller + // operator. + ++OpIdx; + assert(OpIdx < Ops.size() && "no smaller operators for reduction"); + continue; + } + + // We reduced some things but there's still more left, meaning the + // operator's number of inputs doesn't evenly divide this level size. Move + // these elements to the next level. + for (; I < E; ++I) + ReducedLevel.push_back(Level[I]); + } + + // Process the next level. + Level = ReducedLevel; + } + + return *Level.begin(); +} + +/// Lower reductions to either a sequence of operations or a tree if +/// reassociations are allowed. This method will use larger operations like +/// max3/min3 when the target supports them. +SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const SDNodeFlags Flags = Op->getFlags(); + SDValue Vector; + SDValue Accumulator; + + if (Op->getOpcode() == ISD::VECREDUCE_SEQ_FADD || + Op->getOpcode() == ISD::VECREDUCE_SEQ_FMUL) { + // special case with accumulator as first arg + Accumulator = Op.getOperand(0); + Vector = Op.getOperand(1); + } else { + // default case + Vector = Op.getOperand(0); + } + + EVT EltTy = Vector.getValueType().getVectorElementType(); + const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 && + STI.getPTXVersion() >= 88; + + // A list of SDNode opcodes with equivalent semantics, sorted descending by + // number of inputs they take. + SmallVector, 2> ScalarOps; + + // Whether we can lower to scalar operations in an arbitrary order. + bool IsAssociative = allowUnsafeFPMath(DAG.getMachineFunction()); + + // Whether the data type and operation can be represented with fewer ops and + // registers in a shuffle reduction. 
+ bool PrefersShuffle; + + switch (Op->getOpcode()) { + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_SEQ_FADD: + ScalarOps = {{ISD::FADD, 2}}; + IsAssociative |= Op->getOpcode() == ISD::VECREDUCE_FADD; + // Prefer add.{,b}f16x2 for v2{,b}f16 + PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16; + break; + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_SEQ_FMUL: + ScalarOps = {{ISD::FMUL, 2}}; + IsAssociative |= Op->getOpcode() == ISD::VECREDUCE_FMUL; + // Prefer mul.{,b}f16x2 for v2{,b}f16 + PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16; + break; + case ISD::VECREDUCE_FMAX: + if (CanUseMinMax3) + ScalarOps.push_back({NVPTXISD::FMAXNUM3, 3}); + ScalarOps.push_back({ISD::FMAXNUM, 2}); + // Definition of maxNum in IEEE 754 2008 is non-associative due to handling + // of sNaN inputs. Allow overriding with fast-math or 'reassoc' attribute. + IsAssociative |= Flags.hasAllowReassociation(); + PrefersShuffle = false; + break; + case ISD::VECREDUCE_FMIN: + if (CanUseMinMax3) + ScalarOps.push_back({NVPTXISD::FMINNUM3, 3}); + ScalarOps.push_back({ISD::FMINNUM, 2}); + // Definition of minNum in IEEE 754 2008 is non-associative due to handling + // of sNaN inputs. Allow overriding with fast-math or 'reassoc' attribute. 
+ IsAssociative |= Flags.hasAllowReassociation(); + PrefersShuffle = false; + break; + case ISD::VECREDUCE_FMAXIMUM: + if (CanUseMinMax3) { + ScalarOps.push_back({NVPTXISD::FMAXIMUM3, 3}); + // Can't use fmax3 in shuffle reduction + PrefersShuffle = false; + } else { + // Prefer max.{,b}f16x2 for v2{,b}f16 + PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16; + } + ScalarOps.push_back({ISD::FMAXIMUM, 2}); + IsAssociative = true; + break; + case ISD::VECREDUCE_FMINIMUM: + if (CanUseMinMax3) { + ScalarOps.push_back({NVPTXISD::FMINIMUM3, 3}); + // Can't use fmin3 in shuffle reduction + PrefersShuffle = false; + } else { + // Prefer min.{,b}f16x2 for v2{,b}f16 + PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16; + } + ScalarOps.push_back({ISD::FMINIMUM, 2}); + IsAssociative = true; + break; + case ISD::VECREDUCE_ADD: + ScalarOps = {{ISD::ADD, 2}}; + IsAssociative = true; + // Prefer add.{s,u}16x2 for v2i16 + PrefersShuffle = EltTy == MVT::i16; + break; + case ISD::VECREDUCE_MUL: + ScalarOps = {{ISD::MUL, 2}}; + IsAssociative = true; + // Integer multiply doesn't support packed types + PrefersShuffle = false; + break; + case ISD::VECREDUCE_UMAX: + ScalarOps = {{ISD::UMAX, 2}}; + IsAssociative = true; + // Prefer max.u16x2 for v2i16 + PrefersShuffle = EltTy == MVT::i16; + break; + case ISD::VECREDUCE_UMIN: + ScalarOps = {{ISD::UMIN, 2}}; + IsAssociative = true; + // Prefer min.u16x2 for v2i16 + PrefersShuffle = EltTy == MVT::i16; + break; + case ISD::VECREDUCE_SMAX: + ScalarOps = {{ISD::SMAX, 2}}; + IsAssociative = true; + // Prefer max.s16x2 for v2i16 + PrefersShuffle = EltTy == MVT::i16; + break; + case ISD::VECREDUCE_SMIN: + ScalarOps = {{ISD::SMIN, 2}}; + IsAssociative = true; + // Prefer min.s16x2 for v2i16 + PrefersShuffle = EltTy == MVT::i16; + break; + case ISD::VECREDUCE_AND: + ScalarOps = {{ISD::AND, 2}}; + IsAssociative = true; + // Prefer and.b32 for v2i16. 
+ PrefersShuffle = EltTy == MVT::i16; + break; + case ISD::VECREDUCE_OR: + ScalarOps = {{ISD::OR, 2}}; + IsAssociative = true; + // Prefer or.b32 for v2i16. + PrefersShuffle = EltTy == MVT::i16; + break; + case ISD::VECREDUCE_XOR: + ScalarOps = {{ISD::XOR, 2}}; + IsAssociative = true; + // Prefer xor.b32 for v2i16. + PrefersShuffle = EltTy == MVT::i16; + break; + default: + llvm_unreachable("unhandled vecreduce operation"); + } + + // An accumulator means an ordered (SEQ) reduction; never reassociate it. + IsAssociative &= !Accumulator; + + // If shuffle reduction is preferred, leave it to SelectionDAG. + if (IsAssociative && PrefersShuffle) + return SDValue(); + + // Otherwise, handle the reduction here. + SmallVector Elements; + DAG.ExtractVectorElements(Vector, Elements); + + // Lower to tree reduction. + if (IsAssociative) + return BuildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG); + + // Lower to sequential reduction. + EVT VectorTy = Vector.getValueType(); + const unsigned NumElts = VectorTy.getVectorNumElements(); + for (unsigned OpIdx = 0, I = 0; I < NumElts; ++OpIdx) { + // Try to reduce the remaining sequence as much as possible using the + // current operator. + assert(OpIdx < ScalarOps.size() && "no smaller operators for reduction"); + const auto [DefaultScalarOp, DefaultGroupSize] = ScalarOps[OpIdx]; + + if (!Accumulator) { + // Try to initialize the accumulator using the current operator. 
+ if (I + DefaultGroupSize <= NumElts) { + Accumulator = DAG.getNode( + DefaultScalarOp, DL, EltTy, + ArrayRef(Elements).slice(I, DefaultGroupSize), Flags); + I += DefaultGroupSize; + } + } + + if (Accumulator) { + for (; I + (DefaultGroupSize - 1) <= NumElts; I += DefaultGroupSize - 1) { + SmallVector Operands = {Accumulator}; + for (unsigned K = 0; K < DefaultGroupSize - 1; ++K) + Operands.push_back(Elements[I + K]); + Accumulator = DAG.getNode(DefaultScalarOp, DL, EltTy, Operands, Flags); + } + } + } + + return Accumulator; +} + SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { // Handle bitcasting from v2i8 without hitting the default promotion // strategy which goes through stack memory. @@ -2869,6 +3147,24 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_SEQ_FADD: + case ISD::VECREDUCE_SEQ_FMUL: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + return LowerVECREDUCE(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::LOAD: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 3a8091fecfde1..e2f07074f77f1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -60,6 +60,11 @@ enum NodeType : unsigned { UNPACK_VECTOR, FCOPYSIGN, + FMAXNUM3, + FMINNUM3, + FMAXIMUM3, + FMINIMUM3, + DYNAMIC_STACKALLOC, STACKRESTORE, STACKSAVE, @@ -279,6 +284,7 @@ class NVPTXTargetLowering : public TargetLowering { 
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 1a2515b7f66f3..6652bfa81e5d2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -395,6 +395,46 @@ multiclass FMINIMUMMAXIMUM { Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>; } +// 3-input min/max (sm_100+) for f32 only +multiclass FMINIMUMMAXIMUM3 { + def f32rrr_ftz : + NVPTXInst<(outs B32:$dst), + (ins B32:$a, B32:$b, B32:$c), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"), + [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>, + Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>; + def f32rri_ftz : + NVPTXInst<(outs B32:$dst), + (ins B32:$a, B32:$b, f32imm:$c), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"), + [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>, + Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>; + def f32rii_ftz : + NVPTXInst<(outs B32:$dst), + (ins B32:$a, f32imm:$b, f32imm:$c), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b, $c;"), + [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>, + Requires<[doF32FTZ, hasPTX<88>, hasSM<100>]>; + def f32rrr : + NVPTXInst<(outs B32:$dst), + (ins B32:$a, B32:$b, B32:$c), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"), + [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; + def f32rri : + NVPTXInst<(outs B32:$dst), + (ins B32:$a, B32:$b, f32imm:$c), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"), + [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; + def f32rii : + NVPTXInst<(outs 
B32:$dst), + (ins B32:$a, f32imm:$b, f32imm:$c), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b, $c;"), + [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>, + Requires<[hasPTX<88>, hasSM<100>]>; +} + // Template for instructions which take three FP args. The // instructions are named ".f" (e.g. "add.f64"). // @@ -1106,6 +1146,20 @@ defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>; defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>; defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>; +def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp, + [SDNPCommutative]>; +def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp, + [SDNPCommutative]>; + +defm FMIN3 : FMINIMUMMAXIMUM3<"min", nvptx_fminnum3>; +defm FMAX3 : FMINIMUMMAXIMUM3<"max", nvptx_fmaxnum3>; +defm FMINNAN3 : FMINIMUMMAXIMUM3<"min.NaN", nvptx_fminimum3>; +defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max.NaN", nvptx_fmaximum3>; + defm FABS : F2<"abs", fabs>; defm FNEG : F2<"neg", fneg>; defm FABS_H: F2_Support_Half<"abs", fabs>; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index aa7850acbd64a..f78d77add4365 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -87,6 +87,10 @@ class NVPTXTTIImpl final : public BasicTTIImplBase { } unsigned getMinVectorRegisterBitWidth() const override { return 32; } + bool shouldExpandReduction(const IntrinsicInst *II) const override { + return false; + } + // We don't want to prevent inlining because of target-cpu and -features // attributes that were added to newer versions of LLVM/Clang: There are // no incompatible functions in PTX, ptxas will throw errors in such cases. 
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index e10949f95fac4..fcac4732925ca 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -2,19 +2,18 @@ ; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM80 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ +; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_80 %} -; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \ +; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \ +; RUN: %if ptxas-12.9 %{ llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" -; Check straight line reduction. 
define half @reduce_fadd_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fadd_half( ; CHECK: { @@ -43,45 +42,22 @@ define half @reduce_fadd_half(<8 x half> %in) { } define half @reduce_fadd_half_reassoc(<8 x half> %in) { -; CHECK-SM80-LABEL: reduce_fadd_half_reassoc( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<6>; -; CHECK-SM80-NEXT: .reg .b32 %r<10>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-SM80-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM80-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM80-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: add.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-SM80-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_fadd_half_reassoc( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<6>; -; CHECK-SM100-NEXT: .reg .b32 %r<10>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-SM100-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM100-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM100-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: add.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-SM100-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_fadd_half_reassoc( +; CHECK: { +; 
CHECK-NEXT: .reg .b16 %rs<6>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; +; CHECK-NEXT: add.rn.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: add.rn.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: add.rn.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: add.rn.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: mov.b16 %rs4, 0x0000; +; CHECK-NEXT: add.rn.f16 %rs5, %rs3, %rs4; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in) ret half %res } @@ -109,7 +85,6 @@ define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) { ret half %res } -; Check straight-line reduction. define float @reduce_fadd_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fadd_float( ; CHECK: { @@ -140,13 +115,13 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0]; -; CHECK-NEXT: add.rn.f32 %r9, %r3, %r7; -; CHECK-NEXT: add.rn.f32 %r10, %r1, %r5; -; CHECK-NEXT: add.rn.f32 %r11, %r4, %r8; -; CHECK-NEXT: add.rn.f32 %r12, %r2, %r6; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r11; -; CHECK-NEXT: add.rn.f32 %r14, %r10, %r9; -; CHECK-NEXT: add.rn.f32 %r15, %r14, %r13; +; CHECK-NEXT: add.rn.f32 %r9, %r7, %r8; +; CHECK-NEXT: add.rn.f32 %r10, %r5, %r6; +; CHECK-NEXT: add.rn.f32 %r11, %r10, %r9; +; CHECK-NEXT: add.rn.f32 %r12, %r3, %r4; +; CHECK-NEXT: add.rn.f32 %r13, %r1, %r2; +; CHECK-NEXT: add.rn.f32 %r14, %r13, %r12; +; CHECK-NEXT: add.rn.f32 %r15, %r14, %r11; ; CHECK-NEXT: add.rn.f32 %r16, %r15, 0f00000000; ; CHECK-NEXT: st.param.b32 [func_retval0], %r16; ; CHECK-NEXT: ret; @@ -163,12 +138,12 @@ define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) { ; CHECK-NEXT: ld.param.b32 %r7, 
[reduce_fadd_float_reassoc_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: add.rn.f32 %r8, %r3, %r7; -; CHECK-NEXT: add.rn.f32 %r9, %r1, %r5; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: add.rn.f32 %r11, %r2, %r6; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: add.rn.f32 %r13, %r10, %r12; +; CHECK-NEXT: add.rn.f32 %r8, %r5, %r6; +; CHECK-NEXT: add.rn.f32 %r9, %r8, %r7; +; CHECK-NEXT: add.rn.f32 %r10, %r3, %r4; +; CHECK-NEXT: add.rn.f32 %r11, %r1, %r2; +; CHECK-NEXT: add.rn.f32 %r12, %r11, %r10; +; CHECK-NEXT: add.rn.f32 %r13, %r12, %r9; ; CHECK-NEXT: add.rn.f32 %r14, %r13, 0f00000000; ; CHECK-NEXT: st.param.b32 [func_retval0], %r14; ; CHECK-NEXT: ret; @@ -176,7 +151,6 @@ define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) { ret float %res } -; Check straight line reduction. define half @reduce_fmul_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fmul_half( ; CHECK: { @@ -203,41 +177,20 @@ define half @reduce_fmul_half(<8 x half> %in) { } define half @reduce_fmul_half_reassoc(<8 x half> %in) { -; CHECK-SM80-LABEL: reduce_fmul_half_reassoc( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<10>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-SM80-NEXT: ret; -; -; 
CHECK-SM100-LABEL: reduce_fmul_half_reassoc( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<10>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_fmul_half_reassoc( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-NEXT: mul.rn.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: mul.rn.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: mul.rn.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in) ret half %res } @@ -268,7 +221,6 @@ define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) { ret half %res } -; Check straight-line reduction. 
define float @reduce_fmul_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmul_float( ; CHECK: { @@ -298,13 +250,13 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0]; -; CHECK-NEXT: mul.rn.f32 %r9, %r3, %r7; -; CHECK-NEXT: mul.rn.f32 %r10, %r1, %r5; -; CHECK-NEXT: mul.rn.f32 %r11, %r4, %r8; -; CHECK-NEXT: mul.rn.f32 %r12, %r2, %r6; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r11; -; CHECK-NEXT: mul.rn.f32 %r14, %r10, %r9; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r13; +; CHECK-NEXT: mul.rn.f32 %r9, %r7, %r8; +; CHECK-NEXT: mul.rn.f32 %r10, %r5, %r6; +; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r9; +; CHECK-NEXT: mul.rn.f32 %r12, %r3, %r4; +; CHECK-NEXT: mul.rn.f32 %r13, %r1, %r2; +; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r12; +; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) @@ -320,33 +272,38 @@ define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) { ; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: mul.rn.f32 %r8, %r3, %r7; -; CHECK-NEXT: mul.rn.f32 %r9, %r1, %r5; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: mul.rn.f32 %r11, %r2, %r6; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: mul.rn.f32 %r13, %r10, %r12; +; CHECK-NEXT: mul.rn.f32 %r8, %r5, %r6; +; CHECK-NEXT: mul.rn.f32 %r9, %r8, %r7; +; CHECK-NEXT: mul.rn.f32 %r10, %r3, %r4; +; CHECK-NEXT: mul.rn.f32 %r11, %r1, %r2; +; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r10; +; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r9; ; 
CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in) ret float %res } -; Check straight line reduction. define half @reduce_fmax_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fmax_half( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b16 %rs<16>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_param_0]; -; CHECK-NEXT: max.f16x2 %r5, %r2, %r4; -; CHECK-NEXT: max.f16x2 %r6, %r1, %r3; -; CHECK-NEXT: max.f16x2 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: max.f16 %rs3, %rs1, %rs2; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; CHECK-NEXT: max.f16 %rs9, %rs7, %rs8; +; CHECK-NEXT: max.f16 %rs10, %rs9, %rs5; +; CHECK-NEXT: max.f16 %rs11, %rs10, %rs6; +; CHECK-NEXT: max.f16 %rs12, %rs11, %rs3; +; CHECK-NEXT: max.f16 %rs13, %rs12, %rs4; +; CHECK-NEXT: max.f16 %rs14, %rs13, %rs1; +; CHECK-NEXT: max.f16 %rs15, %rs14, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs15; ; CHECK-NEXT: ret; %res = call half @llvm.vector.reduce.fmax(<8 x half> %in) ret half %res @@ -355,17 +312,23 @@ define half @reduce_fmax_half(<8 x half> %in) { define half @reduce_fmax_half_reassoc(<8 x half> %in) { ; CHECK-LABEL: reduce_fmax_half_reassoc( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b16 %rs<16>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_reassoc_param_0]; -; CHECK-NEXT: max.f16x2 %r5, %r2, %r4; -; CHECK-NEXT: max.f16x2 %r6, %r1, %r3; -; CHECK-NEXT: max.f16x2 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; 
CHECK-NEXT: max.f16 %rs3, %rs1, %rs2; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; +; CHECK-NEXT: max.f16 %rs9, %rs7, %rs8; +; CHECK-NEXT: max.f16 %rs10, %rs5, %rs6; +; CHECK-NEXT: max.f16 %rs11, %rs10, %rs9; +; CHECK-NEXT: max.f16 %rs12, %rs3, %rs4; +; CHECK-NEXT: max.f16 %rs13, %rs1, %rs2; +; CHECK-NEXT: max.f16 %rs14, %rs13, %rs12; +; CHECK-NEXT: max.f16 %rs15, %rs14, %rs11; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs15; ; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmax(<8 x half> %in) ret half %res @@ -374,111 +337,152 @@ define half @reduce_fmax_half_reassoc(<8 x half> %in) { define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) { ; CHECK-LABEL: reduce_fmax_half_reassoc_nonpow2( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<12>; -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b16 %rs<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmax_half_reassoc_nonpow2_param_0+8]; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [reduce_fmax_half_reassoc_nonpow2_param_0]; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmax_half_reassoc_nonpow2_param_0+12]; -; CHECK-NEXT: max.f16x2 %r4, %r2, %r1; -; CHECK-NEXT: mov.b16 %rs8, 0xFE00; -; CHECK-NEXT: mov.b32 %r5, {%rs7, %rs8}; -; CHECK-NEXT: max.f16x2 %r6, %r3, %r5; -; CHECK-NEXT: max.f16x2 %r7, %r4, %r6; -; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7; -; CHECK-NEXT: max.f16 %rs11, %rs9, %rs10; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs11; +; CHECK-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [reduce_fmax_half_reassoc_nonpow2_param_0+8]; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmax_half_reassoc_nonpow2_param_0]; +; CHECK-NEXT: max.f16 %rs8, %rs5, %rs6; +; 
CHECK-NEXT: max.f16 %rs9, %rs8, %rs7; +; CHECK-NEXT: max.f16 %rs10, %rs3, %rs4; +; CHECK-NEXT: max.f16 %rs11, %rs1, %rs2; +; CHECK-NEXT: max.f16 %rs12, %rs11, %rs10; +; CHECK-NEXT: max.f16 %rs13, %rs12, %rs9; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs13; ; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmax(<7 x half> %in) ret half %res } -; Check straight-line reduction. define float @reduce_fmax_float(<8 x float> %in) { ; -; CHECK-LABEL: reduce_fmax_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0]; -; CHECK-NEXT: max.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.f32 %r10, %r2, %r6; -; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.f32 %r13, %r1, %r5; -; CHECK-NEXT: max.f32 %r14, %r13, %r12; -; CHECK-NEXT: max.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmax_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0]; +; CHECK-SM80-NEXT: max.f32 %r9, %r1, %r2; +; CHECK-SM80-NEXT: max.f32 %r10, %r9, %r3; +; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r4; +; CHECK-SM80-NEXT: max.f32 %r12, %r11, %r5; +; CHECK-SM80-NEXT: max.f32 %r13, %r12, %r6; +; CHECK-SM80-NEXT: max.f32 %r14, %r13, %r7; +; CHECK-SM80-NEXT: max.f32 %r15, %r14, %r8; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmax_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, 
%r7, %r8}, [reduce_fmax_float_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0]; +; CHECK-SM100-NEXT: max.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: max.f32 %r10, %r9, %r4, %r5; +; CHECK-SM100-NEXT: max.f32 %r11, %r10, %r6, %r7; +; CHECK-SM100-NEXT: max.f32 %r12, %r11, %r8; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fmax(<8 x float> %in) ret float %res } define float @reduce_fmax_float_reassoc(<8 x float> %in) { ; -; CHECK-LABEL: reduce_fmax_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0]; -; CHECK-NEXT: max.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.f32 %r10, %r2, %r6; -; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.f32 %r13, %r1, %r5; -; CHECK-NEXT: max.f32 %r14, %r13, %r12; -; CHECK-NEXT: max.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmax_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0]; +; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: max.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: max.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmax_float_reassoc( +; 
CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0]; +; CHECK-SM100-NEXT: max.f32 %r9, %r4, %r5, %r6; +; CHECK-SM100-NEXT: max.f32 %r10, %r1, %r2, %r3; +; CHECK-SM100-NEXT: max.f32 %r11, %r10, %r9, %r7; +; CHECK-SM100-NEXT: max.f32 %r12, %r11, %r8; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmax(<8 x float> %in) ret float %res } define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) { ; -; CHECK-LABEL: reduce_fmax_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: max.f32 %r8, %r3, %r7; -; CHECK-NEXT: max.f32 %r9, %r1, %r5; -; CHECK-NEXT: max.f32 %r10, %r9, %r8; -; CHECK-NEXT: max.f32 %r11, %r2, %r6; -; CHECK-NEXT: max.f32 %r12, %r11, %r4; -; CHECK-NEXT: max.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmax_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: max.f32 %r8, %r5, %r6; +; CHECK-SM80-NEXT: max.f32 %r9, %r8, %r7; +; CHECK-SM80-NEXT: 
max.f32 %r10, %r3, %r4; +; CHECK-SM80-NEXT: max.f32 %r11, %r1, %r2; +; CHECK-SM80-NEXT: max.f32 %r12, %r11, %r10; +; CHECK-SM80-NEXT: max.f32 %r13, %r12, %r9; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmax_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: max.f32 %r8, %r4, %r5, %r6; +; CHECK-SM100-NEXT: max.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: max.f32 %r10, %r9, %r8, %r7; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmax(<7 x float> %in) ret float %res } -; Check straight line reduction. 
define half @reduce_fmin_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fmin_half( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b16 %rs<16>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_param_0]; -; CHECK-NEXT: min.f16x2 %r5, %r2, %r4; -; CHECK-NEXT: min.f16x2 %r6, %r1, %r3; -; CHECK-NEXT: min.f16x2 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: min.f16 %rs3, %rs1, %rs2; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; CHECK-NEXT: min.f16 %rs9, %rs7, %rs8; +; CHECK-NEXT: min.f16 %rs10, %rs9, %rs5; +; CHECK-NEXT: min.f16 %rs11, %rs10, %rs6; +; CHECK-NEXT: min.f16 %rs12, %rs11, %rs3; +; CHECK-NEXT: min.f16 %rs13, %rs12, %rs4; +; CHECK-NEXT: min.f16 %rs14, %rs13, %rs1; +; CHECK-NEXT: min.f16 %rs15, %rs14, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs15; ; CHECK-NEXT: ret; %res = call half @llvm.vector.reduce.fmin(<8 x half> %in) ret half %res @@ -487,17 +491,23 @@ define half @reduce_fmin_half(<8 x half> %in) { define half @reduce_fmin_half_reassoc(<8 x half> %in) { ; CHECK-LABEL: reduce_fmin_half_reassoc( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b16 %rs<16>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_reassoc_param_0]; -; CHECK-NEXT: min.f16x2 %r5, %r2, %r4; -; CHECK-NEXT: min.f16x2 %r6, %r1, %r3; -; CHECK-NEXT: min.f16x2 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: min.f16 %rs3, %rs1, %rs2; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-NEXT: mov.b32 
{%rs5, %rs6}, %r3; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; +; CHECK-NEXT: min.f16 %rs9, %rs7, %rs8; +; CHECK-NEXT: min.f16 %rs10, %rs5, %rs6; +; CHECK-NEXT: min.f16 %rs11, %rs10, %rs9; +; CHECK-NEXT: min.f16 %rs12, %rs3, %rs4; +; CHECK-NEXT: min.f16 %rs13, %rs1, %rs2; +; CHECK-NEXT: min.f16 %rs14, %rs13, %rs12; +; CHECK-NEXT: min.f16 %rs15, %rs14, %rs11; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs15; ; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmin(<8 x half> %in) ret half %res @@ -506,97 +516,132 @@ define half @reduce_fmin_half_reassoc(<8 x half> %in) { define half @reduce_fmin_half_reassoc_nonpow2(<7 x half> %in) { ; CHECK-LABEL: reduce_fmin_half_reassoc_nonpow2( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<12>; -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b16 %rs<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmin_half_reassoc_nonpow2_param_0+8]; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [reduce_fmin_half_reassoc_nonpow2_param_0]; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmin_half_reassoc_nonpow2_param_0+12]; -; CHECK-NEXT: min.f16x2 %r4, %r2, %r1; -; CHECK-NEXT: mov.b16 %rs8, 0x7E00; -; CHECK-NEXT: mov.b32 %r5, {%rs7, %rs8}; -; CHECK-NEXT: min.f16x2 %r6, %r3, %r5; -; CHECK-NEXT: min.f16x2 %r7, %r4, %r6; -; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7; -; CHECK-NEXT: min.f16 %rs11, %rs9, %rs10; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs11; +; CHECK-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [reduce_fmin_half_reassoc_nonpow2_param_0+8]; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_fmin_half_reassoc_nonpow2_param_0]; +; CHECK-NEXT: min.f16 %rs8, %rs5, %rs6; +; CHECK-NEXT: min.f16 %rs9, %rs8, %rs7; +; CHECK-NEXT: min.f16 %rs10, %rs3, %rs4; +; CHECK-NEXT: min.f16 %rs11, %rs1, %rs2; +; CHECK-NEXT: min.f16 %rs12, %rs11, %rs10; +; CHECK-NEXT: min.f16 %rs13, %rs12, 
%rs9; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs13; ; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmin(<7 x half> %in) ret half %res } -; Check straight-line reduction. define float @reduce_fmin_float(<8 x float> %in) { ; -; CHECK-LABEL: reduce_fmin_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0]; -; CHECK-NEXT: min.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.f32 %r10, %r2, %r6; -; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.f32 %r13, %r1, %r5; -; CHECK-NEXT: min.f32 %r14, %r13, %r12; -; CHECK-NEXT: min.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmin_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0]; +; CHECK-SM80-NEXT: min.f32 %r9, %r1, %r2; +; CHECK-SM80-NEXT: min.f32 %r10, %r9, %r3; +; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r4; +; CHECK-SM80-NEXT: min.f32 %r12, %r11, %r5; +; CHECK-SM80-NEXT: min.f32 %r13, %r12, %r6; +; CHECK-SM80-NEXT: min.f32 %r14, %r13, %r7; +; CHECK-SM80-NEXT: min.f32 %r15, %r14, %r8; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmin_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0]; +; CHECK-SM100-NEXT: min.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: 
min.f32 %r10, %r9, %r4, %r5; +; CHECK-SM100-NEXT: min.f32 %r11, %r10, %r6, %r7; +; CHECK-SM100-NEXT: min.f32 %r12, %r11, %r8; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fmin(<8 x float> %in) ret float %res } define float @reduce_fmin_float_reassoc(<8 x float> %in) { ; -; CHECK-LABEL: reduce_fmin_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0]; -; CHECK-NEXT: min.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.f32 %r10, %r2, %r6; -; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.f32 %r13, %r1, %r5; -; CHECK-NEXT: min.f32 %r14, %r13, %r12; -; CHECK-NEXT: min.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmin_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0]; +; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: min.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: min.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmin_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, 
[reduce_fmin_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0]; +; CHECK-SM100-NEXT: min.f32 %r9, %r4, %r5, %r6; +; CHECK-SM100-NEXT: min.f32 %r10, %r1, %r2, %r3; +; CHECK-SM100-NEXT: min.f32 %r11, %r10, %r9, %r7; +; CHECK-SM100-NEXT: min.f32 %r12, %r11, %r8; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmin(<8 x float> %in) ret float %res } define float @reduce_fmin_float_reassoc_nonpow2(<7 x float> %in) { ; -; CHECK-LABEL: reduce_fmin_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: min.f32 %r8, %r3, %r7; -; CHECK-NEXT: min.f32 %r9, %r1, %r5; -; CHECK-NEXT: min.f32 %r10, %r9, %r8; -; CHECK-NEXT: min.f32 %r11, %r2, %r6; -; CHECK-NEXT: min.f32 %r12, %r11, %r4; -; CHECK-NEXT: min.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmin_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: min.f32 %r8, %r5, %r6; +; CHECK-SM80-NEXT: min.f32 %r9, %r8, %r7; +; CHECK-SM80-NEXT: min.f32 %r10, %r3, %r4; +; CHECK-SM80-NEXT: min.f32 %r11, %r1, %r2; +; CHECK-SM80-NEXT: min.f32 %r12, %r11, %r10; +; CHECK-SM80-NEXT: min.f32 %r13, %r12, %r9; +; 
CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmin_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: min.f32 %r8, %r4, %r5, %r6; +; CHECK-SM100-NEXT: min.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: min.f32 %r10, %r9, %r8, %r7; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmin(<7 x float> %in) ret float %res } -; Check straight-line reduction. define half @reduce_fmaximum_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fmaximum_half( ; CHECK: { @@ -661,74 +706,114 @@ define half @reduce_fmaximum_half_reassoc_nonpow2(<7 x half> %in) { ret half %res } -; Check straight-line reduction. 
define float @reduce_fmaximum_float(<8 x float> %in) { ; -; CHECK-LABEL: reduce_fmaximum_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.NaN.f32 %r10, %r2, %r6; -; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r5; -; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; -; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmaximum_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0]; +; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: max.NaN.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.NaN.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmaximum_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0]; +; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r4, %r5, %r6; +; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r1, %r2, %r3; +; CHECK-SM100-NEXT: max.NaN.f32 %r11, %r10, %r9, %r7; +; 
CHECK-SM100-NEXT: max.NaN.f32 %r12, %r11, %r8; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fmaximum(<8 x float> %in) ret float %res } define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { ; -; CHECK-LABEL: reduce_fmaximum_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.NaN.f32 %r10, %r2, %r6; -; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r5; -; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; -; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0]; +; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: max.NaN.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.NaN.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmaximum_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, 
[reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0]; +; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r4, %r5, %r6; +; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r1, %r2, %r3; +; CHECK-SM100-NEXT: max.NaN.f32 %r11, %r10, %r9, %r7; +; CHECK-SM100-NEXT: max.NaN.f32 %r12, %r11, %r8; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmaximum(<8 x float> %in) ret float %res } define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) { ; -; CHECK-LABEL: reduce_fmaximum_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: max.NaN.f32 %r8, %r3, %r7; -; CHECK-NEXT: max.NaN.f32 %r9, %r1, %r5; -; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r8; -; CHECK-NEXT: max.NaN.f32 %r11, %r2, %r6; -; CHECK-NEXT: max.NaN.f32 %r12, %r11, %r4; -; CHECK-NEXT: max.NaN.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: max.NaN.f32 %r8, %r5, %r6; +; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r8, %r7; +; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r3, %r4; +; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r1, 
%r2; +; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r11, %r10; +; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r12, %r9; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmaximum_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: max.NaN.f32 %r8, %r4, %r5, %r6; +; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r9, %r8, %r7; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmaximum(<7 x float> %in) ret float %res } -; Check straight-line reduction. define half @reduce_fminimum_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fminimum_half( ; CHECK: { @@ -793,74 +878,116 @@ define half @reduce_fminimum_half_reassoc_nonpow2(<7 x half> %in) { ret half %res } -; Check straight-line reduction. 
define float @reduce_fminimum_float(<8 x float> %in) { ; -; CHECK-LABEL: reduce_fminimum_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.NaN.f32 %r10, %r2, %r6; -; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r5; -; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; -; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fminimum_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0]; +; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: min.NaN.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.NaN.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fminimum_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0]; +; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r4, %r5, %r6; +; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r1, %r2, %r3; +; CHECK-SM100-NEXT: min.NaN.f32 %r11, %r10, %r9, %r7; +; 
CHECK-SM100-NEXT: min.NaN.f32 %r12, %r11, %r8; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fminimum(<8 x float> %in) ret float %res } define float @reduce_fminimum_float_reassoc(<8 x float> %in) { ; -; CHECK-LABEL: reduce_fminimum_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.NaN.f32 %r10, %r2, %r6; -; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r5; -; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; -; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0]; +; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: min.NaN.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.NaN.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fminimum_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, 
[reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0]; +; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r4, %r5, %r6; +; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r1, %r2, %r3; +; CHECK-SM100-NEXT: min.NaN.f32 %r11, %r10, %r9, %r7; +; CHECK-SM100-NEXT: min.NaN.f32 %r12, %r11, %r8; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fminimum(<8 x float> %in) ret float %res } define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) { ; -; CHECK-LABEL: reduce_fminimum_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: min.NaN.f32 %r8, %r3, %r7; -; CHECK-NEXT: min.NaN.f32 %r9, %r1, %r5; -; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r8; -; CHECK-NEXT: min.NaN.f32 %r11, %r2, %r6; -; CHECK-NEXT: min.NaN.f32 %r12, %r11, %r4; -; CHECK-NEXT: min.NaN.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: min.NaN.f32 %r8, %r5, %r6; +; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r8, %r7; +; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r3, %r4; +; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r1, 
%r2; +; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r11, %r10; +; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r12, %r9; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fminimum_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: min.NaN.f32 %r8, %r4, %r5, %r6; +; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r9, %r8, %r7; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fminimum(<7 x float> %in) ret float %res } define i16 @reduce_add_i16(<8 x i16> %in) { +; ; CHECK-SM80-LABEL: reduce_add_i16( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b16 %rs<16>; @@ -870,15 +997,15 @@ define i16 @reduce_add_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-SM80-NEXT: add.s16 %rs5, %rs3, %rs1; +; CHECK-SM80-NEXT: add.s16 %rs5, %rs4, %rs2; ; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-SM80-NEXT: add.s16 %rs10, %rs8, %rs6; -; CHECK-SM80-NEXT: add.s16 %rs11, %rs4, %rs2; -; CHECK-SM80-NEXT: add.s16 %rs12, %rs9, %rs7; -; CHECK-SM80-NEXT: add.s16 %rs13, %rs12, %rs11; -; CHECK-SM80-NEXT: add.s16 %rs14, %rs10, %rs5; -; CHECK-SM80-NEXT: add.s16 %rs15, %rs14, %rs13; +; CHECK-SM80-NEXT: add.s16 %rs10, %rs9, %rs7; +; CHECK-SM80-NEXT: add.s16 %rs11, %rs10, %rs5; +; CHECK-SM80-NEXT: add.s16 %rs12, %rs3, %rs1; +; CHECK-SM80-NEXT: 
add.s16 %rs13, %rs8, %rs6; +; CHECK-SM80-NEXT: add.s16 %rs14, %rs13, %rs12; +; CHECK-SM80-NEXT: add.s16 %rs15, %rs14, %rs11; ; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-SM80-NEXT: ret; @@ -886,26 +1013,24 @@ define i16 @reduce_add_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_add_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; ; CHECK-SM100-NEXT: add.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: add.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: add.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: add.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.add(<8 x i16> %in) ret i16 %res } define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) { +; ; CHECK-SM80-LABEL: reduce_add_i16_nonpow2( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b16 %rs<14>; @@ -959,13 +1084,13 @@ define i32 @reduce_add_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0]; -; CHECK-NEXT: add.s32 %r9, %r3, %r7; -; CHECK-NEXT: add.s32 %r10, %r1, %r5; -; CHECK-NEXT: add.s32 %r11, %r4, %r8; -; CHECK-NEXT: add.s32 %r12, %r2, %r6; -; CHECK-NEXT: add.s32 %r13, %r12, %r11; -; CHECK-NEXT: add.s32 %r14, 
%r10, %r9; -; CHECK-NEXT: add.s32 %r15, %r14, %r13; +; CHECK-NEXT: add.s32 %r9, %r7, %r8; +; CHECK-NEXT: add.s32 %r10, %r5, %r6; +; CHECK-NEXT: add.s32 %r11, %r10, %r9; +; CHECK-NEXT: add.s32 %r12, %r3, %r4; +; CHECK-NEXT: add.s32 %r13, %r1, %r2; +; CHECK-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-NEXT: add.s32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.add(<8 x i32> %in) @@ -981,12 +1106,12 @@ define i32 @reduce_add_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: ld.param.b32 %r7, [reduce_add_i32_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_add_i32_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_nonpow2_param_0]; -; CHECK-NEXT: add.s32 %r8, %r3, %r7; -; CHECK-NEXT: add.s32 %r9, %r1, %r5; -; CHECK-NEXT: add.s32 %r10, %r9, %r8; -; CHECK-NEXT: add.s32 %r11, %r2, %r6; -; CHECK-NEXT: add.s32 %r12, %r11, %r4; -; CHECK-NEXT: add.s32 %r13, %r10, %r12; +; CHECK-NEXT: add.s32 %r8, %r5, %r6; +; CHECK-NEXT: add.s32 %r9, %r8, %r7; +; CHECK-NEXT: add.s32 %r10, %r3, %r4; +; CHECK-NEXT: add.s32 %r11, %r1, %r2; +; CHECK-NEXT: add.s32 %r12, %r11, %r10; +; CHECK-NEXT: add.s32 %r13, %r12, %r9; ; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.add(<7 x i32> %in) @@ -1001,17 +1126,17 @@ define i16 @reduce_mul_i16(<8 x i16> %in) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i16_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-NEXT: mul.lo.s16 %rs5, %rs3, %rs1; -; CHECK-NEXT: mov.b32 {%rs6, %rs7}, %r3; -; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-NEXT: mul.lo.s16 %rs10, %rs8, %rs6; -; CHECK-NEXT: mul.lo.s16 %rs11, %rs4, %rs2; -; CHECK-NEXT: mul.lo.s16 %rs12, %rs9, %rs7; -; CHECK-NEXT: mul.lo.s16 %rs13, %rs12, %rs11; -; CHECK-NEXT: mul.lo.s16 
%rs14, %rs10, %rs5; -; CHECK-NEXT: mul.lo.s16 %rs15, %rs14, %rs13; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; +; CHECK-NEXT: mul.lo.s16 %rs9, %rs7, %rs8; +; CHECK-NEXT: mul.lo.s16 %rs10, %rs5, %rs6; +; CHECK-NEXT: mul.lo.s16 %rs11, %rs10, %rs9; +; CHECK-NEXT: mul.lo.s16 %rs12, %rs3, %rs4; +; CHECK-NEXT: mul.lo.s16 %rs13, %rs1, %rs2; +; CHECK-NEXT: mul.lo.s16 %rs14, %rs13, %rs12; +; CHECK-NEXT: mul.lo.s16 %rs15, %rs14, %rs11; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; @@ -1029,12 +1154,12 @@ define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) { ; CHECK-NEXT: ld.param.b16 %rs7, [reduce_mul_i16_nonpow2_param_0+12]; ; CHECK-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [reduce_mul_i16_nonpow2_param_0+8]; ; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_mul_i16_nonpow2_param_0]; -; CHECK-NEXT: mul.lo.s16 %rs8, %rs3, %rs7; -; CHECK-NEXT: mul.lo.s16 %rs9, %rs1, %rs5; -; CHECK-NEXT: mul.lo.s16 %rs10, %rs9, %rs8; -; CHECK-NEXT: mul.lo.s16 %rs11, %rs2, %rs6; -; CHECK-NEXT: mul.lo.s16 %rs12, %rs4, %rs11; -; CHECK-NEXT: mul.lo.s16 %rs13, %rs10, %rs12; +; CHECK-NEXT: mul.lo.s16 %rs8, %rs5, %rs6; +; CHECK-NEXT: mul.lo.s16 %rs9, %rs7, %rs8; +; CHECK-NEXT: mul.lo.s16 %rs10, %rs3, %rs4; +; CHECK-NEXT: mul.lo.s16 %rs11, %rs1, %rs2; +; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; +; CHECK-NEXT: mul.lo.s16 %rs13, %rs12, %rs9; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs13; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -1050,13 +1175,13 @@ define i32 @reduce_mul_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0]; -; CHECK-NEXT: mul.lo.s32 %r9, %r3, %r7; -; CHECK-NEXT: mul.lo.s32 %r10, %r1, %r5; -; CHECK-NEXT: mul.lo.s32 %r11, %r4, %r8; -; CHECK-NEXT: mul.lo.s32 %r12, %r2, %r6; -; CHECK-NEXT: mul.lo.s32 %r13, %r12, %r11; 
-; CHECK-NEXT: mul.lo.s32 %r14, %r10, %r9; -; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r13; +; CHECK-NEXT: mul.lo.s32 %r9, %r7, %r8; +; CHECK-NEXT: mul.lo.s32 %r10, %r5, %r6; +; CHECK-NEXT: mul.lo.s32 %r11, %r10, %r9; +; CHECK-NEXT: mul.lo.s32 %r12, %r3, %r4; +; CHECK-NEXT: mul.lo.s32 %r13, %r1, %r2; +; CHECK-NEXT: mul.lo.s32 %r14, %r13, %r12; +; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.mul(<8 x i32> %in) @@ -1072,12 +1197,12 @@ define i32 @reduce_mul_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: ld.param.b32 %r7, [reduce_mul_i32_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_mul_i32_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_nonpow2_param_0]; -; CHECK-NEXT: mul.lo.s32 %r8, %r3, %r7; -; CHECK-NEXT: mul.lo.s32 %r9, %r1, %r5; -; CHECK-NEXT: mul.lo.s32 %r10, %r9, %r8; -; CHECK-NEXT: mul.lo.s32 %r11, %r2, %r6; -; CHECK-NEXT: mul.lo.s32 %r12, %r4, %r11; -; CHECK-NEXT: mul.lo.s32 %r13, %r10, %r12; +; CHECK-NEXT: mul.lo.s32 %r8, %r5, %r6; +; CHECK-NEXT: mul.lo.s32 %r9, %r8, %r7; +; CHECK-NEXT: mul.lo.s32 %r10, %r3, %r4; +; CHECK-NEXT: mul.lo.s32 %r11, %r1, %r2; +; CHECK-NEXT: mul.lo.s32 %r12, %r11, %r10; +; CHECK-NEXT: mul.lo.s32 %r13, %r12, %r9; ; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.mul(<7 x i32> %in) @@ -1085,6 +1210,7 @@ define i32 @reduce_mul_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_umax_i16(<8 x i16> %in) { +; ; CHECK-SM80-LABEL: reduce_umax_i16( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b16 %rs<16>; @@ -1094,15 +1220,15 @@ define i16 @reduce_umax_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-SM80-NEXT: max.u16 %rs5, %rs3, %rs1; +; CHECK-SM80-NEXT: max.u16 %rs5, %rs4, %rs2; 
; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-SM80-NEXT: max.u16 %rs10, %rs8, %rs6; -; CHECK-SM80-NEXT: max.u16 %rs11, %rs4, %rs2; -; CHECK-SM80-NEXT: max.u16 %rs12, %rs9, %rs7; -; CHECK-SM80-NEXT: max.u16 %rs13, %rs12, %rs11; -; CHECK-SM80-NEXT: max.u16 %rs14, %rs10, %rs5; -; CHECK-SM80-NEXT: max.u16 %rs15, %rs14, %rs13; +; CHECK-SM80-NEXT: max.u16 %rs10, %rs9, %rs7; +; CHECK-SM80-NEXT: max.u16 %rs11, %rs10, %rs5; +; CHECK-SM80-NEXT: max.u16 %rs12, %rs3, %rs1; +; CHECK-SM80-NEXT: max.u16 %rs13, %rs8, %rs6; +; CHECK-SM80-NEXT: max.u16 %rs14, %rs13, %rs12; +; CHECK-SM80-NEXT: max.u16 %rs15, %rs14, %rs11; ; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-SM80-NEXT: ret; @@ -1110,26 +1236,24 @@ define i16 @reduce_umax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; ; CHECK-SM100-NEXT: max.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: max.u16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: max.u16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umax(<8 x i16> %in) ret i16 %res } define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) { +; ; CHECK-SM80-LABEL: 
reduce_umax_i16_nonpow2( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b16 %rs<14>; @@ -1183,13 +1307,13 @@ define i32 @reduce_umax_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0]; -; CHECK-NEXT: max.u32 %r9, %r3, %r7; -; CHECK-NEXT: max.u32 %r10, %r1, %r5; -; CHECK-NEXT: max.u32 %r11, %r4, %r8; -; CHECK-NEXT: max.u32 %r12, %r2, %r6; -; CHECK-NEXT: max.u32 %r13, %r12, %r11; -; CHECK-NEXT: max.u32 %r14, %r10, %r9; -; CHECK-NEXT: max.u32 %r15, %r14, %r13; +; CHECK-NEXT: max.u32 %r9, %r7, %r8; +; CHECK-NEXT: max.u32 %r10, %r5, %r6; +; CHECK-NEXT: max.u32 %r11, %r10, %r9; +; CHECK-NEXT: max.u32 %r12, %r3, %r4; +; CHECK-NEXT: max.u32 %r13, %r1, %r2; +; CHECK-NEXT: max.u32 %r14, %r13, %r12; +; CHECK-NEXT: max.u32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.umax(<8 x i32> %in) @@ -1205,12 +1329,12 @@ define i32 @reduce_umax_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: ld.param.b32 %r7, [reduce_umax_i32_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_umax_i32_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_nonpow2_param_0]; -; CHECK-NEXT: max.u32 %r8, %r3, %r7; -; CHECK-NEXT: max.u32 %r9, %r1, %r5; -; CHECK-NEXT: max.u32 %r10, %r9, %r8; -; CHECK-NEXT: max.u32 %r11, %r2, %r6; -; CHECK-NEXT: max.u32 %r12, %r4, %r11; -; CHECK-NEXT: max.u32 %r13, %r10, %r12; +; CHECK-NEXT: max.u32 %r8, %r5, %r6; +; CHECK-NEXT: max.u32 %r9, %r8, %r7; +; CHECK-NEXT: max.u32 %r10, %r3, %r4; +; CHECK-NEXT: max.u32 %r11, %r1, %r2; +; CHECK-NEXT: max.u32 %r12, %r11, %r10; +; CHECK-NEXT: max.u32 %r13, %r12, %r9; ; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.umax(<7 x i32> %in) @@ -1218,6 +1342,7 @@ define i32 @reduce_umax_i32_nonpow2(<7 x i32> %in) { } 
define i16 @reduce_umin_i16(<8 x i16> %in) { +; ; CHECK-SM80-LABEL: reduce_umin_i16( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b16 %rs<16>; @@ -1227,15 +1352,15 @@ define i16 @reduce_umin_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-SM80-NEXT: min.u16 %rs5, %rs3, %rs1; +; CHECK-SM80-NEXT: min.u16 %rs5, %rs4, %rs2; ; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-SM80-NEXT: min.u16 %rs10, %rs8, %rs6; -; CHECK-SM80-NEXT: min.u16 %rs11, %rs4, %rs2; -; CHECK-SM80-NEXT: min.u16 %rs12, %rs9, %rs7; -; CHECK-SM80-NEXT: min.u16 %rs13, %rs12, %rs11; -; CHECK-SM80-NEXT: min.u16 %rs14, %rs10, %rs5; -; CHECK-SM80-NEXT: min.u16 %rs15, %rs14, %rs13; +; CHECK-SM80-NEXT: min.u16 %rs10, %rs9, %rs7; +; CHECK-SM80-NEXT: min.u16 %rs11, %rs10, %rs5; +; CHECK-SM80-NEXT: min.u16 %rs12, %rs3, %rs1; +; CHECK-SM80-NEXT: min.u16 %rs13, %rs8, %rs6; +; CHECK-SM80-NEXT: min.u16 %rs14, %rs13, %rs12; +; CHECK-SM80-NEXT: min.u16 %rs15, %rs14, %rs11; ; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-SM80-NEXT: ret; @@ -1243,26 +1368,24 @@ define i16 @reduce_umin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; ; CHECK-SM100-NEXT: min.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: min.u16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; 
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: min.u16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umin(<8 x i16> %in) ret i16 %res } define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) { +; ; CHECK-SM80-LABEL: reduce_umin_i16_nonpow2( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b16 %rs<14>; @@ -1316,13 +1439,13 @@ define i32 @reduce_umin_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0]; -; CHECK-NEXT: min.u32 %r9, %r3, %r7; -; CHECK-NEXT: min.u32 %r10, %r1, %r5; -; CHECK-NEXT: min.u32 %r11, %r4, %r8; -; CHECK-NEXT: min.u32 %r12, %r2, %r6; -; CHECK-NEXT: min.u32 %r13, %r12, %r11; -; CHECK-NEXT: min.u32 %r14, %r10, %r9; -; CHECK-NEXT: min.u32 %r15, %r14, %r13; +; CHECK-NEXT: min.u32 %r9, %r7, %r8; +; CHECK-NEXT: min.u32 %r10, %r5, %r6; +; CHECK-NEXT: min.u32 %r11, %r10, %r9; +; CHECK-NEXT: min.u32 %r12, %r3, %r4; +; CHECK-NEXT: min.u32 %r13, %r1, %r2; +; CHECK-NEXT: min.u32 %r14, %r13, %r12; +; CHECK-NEXT: min.u32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.umin(<8 x i32> %in) @@ -1338,12 +1461,12 @@ define i32 @reduce_umin_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: ld.param.b32 %r7, [reduce_umin_i32_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_umin_i32_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_nonpow2_param_0]; -; CHECK-NEXT: min.u32 %r8, %r3, %r7; -; CHECK-NEXT: min.u32 %r9, %r1, %r5; -; CHECK-NEXT: min.u32 %r10, %r9, %r8; -; CHECK-NEXT: min.u32 %r11, %r2, %r6; -; CHECK-NEXT: min.u32 %r12, %r4, %r11; -; 
CHECK-NEXT: min.u32 %r13, %r10, %r12; +; CHECK-NEXT: min.u32 %r8, %r5, %r6; +; CHECK-NEXT: min.u32 %r9, %r8, %r7; +; CHECK-NEXT: min.u32 %r10, %r3, %r4; +; CHECK-NEXT: min.u32 %r11, %r1, %r2; +; CHECK-NEXT: min.u32 %r12, %r11, %r10; +; CHECK-NEXT: min.u32 %r13, %r12, %r9; ; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.umin(<7 x i32> %in) @@ -1351,6 +1474,7 @@ define i32 @reduce_umin_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_smax_i16(<8 x i16> %in) { +; ; CHECK-SM80-LABEL: reduce_smax_i16( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b16 %rs<16>; @@ -1360,15 +1484,15 @@ define i16 @reduce_smax_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-SM80-NEXT: max.s16 %rs5, %rs3, %rs1; +; CHECK-SM80-NEXT: max.s16 %rs5, %rs4, %rs2; ; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-SM80-NEXT: max.s16 %rs10, %rs8, %rs6; -; CHECK-SM80-NEXT: max.s16 %rs11, %rs4, %rs2; -; CHECK-SM80-NEXT: max.s16 %rs12, %rs9, %rs7; -; CHECK-SM80-NEXT: max.s16 %rs13, %rs12, %rs11; -; CHECK-SM80-NEXT: max.s16 %rs14, %rs10, %rs5; -; CHECK-SM80-NEXT: max.s16 %rs15, %rs14, %rs13; +; CHECK-SM80-NEXT: max.s16 %rs10, %rs9, %rs7; +; CHECK-SM80-NEXT: max.s16 %rs11, %rs10, %rs5; +; CHECK-SM80-NEXT: max.s16 %rs12, %rs3, %rs1; +; CHECK-SM80-NEXT: max.s16 %rs13, %rs8, %rs6; +; CHECK-SM80-NEXT: max.s16 %rs14, %rs13, %rs12; +; CHECK-SM80-NEXT: max.s16 %rs15, %rs14, %rs11; ; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-SM80-NEXT: ret; @@ -1376,26 +1500,24 @@ define i16 @reduce_smax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; 
CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; ; CHECK-SM100-NEXT: max.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: max.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: max.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smax(<8 x i16> %in) ret i16 %res } define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) { +; ; CHECK-SM80-LABEL: reduce_smax_i16_nonpow2( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b16 %rs<14>; @@ -1449,13 +1571,13 @@ define i32 @reduce_smax_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0]; -; CHECK-NEXT: max.s32 %r9, %r3, %r7; -; CHECK-NEXT: max.s32 %r10, %r1, %r5; -; CHECK-NEXT: max.s32 %r11, %r4, %r8; -; CHECK-NEXT: max.s32 %r12, %r2, %r6; -; CHECK-NEXT: max.s32 %r13, %r12, %r11; -; CHECK-NEXT: max.s32 %r14, %r10, %r9; -; CHECK-NEXT: max.s32 %r15, %r14, %r13; +; CHECK-NEXT: max.s32 %r9, %r7, %r8; +; CHECK-NEXT: max.s32 %r10, %r5, %r6; +; CHECK-NEXT: max.s32 %r11, %r10, %r9; +; CHECK-NEXT: max.s32 %r12, %r3, %r4; +; CHECK-NEXT: max.s32 %r13, %r1, %r2; +; CHECK-NEXT: max.s32 %r14, %r13, %r12; +; CHECK-NEXT: max.s32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.smax(<8 x i32> %in) @@ -1471,12 +1593,12 @@ define i32 
@reduce_smax_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: ld.param.b32 %r7, [reduce_smax_i32_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_smax_i32_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_nonpow2_param_0]; -; CHECK-NEXT: max.s32 %r8, %r3, %r7; -; CHECK-NEXT: max.s32 %r9, %r1, %r5; -; CHECK-NEXT: max.s32 %r10, %r9, %r8; -; CHECK-NEXT: max.s32 %r11, %r2, %r6; -; CHECK-NEXT: max.s32 %r12, %r4, %r11; -; CHECK-NEXT: max.s32 %r13, %r10, %r12; +; CHECK-NEXT: max.s32 %r8, %r5, %r6; +; CHECK-NEXT: max.s32 %r9, %r8, %r7; +; CHECK-NEXT: max.s32 %r10, %r3, %r4; +; CHECK-NEXT: max.s32 %r11, %r1, %r2; +; CHECK-NEXT: max.s32 %r12, %r11, %r10; +; CHECK-NEXT: max.s32 %r13, %r12, %r9; ; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.smax(<7 x i32> %in) @@ -1484,6 +1606,7 @@ define i32 @reduce_smax_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_smin_i16(<8 x i16> %in) { +; ; CHECK-SM80-LABEL: reduce_smin_i16( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b16 %rs<16>; @@ -1493,15 +1616,15 @@ define i16 @reduce_smin_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-SM80-NEXT: min.s16 %rs5, %rs3, %rs1; +; CHECK-SM80-NEXT: min.s16 %rs5, %rs4, %rs2; ; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-SM80-NEXT: min.s16 %rs10, %rs8, %rs6; -; CHECK-SM80-NEXT: min.s16 %rs11, %rs4, %rs2; -; CHECK-SM80-NEXT: min.s16 %rs12, %rs9, %rs7; -; CHECK-SM80-NEXT: min.s16 %rs13, %rs12, %rs11; -; CHECK-SM80-NEXT: min.s16 %rs14, %rs10, %rs5; -; CHECK-SM80-NEXT: min.s16 %rs15, %rs14, %rs13; +; CHECK-SM80-NEXT: min.s16 %rs10, %rs9, %rs7; +; CHECK-SM80-NEXT: min.s16 %rs11, %rs10, %rs5; +; CHECK-SM80-NEXT: min.s16 %rs12, %rs3, %rs1; +; CHECK-SM80-NEXT: min.s16 %rs13, %rs8, %rs6; +; 
CHECK-SM80-NEXT: min.s16 %rs14, %rs13, %rs12; +; CHECK-SM80-NEXT: min.s16 %rs15, %rs14, %rs11; ; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-SM80-NEXT: ret; @@ -1509,26 +1632,24 @@ define i16 @reduce_smin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; ; CHECK-SM100-NEXT: min.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: min.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: min.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in) ret i16 %res } define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) { +; ; CHECK-SM80-LABEL: reduce_smin_i16_nonpow2( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b16 %rs<14>; @@ -1582,13 +1703,13 @@ define i32 @reduce_smin_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0]; -; CHECK-NEXT: min.s32 %r9, %r3, %r7; -; CHECK-NEXT: min.s32 %r10, %r1, %r5; -; CHECK-NEXT: min.s32 %r11, %r4, %r8; -; CHECK-NEXT: min.s32 %r12, %r2, %r6; -; CHECK-NEXT: min.s32 %r13, %r12, %r11; -; CHECK-NEXT: min.s32 %r14, %r10, %r9; -; 
CHECK-NEXT: min.s32 %r15, %r14, %r13; +; CHECK-NEXT: min.s32 %r9, %r7, %r8; +; CHECK-NEXT: min.s32 %r10, %r5, %r6; +; CHECK-NEXT: min.s32 %r11, %r10, %r9; +; CHECK-NEXT: min.s32 %r12, %r3, %r4; +; CHECK-NEXT: min.s32 %r13, %r1, %r2; +; CHECK-NEXT: min.s32 %r14, %r13, %r12; +; CHECK-NEXT: min.s32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.smin(<8 x i32> %in) @@ -1604,12 +1725,12 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: ld.param.b32 %r7, [reduce_smin_i32_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_smin_i32_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_nonpow2_param_0]; -; CHECK-NEXT: min.s32 %r8, %r3, %r7; -; CHECK-NEXT: min.s32 %r9, %r1, %r5; -; CHECK-NEXT: min.s32 %r10, %r9, %r8; -; CHECK-NEXT: min.s32 %r11, %r2, %r6; -; CHECK-NEXT: min.s32 %r12, %r4, %r11; -; CHECK-NEXT: min.s32 %r13, %r10, %r12; +; CHECK-NEXT: min.s32 %r8, %r5, %r6; +; CHECK-NEXT: min.s32 %r9, %r8, %r7; +; CHECK-NEXT: min.s32 %r10, %r3, %r4; +; CHECK-NEXT: min.s32 %r11, %r1, %r2; +; CHECK-NEXT: min.s32 %r12, %r11, %r10; +; CHECK-NEXT: min.s32 %r13, %r12, %r9; ; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.smin(<7 x i32> %in) @@ -1617,43 +1738,21 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_and_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_and_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; -; CHECK-SM80-NEXT: and.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: and.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 
%r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: and.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_and_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; -; CHECK-SM100-NEXT: and.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: and.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: and.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_and_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; +; CHECK-NEXT: and.b32 %r5, %r2, %r4; +; CHECK-NEXT: and.b32 %r6, %r1, %r3; +; CHECK-NEXT: and.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.and(<8 x i16> %in) ret i16 %res } @@ -1693,13 +1792,13 @@ define i32 @reduce_and_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0]; -; CHECK-NEXT: and.b32 %r9, %r3, %r7; -; CHECK-NEXT: and.b32 %r10, %r1, %r5; -; CHECK-NEXT: and.b32 %r11, %r4, %r8; -; CHECK-NEXT: 
and.b32 %r12, %r2, %r6; -; CHECK-NEXT: and.b32 %r13, %r12, %r11; -; CHECK-NEXT: and.b32 %r14, %r10, %r9; -; CHECK-NEXT: and.b32 %r15, %r14, %r13; +; CHECK-NEXT: and.b32 %r9, %r7, %r8; +; CHECK-NEXT: and.b32 %r10, %r5, %r6; +; CHECK-NEXT: and.b32 %r11, %r10, %r9; +; CHECK-NEXT: and.b32 %r12, %r3, %r4; +; CHECK-NEXT: and.b32 %r13, %r1, %r2; +; CHECK-NEXT: and.b32 %r14, %r13, %r12; +; CHECK-NEXT: and.b32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.and(<8 x i32> %in) @@ -1715,12 +1814,12 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: ld.param.b32 %r7, [reduce_and_i32_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_and_i32_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_nonpow2_param_0]; -; CHECK-NEXT: and.b32 %r8, %r3, %r7; -; CHECK-NEXT: and.b32 %r9, %r1, %r5; -; CHECK-NEXT: and.b32 %r10, %r9, %r8; -; CHECK-NEXT: and.b32 %r11, %r2, %r6; -; CHECK-NEXT: and.b32 %r12, %r11, %r4; -; CHECK-NEXT: and.b32 %r13, %r10, %r12; +; CHECK-NEXT: and.b32 %r8, %r5, %r6; +; CHECK-NEXT: and.b32 %r9, %r8, %r7; +; CHECK-NEXT: and.b32 %r10, %r3, %r4; +; CHECK-NEXT: and.b32 %r11, %r1, %r2; +; CHECK-NEXT: and.b32 %r12, %r11, %r10; +; CHECK-NEXT: and.b32 %r13, %r12, %r9; ; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.and(<7 x i32> %in) @@ -1728,43 +1827,21 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_or_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_or_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; -; CHECK-SM80-NEXT: or.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: or.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; 
mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: or.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_or_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; -; CHECK-SM100-NEXT: or.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: or.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: or.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_or_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; +; CHECK-NEXT: or.b32 %r5, %r2, %r4; +; CHECK-NEXT: or.b32 %r6, %r1, %r3; +; CHECK-NEXT: or.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.or(<8 x i16> %in) ret i16 %res } @@ -1804,13 +1881,13 @@ define i32 @reduce_or_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0]; -; CHECK-NEXT: or.b32 %r9, %r3, %r7; -; CHECK-NEXT: 
or.b32 %r10, %r1, %r5; -; CHECK-NEXT: or.b32 %r11, %r4, %r8; -; CHECK-NEXT: or.b32 %r12, %r2, %r6; -; CHECK-NEXT: or.b32 %r13, %r12, %r11; -; CHECK-NEXT: or.b32 %r14, %r10, %r9; -; CHECK-NEXT: or.b32 %r15, %r14, %r13; +; CHECK-NEXT: or.b32 %r9, %r7, %r8; +; CHECK-NEXT: or.b32 %r10, %r5, %r6; +; CHECK-NEXT: or.b32 %r11, %r10, %r9; +; CHECK-NEXT: or.b32 %r12, %r3, %r4; +; CHECK-NEXT: or.b32 %r13, %r1, %r2; +; CHECK-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-NEXT: or.b32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.or(<8 x i32> %in) @@ -1826,12 +1903,12 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: ld.param.b32 %r7, [reduce_or_i32_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_or_i32_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_nonpow2_param_0]; -; CHECK-NEXT: or.b32 %r8, %r3, %r7; -; CHECK-NEXT: or.b32 %r9, %r1, %r5; -; CHECK-NEXT: or.b32 %r10, %r9, %r8; -; CHECK-NEXT: or.b32 %r11, %r2, %r6; -; CHECK-NEXT: or.b32 %r12, %r11, %r4; -; CHECK-NEXT: or.b32 %r13, %r10, %r12; +; CHECK-NEXT: or.b32 %r8, %r5, %r6; +; CHECK-NEXT: or.b32 %r9, %r8, %r7; +; CHECK-NEXT: or.b32 %r10, %r3, %r4; +; CHECK-NEXT: or.b32 %r11, %r1, %r2; +; CHECK-NEXT: or.b32 %r12, %r11, %r10; +; CHECK-NEXT: or.b32 %r13, %r12, %r9; ; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.or(<7 x i32> %in) @@ -1839,43 +1916,21 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_xor_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_xor_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; -; CHECK-SM80-NEXT: xor.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: xor.b32 
%r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: xor.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_xor_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; -; CHECK-SM100-NEXT: xor.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: xor.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: xor.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_xor_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; +; CHECK-NEXT: xor.b32 %r5, %r2, %r4; +; CHECK-NEXT: xor.b32 %r6, %r1, %r3; +; CHECK-NEXT: xor.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.xor(<8 x i16> %in) ret i16 %res } @@ -1915,13 +1970,13 @@ define i32 @reduce_xor_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, 
[reduce_xor_i32_param_0]; -; CHECK-NEXT: xor.b32 %r9, %r3, %r7; -; CHECK-NEXT: xor.b32 %r10, %r1, %r5; -; CHECK-NEXT: xor.b32 %r11, %r4, %r8; -; CHECK-NEXT: xor.b32 %r12, %r2, %r6; -; CHECK-NEXT: xor.b32 %r13, %r12, %r11; -; CHECK-NEXT: xor.b32 %r14, %r10, %r9; -; CHECK-NEXT: xor.b32 %r15, %r14, %r13; +; CHECK-NEXT: xor.b32 %r9, %r7, %r8; +; CHECK-NEXT: xor.b32 %r10, %r5, %r6; +; CHECK-NEXT: xor.b32 %r11, %r10, %r9; +; CHECK-NEXT: xor.b32 %r12, %r3, %r4; +; CHECK-NEXT: xor.b32 %r13, %r1, %r2; +; CHECK-NEXT: xor.b32 %r14, %r13, %r12; +; CHECK-NEXT: xor.b32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.xor(<8 x i32> %in) @@ -1937,12 +1992,12 @@ define i32 @reduce_xor_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: ld.param.b32 %r7, [reduce_xor_i32_nonpow2_param_0+24]; ; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_xor_i32_nonpow2_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_nonpow2_param_0]; -; CHECK-NEXT: xor.b32 %r8, %r3, %r7; -; CHECK-NEXT: xor.b32 %r9, %r1, %r5; -; CHECK-NEXT: xor.b32 %r10, %r9, %r8; -; CHECK-NEXT: xor.b32 %r11, %r2, %r6; -; CHECK-NEXT: xor.b32 %r12, %r11, %r4; -; CHECK-NEXT: xor.b32 %r13, %r10, %r12; +; CHECK-NEXT: xor.b32 %r8, %r5, %r6; +; CHECK-NEXT: xor.b32 %r9, %r8, %r7; +; CHECK-NEXT: xor.b32 %r10, %r3, %r4; +; CHECK-NEXT: xor.b32 %r11, %r1, %r2; +; CHECK-NEXT: xor.b32 %r12, %r11, %r10; +; CHECK-NEXT: xor.b32 %r13, %r12, %r9; ; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.xor(<7 x i32> %in)