Commit 264d113

AMDGPU/GISel: Introduce custom legalization of G_MUL
The generic legalizer framework is still used to reduce the problem to
scalar multiplication with the bit size a multiple of 32.

Generating optimal code sequences for big integer multiplication is
somewhat tricky and has a number of target-specific intricacies:

- The target has V_MAD_U64_U32 instructions that multiply two 32-bit
  factors and add a 64-bit accumulator. Most partial products should use
  this instruction.
- The accumulator is mapped to consecutive 32-bit GPRs, and partial-
  product multiply-adds can feed the accumulator into each other
  directly. (The register allocator's support for that is somewhat
  limited, but that only matters for 128-bit integers and larger.)
- On the other hand, on some hardware, V_MAD_U64_U32 requires the
  accumulator to be stored in an even-aligned pair of GPRs. To avoid
  excessive register copies, it makes sense to compute odd partial
  products separately from even partial products (where a partial
  product src0[j0] * src1[j1] is "odd" if j0 + j1 is odd) and add both
  halves together as a final step.
- We can combine G_MUL + G_ADD into a single cascade of multiply-adds.
- The target can keep many carry bits in flight simultaneously, so
  combining carries using G_UADDE is preferable to G_ZEXT + G_ADD.
- Not addressed by this patch: when the factors are sign-extended, the
  V_MAD_I64_I32 instruction (the signed version) can be used.

It is difficult to address these points generically:

1. Finding matching pairs of G_MUL and G_UMULH to recover a wide
   multiply is expensive. We could add a G_UMUL_LOHI generic instruction
   and conditionally use it in the generic legalizer, but by itself this
   wouldn't allow us to use the accumulation capability of
   V_MAD_U64_U32. One could attempt to find matching G_ADD + G_UADDE
   post-legalization, but this is also expensive.
2. Similarly, making sense of the legalization outcome of a wide
   pre-legalization G_MUL + G_ADD pair is extremely expensive.
3. The generic legalizer has no good way to deal with the particular
   idiosyncrasy of "odd" vs. "even" partial products.

All this points in the direction of directly emitting an ideal code
sequence during legalization, but the generic legalizer should not be
burdened with such overly target-specific concerns. Hence, a custom
legalization.

Note that the implemented approach is different from that used by
SelectionDAG because narrowing of scalars works differently in general.
SelectionDAG iteratively cuts wide scalars into low and high halves
until a legal size is reached. By contrast, GlobalISel does the
narrowing in a single shot, which should be better for compile time and
for the quality of the generated code.

This patch leaves three gaps open:

1. When the factors are uniform, we should execute the multiplication on
   the SALU. Register bank mapping already ensures this. However, the
   resulting code sequence is not optimal because it doesn't fully use
   the carry-in capabilities of S_ADDC_U32. (V_MAD_U64_U32 doesn't have
   a carry-in.) It is very difficult to fix this after the fact, so we
   should really use a different legalization sequence in this case.
   Unfortunately, we don't have a divergence analysis and so cannot make
   that choice. (This only matters for 128-bit integers and larger.)
2. Avoid unnecessary multiplies when sources are known to be zero- or
   sign-extended. The challenge is that the legalizer does not currently
   have access to GISelKnownBits.
3. When the G_MUL is followed by a G_ADD, we should consider combining
   the two instructions into a single multiply-add sequence, to fully
   utilize the accumulator of V_MAD_U64_U32 (unless the multiply has
   multiple uses and the implied duplication of the multiply is an
   overall negative). However, this is not advisable when the factors
   are uniform: in that case, it is generally better to *not* combine
   the two operations, so that the multiply can be done on the SALU.
   Again, we don't have a divergence analysis available and so cannot
   make an informed choice.

Differential Revision: https://reviews.llvm.org/D124844
1 parent 1a02db9 commit 264d113

13 files changed: +13173 / -14274 lines

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 328 additions & 5 deletions
```diff
@@ -530,13 +530,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,

   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
     // Full set of gfx9 features.
-    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+    getActionDefinitionsBuilder({G_ADD, G_SUB})
       .legalFor({S32, S16, V2S16})
+      .clampMaxNumElementsStrict(0, S16, 2)
+      .scalarize(0)
       .minScalar(0, S16)
+      .widenScalarToNextMultipleOf(0, 32)
+      .maxScalar(0, S32);
+
+    getActionDefinitionsBuilder(G_MUL)
+      .legalFor({S32, S16, V2S16})
       .clampMaxNumElementsStrict(0, S16, 2)
+      .scalarize(0)
+      .minScalar(0, S16)
       .widenScalarToNextMultipleOf(0, 32)
-      .maxScalar(0, S32)
-      .scalarize(0);
+      .custom();
+    assert(ST.hasMad64_32());

     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
       .legalFor({S32, S16, V2S16}) // Clamp modifier
```
```diff
@@ -546,13 +555,21 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .widenScalarToNextPow2(0, 32)
       .lower();
   } else if (ST.has16BitInsts()) {
-    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+    getActionDefinitionsBuilder({G_ADD, G_SUB})
       .legalFor({S32, S16})
       .minScalar(0, S16)
       .widenScalarToNextMultipleOf(0, 32)
       .maxScalar(0, S32)
       .scalarize(0);

+    getActionDefinitionsBuilder(G_MUL)
+      .legalFor({S32, S16})
+      .scalarize(0)
+      .minScalar(0, S16)
+      .widenScalarToNextMultipleOf(0, 32)
+      .custom();
+    assert(ST.hasMad64_32());
+
     // Technically the saturating operations require clamp bit support, but this
     // was introduced at the same time as 16-bit operations.
     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
```
```diff
@@ -569,12 +586,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0)
       .lower();
   } else {
-    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+    getActionDefinitionsBuilder({G_ADD, G_SUB})
       .legalFor({S32})
       .widenScalarToNextMultipleOf(0, 32)
       .clampScalar(0, S32, S32)
       .scalarize(0);

+    auto &Mul = getActionDefinitionsBuilder(G_MUL)
+      .legalFor({S32})
+      .scalarize(0)
+      .minScalar(0, S32)
+      .widenScalarToNextMultipleOf(0, 32);
+
+    if (ST.hasMad64_32())
+      Mul.custom();
+    else
+      Mul.maxScalar(0, S32);
+
     if (ST.hasIntClamp()) {
       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
         .legalFor({S32}) // Clamp modifier.
```
```diff
@@ -1763,6 +1791,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
     return legalizeFFloor(MI, MRI, B);
   case TargetOpcode::G_BUILD_VECTOR:
     return legalizeBuildVector(MI, MRI, B);
+  case TargetOpcode::G_MUL:
+    return legalizeMul(Helper, MI);
   case TargetOpcode::G_CTLZ:
   case TargetOpcode::G_CTTZ:
     return legalizeCTLZ_CTTZ(MI, MRI, B);
```
```diff
@@ -2861,6 +2891,299 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
   return true;
 }

+// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
+//
+// Source and accumulation registers must all be 32-bits.
+//
+// TODO: When the multiply is uniform, we should produce a code sequence
+// that is better suited to instruction selection on the SALU. Instead of
+// the outer loop going over parts of the result, the outer loop should go
+// over parts of one of the factors. This should result in instruction
+// selection that makes full use of S_ADDC_U32 instructions.
+void AMDGPULegalizerInfo::buildMultiply(
+    LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
+    ArrayRef<Register> Src0, ArrayRef<Register> Src1,
+    bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
+  // Use (possibly empty) vectors of S1 registers to represent the set of
+  // carries from one pair of positions to the next.
+  using Carry = SmallVector<Register, 2>;
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+
+  const LLT S1 = LLT::scalar(1);
+  const LLT S32 = LLT::scalar(32);
+  const LLT S64 = LLT::scalar(64);
+
+  Register Zero32;
+  Register Zero64;
+
+  auto getZero32 = [&]() -> Register {
+    if (!Zero32)
+      Zero32 = B.buildConstant(S32, 0).getReg(0);
+    return Zero32;
+  };
+  auto getZero64 = [&]() -> Register {
+    if (!Zero64)
+      Zero64 = B.buildConstant(S64, 0).getReg(0);
+    return Zero64;
+  };
+
+  // Merge the given carries into the 32-bit LocalAccum, which is modified
+  // in-place.
+  //
+  // Returns the carry-out, which is a single S1 register or null.
+  auto mergeCarry =
+      [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
+        if (CarryIn.empty())
+          return Register();
+
+        bool HaveCarryOut = true;
+        Register CarryAccum;
+        if (CarryIn.size() == 1) {
+          if (!LocalAccum) {
+            LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+            return Register();
+          }
+
+          CarryAccum = getZero32();
+        } else {
+          CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
+            CarryAccum =
+                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
+                    .getReg(0);
+          }
+
+          if (!LocalAccum) {
+            LocalAccum = getZero32();
+            HaveCarryOut = false;
+          }
+        }
+
+        auto Add =
+            B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
+        LocalAccum = Add.getReg(0);
+        return HaveCarryOut ? Add.getReg(1) : Register();
+      };
+
+  // Build a multiply-add chain to compute
+  //
+  //   LocalAccum + (partial products at DstIndex)
+  //       + (opportunistic subset of CarryIn)
+  //
+  // LocalAccum is an array of one or two 32-bit registers that are updated
+  // in-place. The incoming registers may be null.
+  //
+  // In some edge cases, carry-ins can be consumed "for free". In that case,
+  // the consumed carry bits are removed from CarryIn in-place.
+  auto buildMadChain =
+      [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex,
+          Carry &CarryIn) -> Carry {
+        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
+               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
+
+        Carry CarryOut;
+        unsigned j0 = 0;
+
+        // Use plain 32-bit multiplication for the most significant part of
+        // the result by default.
+        if (LocalAccum.size() == 1 &&
+            (!UsePartialMad64_32 || !CarryIn.empty())) {
+          do {
+            unsigned j1 = DstIndex - j0;
+            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
+            if (!LocalAccum[0]) {
+              LocalAccum[0] = Mul.getReg(0);
+            } else {
+              if (CarryIn.empty()) {
+                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
+              } else {
+                LocalAccum[0] =
+                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
+                        .getReg(0);
+                CarryIn.pop_back();
+              }
+            }
+            ++j0;
+          } while (j0 <= DstIndex &&
+                   (!UsePartialMad64_32 || !CarryIn.empty()));
+        }
+
+        // Build full 64-bit multiplies.
+        if (j0 <= DstIndex) {
+          bool HaveSmallAccum = false;
+          Register Tmp;
+
+          if (LocalAccum[0]) {
+            if (LocalAccum.size() == 1) {
+              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
+              HaveSmallAccum = true;
+            } else if (LocalAccum[1]) {
+              Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
+              HaveSmallAccum = false;
+            } else {
+              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
+              HaveSmallAccum = true;
+            }
+          } else {
+            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
+            Tmp = getZero64();
+            HaveSmallAccum = true;
+          }
+
+          do {
+            unsigned j1 = DstIndex - j0;
+            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
+                                    {Src0[j0], Src1[j1], Tmp});
+            Tmp = Mad.getReg(0);
+            if (!HaveSmallAccum)
+              CarryOut.push_back(Mad.getReg(1));
+            HaveSmallAccum = false;
+            ++j0;
+          } while (j0 <= DstIndex);
+
+          auto Unmerge = B.buildUnmerge(S32, Tmp);
+          LocalAccum[0] = Unmerge.getReg(0);
+          if (LocalAccum.size() > 1)
+            LocalAccum[1] = Unmerge.getReg(1);
+        }
+
+        return CarryOut;
+      };
+
+  // Outer multiply loop, iterating over destination parts from least
+  // significant to most significant parts.
+  //
+  // The columns of the following diagram correspond to the destination parts
+  // affected by one iteration of the outer loop (ignoring boundary
+  // conditions).
+  //
+  //   Dest index relative to 2 * i:      1 0 -1
+  //                                      ------
+  //   Carries from previous iteration:     e o
+  //   Even-aligned partial product sum:  E E .
+  //   Odd-aligned partial product sum:    O O
+  //
+  // 'o' is OddCarry, 'e' is EvenCarry.
+  // EE and OO are computed from partial products via buildMadChain and use
+  // accumulation where possible and appropriate.
+  //
+  Register SeparateOddCarry;
+  Carry EvenCarry;
+  Carry OddCarry;
+
+  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
+    Carry OddCarryIn = std::move(OddCarry);
+    Carry EvenCarryIn = std::move(EvenCarry);
+    OddCarry.clear();
+    EvenCarry.clear();
+
+    // Partial products at offset 2 * i.
+    if (2 * i < Accum.size()) {
+      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
+      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
+    }
+
+    // Partial products at offset 2 * i - 1.
+    if (i > 0) {
+      if (!SeparateOddAlignedProducts) {
+        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
+        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
+      } else {
+        bool IsHighest = 2 * i >= Accum.size();
+        Register SeparateOddOut[2];
+        auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
+                              .take_front(IsHighest ? 1 : 2);
+        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
+
+        MachineInstr *Lo;
+
+        if (i == 1) {
+          if (!IsHighest)
+            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
+          else
+            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
+        } else {
+          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
+                            SeparateOddCarry);
+        }
+        Accum[2 * i - 1] = Lo->getOperand(0).getReg();
+
+        if (!IsHighest) {
+          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
+                                 Lo->getOperand(1).getReg());
+          Accum[2 * i] = Hi.getReg(0);
+          SeparateOddCarry = Hi.getReg(1);
+        }
+      }
+    }
+
+    // Add in the carries from the previous iteration
+    if (i > 0) {
+      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
+        EvenCarryIn.push_back(CarryOut);
+
+      if (2 * i < Accum.size()) {
+        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
+          OddCarry.push_back(CarryOut);
+      }
+    }
+  }
+}
+
+// Custom narrowing of wide multiplies using wide multiply-add instructions.
+//
+// TODO: If the multiply is followed by an addition, we should attempt to
+// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
+bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
+                                      MachineInstr &MI) const {
+  assert(ST.hasMad64_32());
+  assert(MI.getOpcode() == TargetOpcode::G_MUL);
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(1).getReg();
+  Register Src1 = MI.getOperand(2).getReg();
+
+  LLT Ty = MRI.getType(DstReg);
+  assert(Ty.isScalar());
+
+  unsigned Size = Ty.getSizeInBits();
+  unsigned NumParts = Size / 32;
+  assert((Size % 32) == 0);
+  assert(NumParts >= 2);
+
+  // Whether to use MAD_64_32 for partial products whose high half is
+  // discarded. This avoids some ADD instructions but risks false dependency
+  // stalls on some subtargets in some cases.
+  const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
+
+  // Whether to compute odd-aligned partial products separately. This is
+  // advisable on subtargets where the accumulator of MAD_64_32 must be placed
+  // in an even-aligned VGPR.
+  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
+
+  LLT S32 = LLT::scalar(32);
+  SmallVector<Register, 2> Src0Parts, Src1Parts;
+  for (unsigned i = 0; i < NumParts; ++i) {
+    Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
+    Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
+  }
+  B.buildUnmerge(Src0Parts, Src0);
+  B.buildUnmerge(Src1Parts, Src1);
+
+  SmallVector<Register, 2> AccumRegs(NumParts);
+  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
+                SeparateOddAlignedProducts);
+
+  B.buildMerge(DstReg, AccumRegs);
+  MI.eraseFromParent();
+  return true;
+}
+
 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
 // case with a single min instruction instead of a compare+select.
```

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 6 additions & 0 deletions
```diff
@@ -88,6 +88,12 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {

   bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &B) const;
+
+  void buildMultiply(LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
+                     ArrayRef<Register> Src0, ArrayRef<Register> Src1,
+                     bool UsePartialMad64_32,
+                     bool SeparateOddAlignedProducts) const;
+  bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
   bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B) const;
```