Skip to content

Commit fb0dc77

Browse files
committed
[NVPTX] update combiner rule for more types of loads
Handle more loads, including ones with multiple proxy registers:
- i64 = LOAD
- i64 = LoadParam
- v2f32,v2f32 = LoadParamV2

Also update the test cases. Because this combiner rule is an optimization, it is not triggered for the tests that compile with optimizations disabled.
1 parent d409c9d commit fb0dc77

File tree

7 files changed

+304
-201
lines changed

7 files changed

+304
-201
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 96 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -5089,11 +5089,13 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
50895089
return SDValue();
50905090
}
50915091

5092+
/// OverrideVT - allows overriding the result and memory types
50925093
static std::optional<std::pair<SDValue, SDValue>>
5093-
convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
5094+
convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector,
5095+
std::optional<EVT> OverrideVT = std::nullopt) {
50945096
LoadSDNode *LD = cast<LoadSDNode>(N);
5095-
const EVT ResVT = LD->getValueType(0);
5096-
const EVT MemVT = LD->getMemoryVT();
5097+
const EVT ResVT = OverrideVT.value_or(LD->getValueType(0));
5098+
const EVT MemVT = OverrideVT.value_or(LD->getMemoryVT());
50975099

50985100
// If we're doing sign/zero extension as part of the load, avoid lowering to
50995101
// a LoadV node. TODO: consider relaxing this restriction.
@@ -5147,33 +5149,31 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
51475149
// pass along the extension information
51485150
OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
51495151

5150-
SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5151-
LD->getMemoryVT(),
5152+
SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
51525153
LD->getMemOperand());
5153-
5154-
SmallVector<SDValue> ScalarRes;
5155-
if (EltVT.isVector()) {
5156-
assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType());
5157-
assert(NumElts * EltVT.getVectorNumElements() ==
5158-
ResVT.getVectorNumElements());
5159-
// Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
5160-
// into individual elements.
5161-
for (const unsigned I : llvm::seq(NumElts)) {
5162-
SDValue SubVector = NewLD.getValue(I);
5163-
DAG.ExtractVectorElements(SubVector, ScalarRes);
5164-
}
5165-
} else {
5166-
for (const unsigned I : llvm::seq(NumElts)) {
5167-
SDValue Res = NewLD.getValue(I);
5168-
if (LoadEltVT != EltVT)
5169-
Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
5170-
ScalarRes.push_back(Res);
5171-
}
5172-
}
5173-
51745154
SDValue LoadChain = NewLD.getValue(NumElts);
51755155

51765156
if (BuildVector) {
5157+
SmallVector<SDValue> ScalarRes;
5158+
if (EltVT.isVector()) {
5159+
assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType());
5160+
assert(NumElts * EltVT.getVectorNumElements() ==
5161+
ResVT.getVectorNumElements());
5162+
// Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
5163+
// into individual elements.
5164+
for (const unsigned I : llvm::seq(NumElts)) {
5165+
SDValue SubVector = NewLD.getValue(I);
5166+
DAG.ExtractVectorElements(SubVector, ScalarRes);
5167+
}
5168+
} else {
5169+
for (const unsigned I : llvm::seq(NumElts)) {
5170+
SDValue Res = NewLD.getValue(I);
5171+
if (LoadEltVT != EltVT)
5172+
Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
5173+
ScalarRes.push_back(Res);
5174+
}
5175+
}
5176+
51775177
const MVT BuildVecVT =
51785178
MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
51795179
SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
@@ -5188,23 +5188,20 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
51885188
static SDValue PerformLoadCombine(SDNode *N,
51895189
TargetLowering::DAGCombinerInfo &DCI) {
51905190
auto *MemN = cast<MemSDNode>(N);
5191-
EVT MemVT = MemN->getMemoryVT();
5192-
5193-
// ignore volatile loads
5194-
if (MemN->isVolatile())
5195-
return SDValue();
5196-
51975191
// only operate on vectors of f32s / i64s
5198-
if (!MemVT.isVector())
5192+
if (EVT MemVT = MemN->getMemoryVT();
5193+
!(MemVT == MVT::i64 ||
5194+
(MemVT.isVector() && (MemVT.getVectorElementType() == MVT::f32 ||
5195+
MemVT.getVectorElementType() == MVT::i64))))
51995196
return SDValue();
52005197

5201-
EVT ElementVT = MemVT.getVectorElementType();
5202-
if (!(ElementVT == MVT::f32 ||
5203-
(ElementVT == MVT::i64 && N->getOpcode() != ISD::LOAD)))
5204-
return SDValue();
5198+
const unsigned OrigNumResults =
5199+
llvm::count_if(N->values(), [](const auto &VT) {
5200+
return VT == MVT::i64 || VT == MVT::f32 || VT.isVector();
5201+
});
52055202

52065203
SmallDenseMap<SDNode *, unsigned> ExtractElts;
5207-
SDNode *ProxyReg = nullptr;
5204+
SmallVector<SDNode *> ProxyRegs(OrigNumResults, nullptr);
52085205
SmallVector<std::pair<SDNode *, unsigned /*offset*/>> WorkList{{N, 0}};
52095206
while (!WorkList.empty()) {
52105207
auto [V, Offset] = WorkList.pop_back_val();
@@ -5217,8 +5214,14 @@ static SDValue PerformLoadCombine(SDNode *N,
52175214

52185215
SDNode *User = U.getUser();
52195216
if (User->getOpcode() == NVPTXISD::ProxyReg) {
5217+
Offset = U.getResNo() * 2;
5218+
SDNode *&ProxyReg = ProxyRegs[Offset / 2];
5219+
5220+
// We shouldn't have multiple proxy regs for the same value from the
5221+
// load, but bail out anyway since we don't handle this.
52205222
if (ProxyReg)
5221-
return SDValue(); // bail out if we've seen a proxy reg?
5223+
return SDValue();
5224+
52225225
ProxyReg = User;
52235226
} else if (User->getOpcode() == ISD::BITCAST &&
52245227
User->getValueType(0) == MVT::v2f32 &&
@@ -5308,9 +5311,18 @@ static SDValue PerformLoadCombine(SDNode *N,
53085311
if (NewGlueIdx)
53095312
NewGlue = NewLoad.getValue(*NewGlueIdx);
53105313
} else if (N->getOpcode() == ISD::LOAD) { // rewrite a load
5311-
if (auto Result = convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false)) {
5314+
std::optional<EVT> CastToType;
5315+
EVT ResVT = N->getValueType(0);
5316+
if (ResVT == MVT::i64) {
5317+
// ld.b64 is treated as a vector by subsequent code
5318+
CastToType = MVT::v2f32;
5319+
}
5320+
if (auto Result =
5321+
convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false, CastToType)) {
53125322
std::tie(NewLoad, NewChain) = *Result;
5313-
NumElts = MemVT.getVectorNumElements();
5323+
NumElts =
5324+
CastToType.value_or(cast<MemSDNode>(NewLoad.getNode())->getMemoryVT())
5325+
.getVectorNumElements();
53145326
if (NewLoad->getValueType(NewLoad->getNumValues() - 1) == MVT::Glue)
53155327
NewGlue = NewLoad.getValue(NewLoad->getNumValues() - 1);
53165328
}
@@ -5322,54 +5334,65 @@ static SDValue PerformLoadCombine(SDNode *N,
53225334
// (3) begin rewriting uses
53235335
SmallVector<SDValue> NewOutputsF32;
53245336

5325-
if (ProxyReg) {
5326-
// scalarize proxyreg, but first rewrite all uses of chain and glue from the
5327-
// old load to the new load
5337+
if (llvm::any_of(ProxyRegs, [](const SDNode *PR) { return PR != nullptr; })) {
5338+
// scalarize proxy regs, but first rewrite all uses of chain and glue from
5339+
// the old load to the new load
53285340
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
53295341
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
53305342

5331-
// Update the new chain and glue to be old inputs to the proxyreg, if they
5332-
// came from an intervening instruction between this proxyreg and the
5333-
// original load (ex: callseq_end). Other than bitcasts and extractelts, we
5334-
// followed all other nodes by chain and glue accesses.
5335-
if (SDValue OldInChain = ProxyReg->getOperand(0); OldInChain.getNode() != N)
5343+
for (unsigned ProxyI = 0, ProxyE = ProxyRegs.size(); ProxyI != ProxyE;
5344+
++ProxyI) {
5345+
SDNode *ProxyReg = ProxyRegs[ProxyI];
5346+
5347+
// no proxy reg might mean this result is unused
5348+
if (!ProxyReg)
5349+
continue;
5350+
5351+
// Update the new chain and glue to be old inputs to the proxyreg, if they
5352+
// came from an intervening instruction between this proxyreg and the
5353+
// original load (ex: callseq_end). Other than bitcasts and extractelts,
5354+
// we followed all other nodes by chain and glue accesses.
5355+
if (SDValue OldInChain = ProxyReg->getOperand(0);
5356+
OldInChain.getNode() != N)
53365357
NewChain = OldInChain;
5337-
if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
5358+
if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
53385359
NewGlue = OldInGlue;
53395360

5340-
// update OldChain, OldGlue to the outputs of ProxyReg, which we will
5341-
// replace later
5342-
OldChain = SDValue(ProxyReg, 1);
5343-
OldGlue = SDValue(ProxyReg, 2);
5344-
5345-
// generate the scalar proxy regs
5346-
for (unsigned I = 0, E = NumElts; I != E; ++I) {
5347-
SDValue ProxyRegElem =
5348-
DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(ProxyReg),
5349-
DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
5350-
{NewChain, NewLoad.getValue(I), NewGlue});
5351-
NewChain = ProxyRegElem.getValue(1);
5352-
NewGlue = ProxyRegElem.getValue(2);
5353-
NewOutputsF32.push_back(ProxyRegElem);
5361+
// update OldChain, OldGlue to the outputs of ProxyReg, which we will
5362+
// replace later
5363+
OldChain = SDValue(ProxyReg, 1);
5364+
OldGlue = SDValue(ProxyReg, 2);
5365+
5366+
// generate the scalar proxy regs
5367+
for (unsigned I = 0, E = 2; I != E; ++I) {
5368+
SDValue ProxyRegElem = DCI.DAG.getNode(
5369+
NVPTXISD::ProxyReg, SDLoc(ProxyReg),
5370+
DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
5371+
{NewChain, NewLoad.getValue(ProxyI * 2 + I), NewGlue});
5372+
NewChain = ProxyRegElem.getValue(1);
5373+
NewGlue = ProxyRegElem.getValue(2);
5374+
NewOutputsF32.push_back(ProxyRegElem);
5375+
}
5376+
5377+
// replace all uses of the glue and chain from the old proxy reg
5378+
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5379+
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
53545380
}
53555381
} else {
53565382
for (unsigned I = 0, E = NumElts; I != E; ++I)
53575383
if (NewLoad->getValueType(I) == MVT::f32)
53585384
NewOutputsF32.push_back(NewLoad.getValue(I));
5385+
5386+
// replace all glue and chain nodes
5387+
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5388+
if (OldGlue)
5389+
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
53595390
}
53605391

5361-
// now, for all extractelts, replace them with one of the new outputs
5392+
// replace all extractelts with the new outputs
53625393
for (auto &[Extract, Index] : ExtractElts)
53635394
DCI.CombineTo(Extract, NewOutputsF32[Index], false);
53645395

5365-
// now replace all glue and chain nodes
5366-
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5367-
if (OldGlue)
5368-
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
5369-
5370-
// cleanup
5371-
if (ProxyReg)
5372-
DCI.recursivelyDeleteUnusedNodes(ProxyReg);
53735396
return SDValue();
53745397
}
53755398

Lines changed: 97 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 | FileCheck %s
23
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
34

@@ -7,57 +8,122 @@ declare [2 x float] @bara([2 x float] %input)
78
declare {float, float} @bars({float, float} %input)
89

910
define void @test_v2f32(<2 x float> %input, ptr %output) {
10-
; CHECK-LABEL: @test_v2f32
11+
; CHECK-LABEL: test_v2f32(
12+
; CHECK: {
13+
; CHECK-NEXT: .reg .b32 %f<5>;
14+
; CHECK-NEXT: .reg .b64 %rd<3>;
15+
; CHECK-EMPTY:
16+
; CHECK-NEXT: // %bb.0:
17+
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0];
18+
; CHECK-NEXT: { // callseq 0, 0
19+
; CHECK-NEXT: .param .align 8 .b8 param0[8];
20+
; CHECK-NEXT: st.param.b64 [param0], %rd1;
21+
; CHECK-NEXT: .param .align 8 .b8 retval0[8];
22+
; CHECK-NEXT: call.uni (retval0),
23+
; CHECK-NEXT: barv,
24+
; CHECK-NEXT: (
25+
; CHECK-NEXT: param0
26+
; CHECK-NEXT: );
27+
; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [retval0];
28+
; CHECK-NEXT: } // callseq 0
29+
; CHECK-NEXT: ld.param.b64 %rd2, [test_v2f32_param_1];
30+
; CHECK-NEXT: st.v2.b32 [%rd2], {%f1, %f2};
31+
; CHECK-NEXT: ret;
1132
%call = tail call <2 x float> @barv(<2 x float> %input)
12-
; CHECK: .param .align 8 .b8 retval0[8];
13-
; CHECK: ld.param.v2.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
1433
store <2 x float> %call, ptr %output, align 8
15-
; CHECK: st.v2.b32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
1634
ret void
1735
}
1836

1937
define void @test_v3f32(<3 x float> %input, ptr %output) {
20-
; CHECK-LABEL: @test_v3f32
21-
;
38+
; CHECK-LABEL: test_v3f32(
39+
; CHECK: {
40+
; CHECK-NEXT: .reg .b32 %f<10>;
41+
; CHECK-NEXT: .reg .b64 %rd<2>;
42+
; CHECK-EMPTY:
43+
; CHECK-NEXT: // %bb.0:
44+
; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_v3f32_param_0];
45+
; CHECK-NEXT: ld.param.b32 %f3, [test_v3f32_param_0+8];
46+
; CHECK-NEXT: { // callseq 1, 0
47+
; CHECK-NEXT: .param .align 16 .b8 param0[16];
48+
; CHECK-NEXT: st.param.v2.b32 [param0], {%f1, %f2};
49+
; CHECK-NEXT: st.param.b32 [param0+8], %f3;
50+
; CHECK-NEXT: .param .align 16 .b8 retval0[16];
51+
; CHECK-NEXT: call.uni (retval0),
52+
; CHECK-NEXT: barv3,
53+
; CHECK-NEXT: (
54+
; CHECK-NEXT: param0
55+
; CHECK-NEXT: );
56+
; CHECK-NEXT: ld.param.v2.b32 {%f4, %f5}, [retval0];
57+
; CHECK-NEXT: ld.param.b32 %f6, [retval0+8];
58+
; CHECK-NEXT: } // callseq 1
59+
; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_1];
60+
; CHECK-NEXT: st.b32 [%rd1+8], %f6;
61+
; CHECK-NEXT: st.v2.b32 [%rd1], {%f4, %f5};
62+
; CHECK-NEXT: ret;
2263
%call = tail call <3 x float> @barv3(<3 x float> %input)
23-
; CHECK: .param .align 16 .b8 retval0[16];
24-
; CHECK-DAG: ld.param.v2.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
25-
; CHECK-DAG: ld.param.b32 [[E2:%f[0-9]+]], [retval0+8];
2664
; Make sure we don't load more values than we need to.
27-
; CHECK-NOT: ld.param.b32 [[E3:%f[0-9]+]], [retval0+12];
2865
store <3 x float> %call, ptr %output, align 8
29-
; CHECK-DAG: st.b32 [{{%rd[0-9]}}+8],
30-
; -- This is suboptimal. We should do st.v2.f32 instead
31-
; of combining 2xf32 info i64.
32-
; CHECK-DAG: st.b64 [{{%rd[0-9]}}],
33-
; CHECK: ret;
3466
ret void
3567
}
3668

3769
define void @test_a2f32([2 x float] %input, ptr %output) {
38-
; CHECK-LABEL: @test_a2f32
70+
; CHECK-LABEL: test_a2f32(
71+
; CHECK: {
72+
; CHECK-NEXT: .reg .b32 %f<7>;
73+
; CHECK-NEXT: .reg .b64 %rd<2>;
74+
; CHECK-EMPTY:
75+
; CHECK-NEXT: // %bb.0:
76+
; CHECK-NEXT: ld.param.b32 %f1, [test_a2f32_param_0];
77+
; CHECK-NEXT: ld.param.b32 %f2, [test_a2f32_param_0+4];
78+
; CHECK-NEXT: { // callseq 2, 0
79+
; CHECK-NEXT: .param .align 4 .b8 param0[8];
80+
; CHECK-NEXT: st.param.b32 [param0], %f1;
81+
; CHECK-NEXT: st.param.b32 [param0+4], %f2;
82+
; CHECK-NEXT: .param .align 4 .b8 retval0[8];
83+
; CHECK-NEXT: call.uni (retval0),
84+
; CHECK-NEXT: bara,
85+
; CHECK-NEXT: (
86+
; CHECK-NEXT: param0
87+
; CHECK-NEXT: );
88+
; CHECK-NEXT: ld.param.b32 %f3, [retval0];
89+
; CHECK-NEXT: ld.param.b32 %f4, [retval0+4];
90+
; CHECK-NEXT: } // callseq 2
91+
; CHECK-NEXT: ld.param.b64 %rd1, [test_a2f32_param_1];
92+
; CHECK-NEXT: st.b32 [%rd1+4], %f4;
93+
; CHECK-NEXT: st.b32 [%rd1], %f3;
94+
; CHECK-NEXT: ret;
3995
%call = tail call [2 x float] @bara([2 x float] %input)
40-
; CHECK: .param .align 4 .b8 retval0[8];
41-
; CHECK-DAG: ld.param.b32 [[ELEMA1:%f[0-9]+]], [retval0];
42-
; CHECK-DAG: ld.param.b32 [[ELEMA2:%f[0-9]+]], [retval0+4];
4396
store [2 x float] %call, ptr %output, align 4
44-
; CHECK: }
45-
; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMA1]]
46-
; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMA2]]
4797
ret void
48-
; CHECK: ret
4998
}
5099

51100
define void @test_s2f32({float, float} %input, ptr %output) {
52-
; CHECK-LABEL: @test_s2f32
101+
; CHECK-LABEL: test_s2f32(
102+
; CHECK: {
103+
; CHECK-NEXT: .reg .b32 %f<7>;
104+
; CHECK-NEXT: .reg .b64 %rd<2>;
105+
; CHECK-EMPTY:
106+
; CHECK-NEXT: // %bb.0:
107+
; CHECK-NEXT: ld.param.b32 %f1, [test_s2f32_param_0];
108+
; CHECK-NEXT: ld.param.b32 %f2, [test_s2f32_param_0+4];
109+
; CHECK-NEXT: { // callseq 3, 0
110+
; CHECK-NEXT: .param .align 4 .b8 param0[8];
111+
; CHECK-NEXT: st.param.b32 [param0], %f1;
112+
; CHECK-NEXT: st.param.b32 [param0+4], %f2;
113+
; CHECK-NEXT: .param .align 4 .b8 retval0[8];
114+
; CHECK-NEXT: call.uni (retval0),
115+
; CHECK-NEXT: bars,
116+
; CHECK-NEXT: (
117+
; CHECK-NEXT: param0
118+
; CHECK-NEXT: );
119+
; CHECK-NEXT: ld.param.b32 %f3, [retval0];
120+
; CHECK-NEXT: ld.param.b32 %f4, [retval0+4];
121+
; CHECK-NEXT: } // callseq 3
122+
; CHECK-NEXT: ld.param.b64 %rd1, [test_s2f32_param_1];
123+
; CHECK-NEXT: st.b32 [%rd1+4], %f4;
124+
; CHECK-NEXT: st.b32 [%rd1], %f3;
125+
; CHECK-NEXT: ret;
53126
%call = tail call {float, float} @bars({float, float} %input)
54-
; CHECK: .param .align 4 .b8 retval0[8];
55-
; CHECK-DAG: ld.param.b32 [[ELEMS1:%f[0-9]+]], [retval0];
56-
; CHECK-DAG: ld.param.b32 [[ELEMS2:%f[0-9]+]], [retval0+4];
57127
store {float, float} %call, ptr %output, align 4
58-
; CHECK: }
59-
; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMS1]]
60-
; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMS2]]
61128
ret void
62-
; CHECK: ret
63129
}

0 commit comments

Comments
 (0)