Skip to content

Commit 71e9b61

Browse files
committed
[NVPTX] update combiner rule for more types of loads
Handle more loads, including ones with multiple proxy registers:
- i64 = LOAD
- i64 = LoadParam
- v2f32,v2f32 = LoadParamV2

Also update the test cases. Because this is an optimization, it is not triggered for some of these tests that compile with no optimizations.
1 parent 70d7c4d commit 71e9b61

File tree

7 files changed

+191
-151
lines changed

7 files changed

+191
-151
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 79 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -5069,9 +5069,13 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
50695069
return SDValue();
50705070
}
50715071

5072+
/// OverrideVT - allows overriding result and memory type
50725073
static std::optional<std::pair<SDValue, SDValue>>
5073-
convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
5074+
convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector,
5075+
std::optional<EVT> OverrideVT = std::nullopt) {
50745076
EVT ResVT = N->getValueType(0);
5077+
if (OverrideVT)
5078+
ResVT = *OverrideVT;
50755079
SDLoc DL(N);
50765080

50775081
assert(ResVT.isVector() && "Vector load must have vector type");
@@ -5085,8 +5089,8 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
50855089

50865090
Align Alignment = LD->getAlign();
50875091
auto &TD = DAG.getDataLayout();
5088-
Align PrefAlign =
5089-
TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
5092+
Align PrefAlign = TD.getPrefTypeAlign(
5093+
OverrideVT.value_or(LD->getMemoryVT()).getTypeForEVT(*DAG.getContext()));
50905094
if (Alignment < PrefAlign) {
50915095
// This load is not sufficiently aligned, so bail out and let this vector
50925096
// load be scalarized. Note that we may still be able to emit smaller
@@ -5131,7 +5135,8 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
51315135
OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
51325136

51335137
SDValue NewLD = DAG.getMemIntrinsicNode(
5134-
Opcode, DL, LdResVTs, OtherOps, LD->getMemoryVT(), LD->getMemOperand());
5138+
Opcode, DL, LdResVTs, OtherOps, OverrideVT.value_or(LD->getMemoryVT()),
5139+
LD->getMemOperand());
51355140

51365141
SDValue LoadChain = NewLD.getValue(NumElts);
51375142

@@ -5170,23 +5175,20 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
51705175
static SDValue PerformLoadCombine(SDNode *N,
51715176
TargetLowering::DAGCombinerInfo &DCI) {
51725177
auto *MemN = cast<MemSDNode>(N);
5173-
EVT MemVT = MemN->getMemoryVT();
5174-
5175-
// ignore volatile loads
5176-
if (MemN->isVolatile())
5177-
return SDValue();
5178-
51795178
// only operate on vectors of f32s / i64s
5180-
if (!MemVT.isVector())
5179+
if (EVT MemVT = MemN->getMemoryVT();
5180+
!(MemVT == MVT::i64 ||
5181+
(MemVT.isVector() && (MemVT.getVectorElementType() == MVT::f32 ||
5182+
MemVT.getVectorElementType() == MVT::i64))))
51815183
return SDValue();
51825184

5183-
EVT ElementVT = MemVT.getVectorElementType();
5184-
if (!(ElementVT == MVT::f32 ||
5185-
(ElementVT == MVT::i64 && N->getOpcode() != ISD::LOAD)))
5186-
return SDValue();
5185+
const unsigned OrigNumResults =
5186+
llvm::count_if(N->values(), [](const auto &VT) {
5187+
return VT == MVT::i64 || VT == MVT::f32 || VT.isVector();
5188+
});
51875189

51885190
SmallDenseMap<SDNode *, unsigned> ExtractElts;
5189-
SDNode *ProxyReg = nullptr;
5191+
SmallVector<SDNode *> ProxyRegs(OrigNumResults, nullptr);
51905192
SmallVector<std::pair<SDNode *, unsigned /*offset*/>> WorkList{{N, 0}};
51915193
while (!WorkList.empty()) {
51925194
auto [V, Offset] = WorkList.pop_back_val();
@@ -5199,8 +5201,14 @@ static SDValue PerformLoadCombine(SDNode *N,
51995201

52005202
SDNode *User = U.getUser();
52015203
if (User->getOpcode() == NVPTXISD::ProxyReg) {
5204+
Offset = U.getResNo() * 2;
5205+
SDNode *&ProxyReg = ProxyRegs[Offset / 2];
5206+
5207+
// We shouldn't have multiple proxy regs for the same value from the
5208+
// load, but bail out anyway since we don't handle this.
52025209
if (ProxyReg)
5203-
return SDValue(); // bail out if we've seen a proxy reg?
5210+
return SDValue();
5211+
52045212
ProxyReg = User;
52055213
} else if (User->getOpcode() == ISD::BITCAST &&
52065214
User->getValueType(0) == MVT::v2f32 &&
@@ -5290,9 +5298,18 @@ static SDValue PerformLoadCombine(SDNode *N,
52905298
if (NewGlueIdx)
52915299
NewGlue = NewLoad.getValue(*NewGlueIdx);
52925300
} else if (N->getOpcode() == ISD::LOAD) { // rewrite a load
5293-
if (auto Result = convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false)) {
5301+
std::optional<EVT> CastToType;
5302+
EVT ResVT = N->getValueType(0);
5303+
if (ResVT == MVT::i64) {
5304+
// ld.b64 is treated as a vector by subsequent code
5305+
CastToType = MVT::v2f32;
5306+
}
5307+
if (auto Result =
5308+
convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false, CastToType)) {
52945309
std::tie(NewLoad, NewChain) = *Result;
5295-
NumElts = MemVT.getVectorNumElements();
5310+
NumElts =
5311+
CastToType.value_or(cast<MemSDNode>(NewLoad.getNode())->getMemoryVT())
5312+
.getVectorNumElements();
52965313
if (NewLoad->getValueType(NewLoad->getNumValues() - 1) == MVT::Glue)
52975314
NewGlue = NewLoad.getValue(NewLoad->getNumValues() - 1);
52985315
}
@@ -5304,54 +5321,65 @@ static SDValue PerformLoadCombine(SDNode *N,
53045321
// (3) begin rewriting uses
53055322
SmallVector<SDValue> NewOutputsF32;
53065323

5307-
if (ProxyReg) {
5308-
// scalarize proxyreg, but first rewrite all uses of chain and glue from the
5309-
// old load to the new load
5324+
if (llvm::any_of(ProxyRegs, [](const SDNode *PR) { return PR != nullptr; })) {
5325+
// scalarize proxy regs, but first rewrite all uses of chain and glue from
5326+
// the old load to the new load
53105327
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
53115328
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
53125329

5313-
// Update the new chain and glue to be old inputs to the proxyreg, if they
5314-
// came from an intervening instruction between this proxyreg and the
5315-
// original load (ex: callseq_end). Other than bitcasts and extractelts, we
5316-
// followed all other nodes by chain and glue accesses.
5317-
if (SDValue OldInChain = ProxyReg->getOperand(0); OldInChain.getNode() != N)
5330+
for (unsigned ProxyI = 0, ProxyE = ProxyRegs.size(); ProxyI != ProxyE;
5331+
++ProxyI) {
5332+
SDNode *ProxyReg = ProxyRegs[ProxyI];
5333+
5334+
// no proxy reg might mean this result is unused
5335+
if (!ProxyReg)
5336+
continue;
5337+
5338+
// Update the new chain and glue to be old inputs to the proxyreg, if they
5339+
// came from an intervening instruction between this proxyreg and the
5340+
// original load (ex: callseq_end). Other than bitcasts and extractelts,
5341+
// we followed all other nodes by chain and glue accesses.
5342+
if (SDValue OldInChain = ProxyReg->getOperand(0);
5343+
OldInChain.getNode() != N)
53185344
NewChain = OldInChain;
5319-
if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
5345+
if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
53205346
NewGlue = OldInGlue;
53215347

5322-
// update OldChain, OldGlue to the outputs of ProxyReg, which we will
5323-
// replace later
5324-
OldChain = SDValue(ProxyReg, 1);
5325-
OldGlue = SDValue(ProxyReg, 2);
5326-
5327-
// generate the scalar proxy regs
5328-
for (unsigned I = 0, E = NumElts; I != E; ++I) {
5329-
SDValue ProxyRegElem =
5330-
DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(ProxyReg),
5331-
DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
5332-
{NewChain, NewLoad.getValue(I), NewGlue});
5333-
NewChain = ProxyRegElem.getValue(1);
5334-
NewGlue = ProxyRegElem.getValue(2);
5335-
NewOutputsF32.push_back(ProxyRegElem);
5348+
// update OldChain, OldGlue to the outputs of ProxyReg, which we will
5349+
// replace later
5350+
OldChain = SDValue(ProxyReg, 1);
5351+
OldGlue = SDValue(ProxyReg, 2);
5352+
5353+
// generate the scalar proxy regs
5354+
for (unsigned I = 0, E = 2; I != E; ++I) {
5355+
SDValue ProxyRegElem = DCI.DAG.getNode(
5356+
NVPTXISD::ProxyReg, SDLoc(ProxyReg),
5357+
DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
5358+
{NewChain, NewLoad.getValue(ProxyI * 2 + I), NewGlue});
5359+
NewChain = ProxyRegElem.getValue(1);
5360+
NewGlue = ProxyRegElem.getValue(2);
5361+
NewOutputsF32.push_back(ProxyRegElem);
5362+
}
5363+
5364+
// replace all uses of the glue and chain from the old proxy reg
5365+
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5366+
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
53365367
}
53375368
} else {
53385369
for (unsigned I = 0, E = NumElts; I != E; ++I)
53395370
if (NewLoad->getValueType(I) == MVT::f32)
53405371
NewOutputsF32.push_back(NewLoad.getValue(I));
5372+
5373+
// replace all glue and chain nodes
5374+
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5375+
if (OldGlue)
5376+
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
53415377
}
53425378

5343-
// now, for all extractelts, replace them with one of the new outputs
5379+
// replace all extractelts with the new outputs
53445380
for (auto &[Extract, Index] : ExtractElts)
53455381
DCI.CombineTo(Extract, NewOutputsF32[Index], false);
53465382

5347-
// now replace all glue and chain nodes
5348-
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5349-
if (OldGlue)
5350-
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
5351-
5352-
// cleanup
5353-
if (ProxyReg)
5354-
DCI.recursivelyDeleteUnusedNodes(ProxyReg);
53555383
return SDValue();
53565384
}
53575385

llvm/test/CodeGen/NVPTX/aggregate-return.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
2727
; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12];
2828
store <3 x float> %call, ptr %output, align 8
2929
; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8],
30-
; -- This is suboptimal. We should do st.v2.f32 instead
31-
; of combining 2xf32 info i64.
32-
; CHECK-DAG: st.u64 [{{%rd[0-9]}}],
30+
; CHECK-DAG: st.v2.f32 [{{%rd[0-9]}}], {[[E0]], [[E1]]}
3331
; CHECK: ret;
3432
ret void
3533
}

llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Lines changed: 48 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -762,32 +762,32 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
762762
; SM70-NEXT: // %bb.0:
763763
; SM70-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
764764
; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
765-
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1;
766-
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
767-
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3;
768-
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4;
769-
; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
765+
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4;
766+
; SM70-NEXT: cvt.u32.u16 %r5, %rs2;
770767
; SM70-NEXT: shl.b32 %r6, %r5, 16;
771768
; SM70-NEXT: mov.b32 %f1, %r6;
772-
; SM70-NEXT: cvt.u32.u16 %r7, %rs7;
769+
; SM70-NEXT: cvt.u32.u16 %r7, %rs1;
773770
; SM70-NEXT: shl.b32 %r8, %r7, 16;
774771
; SM70-NEXT: mov.b32 %f2, %r8;
775-
; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
772+
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r3;
773+
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
776774
; SM70-NEXT: shl.b32 %r10, %r9, 16;
777775
; SM70-NEXT: mov.b32 %f3, %r10;
778-
; SM70-NEXT: cvt.u32.u16 %r11, %rs5;
776+
; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
779777
; SM70-NEXT: shl.b32 %r12, %r11, 16;
780778
; SM70-NEXT: mov.b32 %f4, %r12;
781-
; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
779+
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r2;
780+
; SM70-NEXT: cvt.u32.u16 %r13, %rs6;
782781
; SM70-NEXT: shl.b32 %r14, %r13, 16;
783782
; SM70-NEXT: mov.b32 %f5, %r14;
784-
; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
783+
; SM70-NEXT: cvt.u32.u16 %r15, %rs5;
785784
; SM70-NEXT: shl.b32 %r16, %r15, 16;
786785
; SM70-NEXT: mov.b32 %f6, %r16;
787-
; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
786+
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1;
787+
; SM70-NEXT: cvt.u32.u16 %r17, %rs8;
788788
; SM70-NEXT: shl.b32 %r18, %r17, 16;
789789
; SM70-NEXT: mov.b32 %f7, %r18;
790-
; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
790+
; SM70-NEXT: cvt.u32.u16 %r19, %rs7;
791791
; SM70-NEXT: shl.b32 %r20, %r19, 16;
792792
; SM70-NEXT: mov.b32 %f8, %r20;
793793
; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
@@ -804,18 +804,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
804804
; SM80-NEXT: // %bb.0:
805805
; SM80-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
806806
; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
807-
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1;
808-
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
809-
; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r3;
810-
; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r4;
811-
; SM80-NEXT: cvt.f32.bf16 %f1, %rs8;
812-
; SM80-NEXT: cvt.f32.bf16 %f2, %rs7;
813-
; SM80-NEXT: cvt.f32.bf16 %f3, %rs6;
814-
; SM80-NEXT: cvt.f32.bf16 %f4, %rs5;
815-
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
816-
; SM80-NEXT: cvt.f32.bf16 %f6, %rs3;
817-
; SM80-NEXT: cvt.f32.bf16 %f7, %rs2;
818-
; SM80-NEXT: cvt.f32.bf16 %f8, %rs1;
807+
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
808+
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
809+
; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
810+
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r3;
811+
; SM80-NEXT: cvt.f32.bf16 %f3, %rs4;
812+
; SM80-NEXT: cvt.f32.bf16 %f4, %rs3;
813+
; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r2;
814+
; SM80-NEXT: cvt.f32.bf16 %f5, %rs6;
815+
; SM80-NEXT: cvt.f32.bf16 %f6, %rs5;
816+
; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r1;
817+
; SM80-NEXT: cvt.f32.bf16 %f7, %rs8;
818+
; SM80-NEXT: cvt.f32.bf16 %f8, %rs7;
819819
; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
820820
; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
821821
; SM80-NEXT: ret;
@@ -830,18 +830,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
830830
; SM80-FTZ-NEXT: // %bb.0:
831831
; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
832832
; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
833-
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1;
834-
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2;
835-
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r3;
836-
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r4;
837-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs8;
838-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs7;
839-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs6;
840-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs5;
841-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
842-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs3;
843-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2;
844-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1;
833+
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
834+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
835+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1;
836+
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r3;
837+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs4;
838+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs3;
839+
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r2;
840+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs6;
841+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs5;
842+
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r1;
843+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs8;
844+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs7;
845845
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
846846
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
847847
; SM80-FTZ-NEXT: ret;
@@ -856,18 +856,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
856856
; SM90-NEXT: // %bb.0:
857857
; SM90-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
858858
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
859-
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1;
860-
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2;
861-
; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r3;
862-
; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r4;
863-
; SM90-NEXT: cvt.f32.bf16 %f1, %rs8;
864-
; SM90-NEXT: cvt.f32.bf16 %f2, %rs7;
865-
; SM90-NEXT: cvt.f32.bf16 %f3, %rs6;
866-
; SM90-NEXT: cvt.f32.bf16 %f4, %rs5;
867-
; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
868-
; SM90-NEXT: cvt.f32.bf16 %f6, %rs3;
869-
; SM90-NEXT: cvt.f32.bf16 %f7, %rs2;
870-
; SM90-NEXT: cvt.f32.bf16 %f8, %rs1;
859+
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4;
860+
; SM90-NEXT: cvt.f32.bf16 %f1, %rs2;
861+
; SM90-NEXT: cvt.f32.bf16 %f2, %rs1;
862+
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r3;
863+
; SM90-NEXT: cvt.f32.bf16 %f3, %rs4;
864+
; SM90-NEXT: cvt.f32.bf16 %f4, %rs3;
865+
; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r2;
866+
; SM90-NEXT: cvt.f32.bf16 %f5, %rs6;
867+
; SM90-NEXT: cvt.f32.bf16 %f6, %rs5;
868+
; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r1;
869+
; SM90-NEXT: cvt.f32.bf16 %f7, %rs8;
870+
; SM90-NEXT: cvt.f32.bf16 %f8, %rs7;
871871
; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
872872
; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
873873
; SM90-NEXT: ret;

0 commit comments

Comments (0)