Skip to content

Commit 896665b

Browse files
committed
[NVPTX] update combiner rule for more types of loads
Handle more kinds of loads, including ones with multiple proxy registers: `i64 = LOAD`, `i64 = LoadParam`, and `v2f32,v2f32 = LoadParamV2`. Also update the test cases. Because this combiner rule is an optimization, it is not triggered for those tests that are compiled with optimizations disabled.
1 parent fc44c92 commit 896665b

File tree

7 files changed

+191
-151
lines changed

7 files changed

+191
-151
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 79 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -5071,9 +5071,13 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
50715071
return SDValue();
50725072
}
50735073

5074+
/// OverrideVT - allows overriding result and memory type
50745075
static std::optional<std::pair<SDValue, SDValue>>
5075-
convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
5076+
convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector,
5077+
std::optional<EVT> OverrideVT = std::nullopt) {
50765078
EVT ResVT = N->getValueType(0);
5079+
if (OverrideVT)
5080+
ResVT = *OverrideVT;
50775081
SDLoc DL(N);
50785082

50795083
assert(ResVT.isVector() && "Vector load must have vector type");
@@ -5087,8 +5091,8 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
50875091

50885092
Align Alignment = LD->getAlign();
50895093
auto &TD = DAG.getDataLayout();
5090-
Align PrefAlign =
5091-
TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
5094+
Align PrefAlign = TD.getPrefTypeAlign(
5095+
OverrideVT.value_or(LD->getMemoryVT()).getTypeForEVT(*DAG.getContext()));
50925096
if (Alignment < PrefAlign) {
50935097
// This load is not sufficiently aligned, so bail out and let this vector
50945098
// load be scalarized. Note that we may still be able to emit smaller
@@ -5133,7 +5137,8 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
51335137
OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
51345138

51355139
SDValue NewLD = DAG.getMemIntrinsicNode(
5136-
Opcode, DL, LdResVTs, OtherOps, LD->getMemoryVT(), LD->getMemOperand());
5140+
Opcode, DL, LdResVTs, OtherOps, OverrideVT.value_or(LD->getMemoryVT()),
5141+
LD->getMemOperand());
51375142

51385143
SDValue LoadChain = NewLD.getValue(NumElts);
51395144

@@ -5172,23 +5177,20 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
51725177
static SDValue PerformLoadCombine(SDNode *N,
51735178
TargetLowering::DAGCombinerInfo &DCI) {
51745179
auto *MemN = cast<MemSDNode>(N);
5175-
EVT MemVT = MemN->getMemoryVT();
5176-
5177-
// ignore volatile loads
5178-
if (MemN->isVolatile())
5179-
return SDValue();
5180-
51815180
// only operate on vectors of f32s / i64s
5182-
if (!MemVT.isVector())
5181+
if (EVT MemVT = MemN->getMemoryVT();
5182+
!(MemVT == MVT::i64 ||
5183+
(MemVT.isVector() && (MemVT.getVectorElementType() == MVT::f32 ||
5184+
MemVT.getVectorElementType() == MVT::i64))))
51835185
return SDValue();
51845186

5185-
EVT ElementVT = MemVT.getVectorElementType();
5186-
if (!(ElementVT == MVT::f32 ||
5187-
(ElementVT == MVT::i64 && N->getOpcode() != ISD::LOAD)))
5188-
return SDValue();
5187+
const unsigned OrigNumResults =
5188+
llvm::count_if(N->values(), [](const auto &VT) {
5189+
return VT == MVT::i64 || VT == MVT::f32 || VT.isVector();
5190+
});
51895191

51905192
SmallDenseMap<SDNode *, unsigned> ExtractElts;
5191-
SDNode *ProxyReg = nullptr;
5193+
SmallVector<SDNode *> ProxyRegs(OrigNumResults, nullptr);
51925194
SmallVector<std::pair<SDNode *, unsigned /*offset*/>> WorkList{{N, 0}};
51935195
while (!WorkList.empty()) {
51945196
auto [V, Offset] = WorkList.pop_back_val();
@@ -5201,8 +5203,14 @@ static SDValue PerformLoadCombine(SDNode *N,
52015203

52025204
SDNode *User = U.getUser();
52035205
if (User->getOpcode() == NVPTXISD::ProxyReg) {
5206+
Offset = U.getResNo() * 2;
5207+
SDNode *&ProxyReg = ProxyRegs[Offset / 2];
5208+
5209+
// We shouldn't have multiple proxy regs for the same value from the
5210+
// load, but bail out anyway since we don't handle this.
52045211
if (ProxyReg)
5205-
return SDValue(); // bail out if we've seen a proxy reg?
5212+
return SDValue();
5213+
52065214
ProxyReg = User;
52075215
} else if (User->getOpcode() == ISD::BITCAST &&
52085216
User->getValueType(0) == MVT::v2f32 &&
@@ -5292,9 +5300,18 @@ static SDValue PerformLoadCombine(SDNode *N,
52925300
if (NewGlueIdx)
52935301
NewGlue = NewLoad.getValue(*NewGlueIdx);
52945302
} else if (N->getOpcode() == ISD::LOAD) { // rewrite a load
5295-
if (auto Result = convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false)) {
5303+
std::optional<EVT> CastToType;
5304+
EVT ResVT = N->getValueType(0);
5305+
if (ResVT == MVT::i64) {
5306+
// ld.b64 is treated as a vector by subsequent code
5307+
CastToType = MVT::v2f32;
5308+
}
5309+
if (auto Result =
5310+
convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false, CastToType)) {
52965311
std::tie(NewLoad, NewChain) = *Result;
5297-
NumElts = MemVT.getVectorNumElements();
5312+
NumElts =
5313+
CastToType.value_or(cast<MemSDNode>(NewLoad.getNode())->getMemoryVT())
5314+
.getVectorNumElements();
52985315
if (NewLoad->getValueType(NewLoad->getNumValues() - 1) == MVT::Glue)
52995316
NewGlue = NewLoad.getValue(NewLoad->getNumValues() - 1);
53005317
}
@@ -5306,54 +5323,65 @@ static SDValue PerformLoadCombine(SDNode *N,
53065323
// (3) begin rewriting uses
53075324
SmallVector<SDValue> NewOutputsF32;
53085325

5309-
if (ProxyReg) {
5310-
// scalarize proxyreg, but first rewrite all uses of chain and glue from the
5311-
// old load to the new load
5326+
if (llvm::any_of(ProxyRegs, [](const SDNode *PR) { return PR != nullptr; })) {
5327+
// scalarize proxy regs, but first rewrite all uses of chain and glue from
5328+
// the old load to the new load
53125329
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
53135330
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
53145331

5315-
// Update the new chain and glue to be old inputs to the proxyreg, if they
5316-
// came from an intervening instruction between this proxyreg and the
5317-
// original load (ex: callseq_end). Other than bitcasts and extractelts, we
5318-
// followed all other nodes by chain and glue accesses.
5319-
if (SDValue OldInChain = ProxyReg->getOperand(0); OldInChain.getNode() != N)
5332+
for (unsigned ProxyI = 0, ProxyE = ProxyRegs.size(); ProxyI != ProxyE;
5333+
++ProxyI) {
5334+
SDNode *ProxyReg = ProxyRegs[ProxyI];
5335+
5336+
// no proxy reg might mean this result is unused
5337+
if (!ProxyReg)
5338+
continue;
5339+
5340+
// Update the new chain and glue to be old inputs to the proxyreg, if they
5341+
// came from an intervening instruction between this proxyreg and the
5342+
// original load (ex: callseq_end). Other than bitcasts and extractelts,
5343+
// we followed all other nodes by chain and glue accesses.
5344+
if (SDValue OldInChain = ProxyReg->getOperand(0);
5345+
OldInChain.getNode() != N)
53205346
NewChain = OldInChain;
5321-
if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
5347+
if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
53225348
NewGlue = OldInGlue;
53235349

5324-
// update OldChain, OldGlue to the outputs of ProxyReg, which we will
5325-
// replace later
5326-
OldChain = SDValue(ProxyReg, 1);
5327-
OldGlue = SDValue(ProxyReg, 2);
5328-
5329-
// generate the scalar proxy regs
5330-
for (unsigned I = 0, E = NumElts; I != E; ++I) {
5331-
SDValue ProxyRegElem =
5332-
DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(ProxyReg),
5333-
DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
5334-
{NewChain, NewLoad.getValue(I), NewGlue});
5335-
NewChain = ProxyRegElem.getValue(1);
5336-
NewGlue = ProxyRegElem.getValue(2);
5337-
NewOutputsF32.push_back(ProxyRegElem);
5350+
// update OldChain, OldGlue to the outputs of ProxyReg, which we will
5351+
// replace later
5352+
OldChain = SDValue(ProxyReg, 1);
5353+
OldGlue = SDValue(ProxyReg, 2);
5354+
5355+
// generate the scalar proxy regs
5356+
for (unsigned I = 0, E = 2; I != E; ++I) {
5357+
SDValue ProxyRegElem = DCI.DAG.getNode(
5358+
NVPTXISD::ProxyReg, SDLoc(ProxyReg),
5359+
DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
5360+
{NewChain, NewLoad.getValue(ProxyI * 2 + I), NewGlue});
5361+
NewChain = ProxyRegElem.getValue(1);
5362+
NewGlue = ProxyRegElem.getValue(2);
5363+
NewOutputsF32.push_back(ProxyRegElem);
5364+
}
5365+
5366+
// replace all uses of the glue and chain from the old proxy reg
5367+
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5368+
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
53385369
}
53395370
} else {
53405371
for (unsigned I = 0, E = NumElts; I != E; ++I)
53415372
if (NewLoad->getValueType(I) == MVT::f32)
53425373
NewOutputsF32.push_back(NewLoad.getValue(I));
5374+
5375+
// replace all glue and chain nodes
5376+
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5377+
if (OldGlue)
5378+
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
53435379
}
53445380

5345-
// now, for all extractelts, replace them with one of the new outputs
5381+
// replace all extractelts with the new outputs
53465382
for (auto &[Extract, Index] : ExtractElts)
53475383
DCI.CombineTo(Extract, NewOutputsF32[Index], false);
53485384

5349-
// now replace all glue and chain nodes
5350-
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5351-
if (OldGlue)
5352-
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
5353-
5354-
// cleanup
5355-
if (ProxyReg)
5356-
DCI.recursivelyDeleteUnusedNodes(ProxyReg);
53575385
return SDValue();
53585386
}
53595387

llvm/test/CodeGen/NVPTX/aggregate-return.ll

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,9 +27,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
2727
; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12];
2828
store <3 x float> %call, ptr %output, align 8
2929
; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8],
30-
; -- This is suboptimal. We should do st.v2.f32 instead
31-
; of combining 2xf32 info i64.
32-
; CHECK-DAG: st.u64 [{{%rd[0-9]}}],
30+
; CHECK-DAG: st.v2.f32 [{{%rd[0-9]}}], {[[E0]], [[E1]]}
3331
; CHECK: ret;
3432
ret void
3533
}

llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Lines changed: 48 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -762,32 +762,32 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
762762
; SM70-NEXT: // %bb.0:
763763
; SM70-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
764764
; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
765-
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1;
766-
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
767-
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3;
768-
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4;
769-
; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
765+
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4;
766+
; SM70-NEXT: cvt.u32.u16 %r5, %rs2;
770767
; SM70-NEXT: shl.b32 %r6, %r5, 16;
771768
; SM70-NEXT: mov.b32 %f1, %r6;
772-
; SM70-NEXT: cvt.u32.u16 %r7, %rs7;
769+
; SM70-NEXT: cvt.u32.u16 %r7, %rs1;
773770
; SM70-NEXT: shl.b32 %r8, %r7, 16;
774771
; SM70-NEXT: mov.b32 %f2, %r8;
775-
; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
772+
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r3;
773+
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
776774
; SM70-NEXT: shl.b32 %r10, %r9, 16;
777775
; SM70-NEXT: mov.b32 %f3, %r10;
778-
; SM70-NEXT: cvt.u32.u16 %r11, %rs5;
776+
; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
779777
; SM70-NEXT: shl.b32 %r12, %r11, 16;
780778
; SM70-NEXT: mov.b32 %f4, %r12;
781-
; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
779+
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r2;
780+
; SM70-NEXT: cvt.u32.u16 %r13, %rs6;
782781
; SM70-NEXT: shl.b32 %r14, %r13, 16;
783782
; SM70-NEXT: mov.b32 %f5, %r14;
784-
; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
783+
; SM70-NEXT: cvt.u32.u16 %r15, %rs5;
785784
; SM70-NEXT: shl.b32 %r16, %r15, 16;
786785
; SM70-NEXT: mov.b32 %f6, %r16;
787-
; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
786+
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1;
787+
; SM70-NEXT: cvt.u32.u16 %r17, %rs8;
788788
; SM70-NEXT: shl.b32 %r18, %r17, 16;
789789
; SM70-NEXT: mov.b32 %f7, %r18;
790-
; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
790+
; SM70-NEXT: cvt.u32.u16 %r19, %rs7;
791791
; SM70-NEXT: shl.b32 %r20, %r19, 16;
792792
; SM70-NEXT: mov.b32 %f8, %r20;
793793
; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
@@ -804,18 +804,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
804804
; SM80-NEXT: // %bb.0:
805805
; SM80-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
806806
; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
807-
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1;
808-
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
809-
; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r3;
810-
; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r4;
811-
; SM80-NEXT: cvt.f32.bf16 %f1, %rs8;
812-
; SM80-NEXT: cvt.f32.bf16 %f2, %rs7;
813-
; SM80-NEXT: cvt.f32.bf16 %f3, %rs6;
814-
; SM80-NEXT: cvt.f32.bf16 %f4, %rs5;
815-
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
816-
; SM80-NEXT: cvt.f32.bf16 %f6, %rs3;
817-
; SM80-NEXT: cvt.f32.bf16 %f7, %rs2;
818-
; SM80-NEXT: cvt.f32.bf16 %f8, %rs1;
807+
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
808+
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
809+
; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
810+
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r3;
811+
; SM80-NEXT: cvt.f32.bf16 %f3, %rs4;
812+
; SM80-NEXT: cvt.f32.bf16 %f4, %rs3;
813+
; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r2;
814+
; SM80-NEXT: cvt.f32.bf16 %f5, %rs6;
815+
; SM80-NEXT: cvt.f32.bf16 %f6, %rs5;
816+
; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r1;
817+
; SM80-NEXT: cvt.f32.bf16 %f7, %rs8;
818+
; SM80-NEXT: cvt.f32.bf16 %f8, %rs7;
819819
; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
820820
; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
821821
; SM80-NEXT: ret;
@@ -830,18 +830,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
830830
; SM80-FTZ-NEXT: // %bb.0:
831831
; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
832832
; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
833-
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1;
834-
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2;
835-
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r3;
836-
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r4;
837-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs8;
838-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs7;
839-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs6;
840-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs5;
841-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
842-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs3;
843-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2;
844-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1;
833+
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
834+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
835+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1;
836+
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r3;
837+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs4;
838+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs3;
839+
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r2;
840+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs6;
841+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs5;
842+
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r1;
843+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs8;
844+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs7;
845845
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
846846
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
847847
; SM80-FTZ-NEXT: ret;
@@ -856,18 +856,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
856856
; SM90-NEXT: // %bb.0:
857857
; SM90-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
858858
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
859-
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1;
860-
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2;
861-
; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r3;
862-
; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r4;
863-
; SM90-NEXT: cvt.f32.bf16 %f1, %rs8;
864-
; SM90-NEXT: cvt.f32.bf16 %f2, %rs7;
865-
; SM90-NEXT: cvt.f32.bf16 %f3, %rs6;
866-
; SM90-NEXT: cvt.f32.bf16 %f4, %rs5;
867-
; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
868-
; SM90-NEXT: cvt.f32.bf16 %f6, %rs3;
869-
; SM90-NEXT: cvt.f32.bf16 %f7, %rs2;
870-
; SM90-NEXT: cvt.f32.bf16 %f8, %rs1;
859+
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4;
860+
; SM90-NEXT: cvt.f32.bf16 %f1, %rs2;
861+
; SM90-NEXT: cvt.f32.bf16 %f2, %rs1;
862+
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r3;
863+
; SM90-NEXT: cvt.f32.bf16 %f3, %rs4;
864+
; SM90-NEXT: cvt.f32.bf16 %f4, %rs3;
865+
; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r2;
866+
; SM90-NEXT: cvt.f32.bf16 %f5, %rs6;
867+
; SM90-NEXT: cvt.f32.bf16 %f6, %rs5;
868+
; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r1;
869+
; SM90-NEXT: cvt.f32.bf16 %f7, %rs8;
870+
; SM90-NEXT: cvt.f32.bf16 %f8, %rs7;
871871
; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
872872
; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
873873
; SM90-NEXT: ret;

0 commit comments

Comments (0)