Skip to content

Commit 00f5b69

Browse files
committed
[NVPTX] update combiner rule for more types of loads
Handle more loads, including ones with multiple proxy registers:
- i64 = LOAD
- i64 = LoadParam
- v2f32,v2f32 = LoadParamV2

Also update the test cases. Because this is an optimization, it is not triggered for some of these tests that compile with no optimizations.
1 parent 69e590e commit 00f5b69

File tree

7 files changed

+191
-151
lines changed

7 files changed

+191
-151
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 79 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4958,9 +4958,13 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
49584958
return SDValue();
49594959
}
49604960

4961+
/// OverrideVT - allows overriding result and memory type
49614962
static std::optional<std::pair<SDValue, SDValue>>
4962-
convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
4963+
convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector,
4964+
std::optional<EVT> OverrideVT = std::nullopt) {
49634965
EVT ResVT = N->getValueType(0);
4966+
if (OverrideVT)
4967+
ResVT = *OverrideVT;
49644968
SDLoc DL(N);
49654969

49664970
assert(ResVT.isVector() && "Vector load must have vector type");
@@ -4974,8 +4978,8 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
49744978

49754979
Align Alignment = LD->getAlign();
49764980
auto &TD = DAG.getDataLayout();
4977-
Align PrefAlign =
4978-
TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
4981+
Align PrefAlign = TD.getPrefTypeAlign(
4982+
OverrideVT.value_or(LD->getMemoryVT()).getTypeForEVT(*DAG.getContext()));
49794983
if (Alignment < PrefAlign) {
49804984
// This load is not sufficiently aligned, so bail out and let this vector
49814985
// load be scalarized. Note that we may still be able to emit smaller
@@ -5020,7 +5024,8 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
50205024
OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
50215025

50225026
SDValue NewLD = DAG.getMemIntrinsicNode(
5023-
Opcode, DL, LdResVTs, OtherOps, LD->getMemoryVT(), LD->getMemOperand());
5027+
Opcode, DL, LdResVTs, OtherOps, OverrideVT.value_or(LD->getMemoryVT()),
5028+
LD->getMemOperand());
50245029

50255030
SDValue LoadChain = NewLD.getValue(NumElts);
50265031

@@ -5059,23 +5064,20 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
50595064
static SDValue PerformLoadCombine(SDNode *N,
50605065
TargetLowering::DAGCombinerInfo &DCI) {
50615066
auto *MemN = cast<MemSDNode>(N);
5062-
EVT MemVT = MemN->getMemoryVT();
5063-
5064-
// ignore volatile loads
5065-
if (MemN->isVolatile())
5066-
return SDValue();
5067-
50685067
// only operate on scalar i64s and vectors of f32s / i64s
5069-
if (!MemVT.isVector())
5068+
if (EVT MemVT = MemN->getMemoryVT();
5069+
!(MemVT == MVT::i64 ||
5070+
(MemVT.isVector() && (MemVT.getVectorElementType() == MVT::f32 ||
5071+
MemVT.getVectorElementType() == MVT::i64))))
50705072
return SDValue();
50715073

5072-
EVT ElementVT = MemVT.getVectorElementType();
5073-
if (!(ElementVT == MVT::f32 ||
5074-
(ElementVT == MVT::i64 && N->getOpcode() != ISD::LOAD)))
5075-
return SDValue();
5074+
const unsigned OrigNumResults =
5075+
llvm::count_if(N->values(), [](const auto &VT) {
5076+
return VT == MVT::i64 || VT == MVT::f32 || VT.isVector();
5077+
});
50765078

50775079
SmallDenseMap<SDNode *, unsigned> ExtractElts;
5078-
SDNode *ProxyReg = nullptr;
5080+
SmallVector<SDNode *> ProxyRegs(OrigNumResults, nullptr);
50795081
SmallVector<std::pair<SDNode *, unsigned /*offset*/>> WorkList{{N, 0}};
50805082
while (!WorkList.empty()) {
50815083
auto [V, Offset] = WorkList.pop_back_val();
@@ -5088,8 +5090,14 @@ static SDValue PerformLoadCombine(SDNode *N,
50885090

50895091
SDNode *User = U.getUser();
50905092
if (User->getOpcode() == NVPTXISD::ProxyReg) {
5093+
Offset = U.getResNo() * 2;
5094+
SDNode *&ProxyReg = ProxyRegs[Offset / 2];
5095+
5096+
// We shouldn't have multiple proxy regs for the same value from the
5097+
// load, but bail out anyway since we don't handle this.
50915098
if (ProxyReg)
5092-
return SDValue(); // bail out if we've seen a proxy reg?
5099+
return SDValue();
5100+
50935101
ProxyReg = User;
50945102
} else if (User->getOpcode() == ISD::BITCAST &&
50955103
User->getValueType(0) == MVT::v2f32 &&
@@ -5179,9 +5187,18 @@ static SDValue PerformLoadCombine(SDNode *N,
51795187
if (NewGlueIdx)
51805188
NewGlue = NewLoad.getValue(*NewGlueIdx);
51815189
} else if (N->getOpcode() == ISD::LOAD) { // rewrite a load
5182-
if (auto Result = convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false)) {
5190+
std::optional<EVT> CastToType;
5191+
EVT ResVT = N->getValueType(0);
5192+
if (ResVT == MVT::i64) {
5193+
// ld.b64 is treated as a vector by subsequent code
5194+
CastToType = MVT::v2f32;
5195+
}
5196+
if (auto Result =
5197+
convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false, CastToType)) {
51835198
std::tie(NewLoad, NewChain) = *Result;
5184-
NumElts = MemVT.getVectorNumElements();
5199+
NumElts =
5200+
CastToType.value_or(cast<MemSDNode>(NewLoad.getNode())->getMemoryVT())
5201+
.getVectorNumElements();
51855202
if (NewLoad->getValueType(NewLoad->getNumValues() - 1) == MVT::Glue)
51865203
NewGlue = NewLoad.getValue(NewLoad->getNumValues() - 1);
51875204
}
@@ -5193,54 +5210,65 @@ static SDValue PerformLoadCombine(SDNode *N,
51935210
// (3) begin rewriting uses
51945211
SmallVector<SDValue> NewOutputsF32;
51955212

5196-
if (ProxyReg) {
5197-
// scalarize proxyreg, but first rewrite all uses of chain and glue from the
5198-
// old load to the new load
5213+
if (llvm::any_of(ProxyRegs, [](const SDNode *PR) { return PR != nullptr; })) {
5214+
// scalarize proxy regs, but first rewrite all uses of chain and glue from
5215+
// the old load to the new load
51995216
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
52005217
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
52015218

5202-
// Update the new chain and glue to be old inputs to the proxyreg, if they
5203-
// came from an intervening instruction between this proxyreg and the
5204-
// original load (ex: callseq_end). Other than bitcasts and extractelts, we
5205-
// followed all other nodes by chain and glue accesses.
5206-
if (SDValue OldInChain = ProxyReg->getOperand(0); OldInChain.getNode() != N)
5219+
for (unsigned ProxyI = 0, ProxyE = ProxyRegs.size(); ProxyI != ProxyE;
5220+
++ProxyI) {
5221+
SDNode *ProxyReg = ProxyRegs[ProxyI];
5222+
5223+
// no proxy reg might mean this result is unused
5224+
if (!ProxyReg)
5225+
continue;
5226+
5227+
// Update the new chain and glue to be old inputs to the proxyreg, if they
5228+
// came from an intervening instruction between this proxyreg and the
5229+
// original load (ex: callseq_end). Other than bitcasts and extractelts,
5230+
// we followed all other nodes by chain and glue accesses.
5231+
if (SDValue OldInChain = ProxyReg->getOperand(0);
5232+
OldInChain.getNode() != N)
52075233
NewChain = OldInChain;
5208-
if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
5234+
if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
52095235
NewGlue = OldInGlue;
52105236

5211-
// update OldChain, OldGlue to the outputs of ProxyReg, which we will
5212-
// replace later
5213-
OldChain = SDValue(ProxyReg, 1);
5214-
OldGlue = SDValue(ProxyReg, 2);
5215-
5216-
// generate the scalar proxy regs
5217-
for (unsigned I = 0, E = NumElts; I != E; ++I) {
5218-
SDValue ProxyRegElem =
5219-
DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(ProxyReg),
5220-
DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
5221-
{NewChain, NewLoad.getValue(I), NewGlue});
5222-
NewChain = ProxyRegElem.getValue(1);
5223-
NewGlue = ProxyRegElem.getValue(2);
5224-
NewOutputsF32.push_back(ProxyRegElem);
5237+
// update OldChain, OldGlue to the outputs of ProxyReg, which we will
5238+
// replace later
5239+
OldChain = SDValue(ProxyReg, 1);
5240+
OldGlue = SDValue(ProxyReg, 2);
5241+
5242+
// generate the scalar proxy regs
5243+
for (unsigned I = 0, E = 2; I != E; ++I) {
5244+
SDValue ProxyRegElem = DCI.DAG.getNode(
5245+
NVPTXISD::ProxyReg, SDLoc(ProxyReg),
5246+
DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
5247+
{NewChain, NewLoad.getValue(ProxyI * 2 + I), NewGlue});
5248+
NewChain = ProxyRegElem.getValue(1);
5249+
NewGlue = ProxyRegElem.getValue(2);
5250+
NewOutputsF32.push_back(ProxyRegElem);
5251+
}
5252+
5253+
// replace all uses of the glue and chain from the old proxy reg
5254+
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5255+
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
52255256
}
52265257
} else {
52275258
for (unsigned I = 0, E = NumElts; I != E; ++I)
52285259
if (NewLoad->getValueType(I) == MVT::f32)
52295260
NewOutputsF32.push_back(NewLoad.getValue(I));
5261+
5262+
// replace all glue and chain nodes
5263+
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5264+
if (OldGlue)
5265+
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
52305266
}
52315267

5232-
// now, for all extractelts, replace them with one of the new outputs
5268+
// replace all extractelts with the new outputs
52335269
for (auto &[Extract, Index] : ExtractElts)
52345270
DCI.CombineTo(Extract, NewOutputsF32[Index], false);
52355271

5236-
// now replace all glue and chain nodes
5237-
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
5238-
if (OldGlue)
5239-
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
5240-
5241-
// cleanup
5242-
if (ProxyReg)
5243-
DCI.recursivelyDeleteUnusedNodes(ProxyReg);
52445272
return SDValue();
52455273
}
52465274

llvm/test/CodeGen/NVPTX/aggregate-return.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
2727
; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12];
2828
store <3 x float> %call, ptr %output, align 8
2929
; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8],
30-
; -- This is suboptimal. We should do st.v2.f32 instead
31-
; of combining 2xf32 info i64.
32-
; CHECK-DAG: st.u64 [{{%rd[0-9]}}],
30+
; CHECK-DAG: st.v2.f32 [{{%rd[0-9]}}], {[[E0]], [[E1]]}
3331
; CHECK: ret;
3432
ret void
3533
}

llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Lines changed: 48 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -762,32 +762,32 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
762762
; SM70-NEXT: // %bb.0:
763763
; SM70-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
764764
; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
765-
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1;
766-
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
767-
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3;
768-
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4;
769-
; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
765+
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4;
766+
; SM70-NEXT: cvt.u32.u16 %r5, %rs2;
770767
; SM70-NEXT: shl.b32 %r6, %r5, 16;
771768
; SM70-NEXT: mov.b32 %f1, %r6;
772-
; SM70-NEXT: cvt.u32.u16 %r7, %rs7;
769+
; SM70-NEXT: cvt.u32.u16 %r7, %rs1;
773770
; SM70-NEXT: shl.b32 %r8, %r7, 16;
774771
; SM70-NEXT: mov.b32 %f2, %r8;
775-
; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
772+
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r3;
773+
; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
776774
; SM70-NEXT: shl.b32 %r10, %r9, 16;
777775
; SM70-NEXT: mov.b32 %f3, %r10;
778-
; SM70-NEXT: cvt.u32.u16 %r11, %rs5;
776+
; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
779777
; SM70-NEXT: shl.b32 %r12, %r11, 16;
780778
; SM70-NEXT: mov.b32 %f4, %r12;
781-
; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
779+
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r2;
780+
; SM70-NEXT: cvt.u32.u16 %r13, %rs6;
782781
; SM70-NEXT: shl.b32 %r14, %r13, 16;
783782
; SM70-NEXT: mov.b32 %f5, %r14;
784-
; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
783+
; SM70-NEXT: cvt.u32.u16 %r15, %rs5;
785784
; SM70-NEXT: shl.b32 %r16, %r15, 16;
786785
; SM70-NEXT: mov.b32 %f6, %r16;
787-
; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
786+
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1;
787+
; SM70-NEXT: cvt.u32.u16 %r17, %rs8;
788788
; SM70-NEXT: shl.b32 %r18, %r17, 16;
789789
; SM70-NEXT: mov.b32 %f7, %r18;
790-
; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
790+
; SM70-NEXT: cvt.u32.u16 %r19, %rs7;
791791
; SM70-NEXT: shl.b32 %r20, %r19, 16;
792792
; SM70-NEXT: mov.b32 %f8, %r20;
793793
; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
@@ -804,18 +804,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
804804
; SM80-NEXT: // %bb.0:
805805
; SM80-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
806806
; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
807-
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1;
808-
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
809-
; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r3;
810-
; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r4;
811-
; SM80-NEXT: cvt.f32.bf16 %f1, %rs8;
812-
; SM80-NEXT: cvt.f32.bf16 %f2, %rs7;
813-
; SM80-NEXT: cvt.f32.bf16 %f3, %rs6;
814-
; SM80-NEXT: cvt.f32.bf16 %f4, %rs5;
815-
; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
816-
; SM80-NEXT: cvt.f32.bf16 %f6, %rs3;
817-
; SM80-NEXT: cvt.f32.bf16 %f7, %rs2;
818-
; SM80-NEXT: cvt.f32.bf16 %f8, %rs1;
807+
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
808+
; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
809+
; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
810+
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r3;
811+
; SM80-NEXT: cvt.f32.bf16 %f3, %rs4;
812+
; SM80-NEXT: cvt.f32.bf16 %f4, %rs3;
813+
; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r2;
814+
; SM80-NEXT: cvt.f32.bf16 %f5, %rs6;
815+
; SM80-NEXT: cvt.f32.bf16 %f6, %rs5;
816+
; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r1;
817+
; SM80-NEXT: cvt.f32.bf16 %f7, %rs8;
818+
; SM80-NEXT: cvt.f32.bf16 %f8, %rs7;
819819
; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
820820
; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
821821
; SM80-NEXT: ret;
@@ -830,18 +830,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
830830
; SM80-FTZ-NEXT: // %bb.0:
831831
; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
832832
; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
833-
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1;
834-
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2;
835-
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r3;
836-
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r4;
837-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs8;
838-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs7;
839-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs6;
840-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs5;
841-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
842-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs3;
843-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2;
844-
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1;
833+
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
834+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
835+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1;
836+
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r3;
837+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs4;
838+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs3;
839+
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r2;
840+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs6;
841+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs5;
842+
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r1;
843+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs8;
844+
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs7;
845845
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
846846
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
847847
; SM80-FTZ-NEXT: ret;
@@ -856,18 +856,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
856856
; SM90-NEXT: // %bb.0:
857857
; SM90-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
858858
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
859-
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1;
860-
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2;
861-
; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r3;
862-
; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r4;
863-
; SM90-NEXT: cvt.f32.bf16 %f1, %rs8;
864-
; SM90-NEXT: cvt.f32.bf16 %f2, %rs7;
865-
; SM90-NEXT: cvt.f32.bf16 %f3, %rs6;
866-
; SM90-NEXT: cvt.f32.bf16 %f4, %rs5;
867-
; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
868-
; SM90-NEXT: cvt.f32.bf16 %f6, %rs3;
869-
; SM90-NEXT: cvt.f32.bf16 %f7, %rs2;
870-
; SM90-NEXT: cvt.f32.bf16 %f8, %rs1;
859+
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4;
860+
; SM90-NEXT: cvt.f32.bf16 %f1, %rs2;
861+
; SM90-NEXT: cvt.f32.bf16 %f2, %rs1;
862+
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r3;
863+
; SM90-NEXT: cvt.f32.bf16 %f3, %rs4;
864+
; SM90-NEXT: cvt.f32.bf16 %f4, %rs3;
865+
; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r2;
866+
; SM90-NEXT: cvt.f32.bf16 %f5, %rs6;
867+
; SM90-NEXT: cvt.f32.bf16 %f6, %rs5;
868+
; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r1;
869+
; SM90-NEXT: cvt.f32.bf16 %f7, %rs8;
870+
; SM90-NEXT: cvt.f32.bf16 %f8, %rs7;
871871
; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
872872
; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
873873
; SM90-NEXT: ret;

0 commit comments

Comments
 (0)