Skip to content

Commit 088391c

Browse files
committed
[NVPTX] update how loads are optimized and disable on O0
1 parent 94a88a6 commit 088391c

File tree

4 files changed

+113
-29
lines changed

4 files changed

+113
-29
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,11 +1189,25 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
11891189

11901190
unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
11911191

1192+
LLVM_DEBUG({
1193+
dbgs() << "tryLoadVector on " << TLI->getTargetNodeName(N->getOpcode())
1194+
<< ":\n";
1195+
dbgs() << " load type: " << MemVT << "\n";
1196+
dbgs() << " total load width: " << TotalWidth << " bits\n";
1197+
dbgs() << " from type width: " << FromTypeWidth << " bits\n";
1198+
dbgs() << " element type: " << EltVT << "\n";
1199+
});
1200+
11921201
if (isSubVectorPackedInInteger(EltVT)) {
11931202
assert(ExtensionType == ISD::NON_EXTLOAD);
11941203
FromTypeWidth = EltVT.getSizeInBits();
11951204
EltVT = MVT::getIntegerVT(FromTypeWidth);
11961205
FromType = NVPTX::PTXLdStInstCode::Untyped;
1206+
LLVM_DEBUG({
1207+
dbgs() << " packed integers detected:\n";
1208+
dbgs() << " from type width: " << FromTypeWidth << " (new)\n";
1209+
dbgs() << " element type: " << EltVT << " (new)\n";
1210+
});
11971211
}
11981212

11991213
assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
@@ -1501,9 +1515,23 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
15011515
SDValue N2 = N->getOperand(NumElts + 1);
15021516
unsigned ToTypeWidth = TotalWidth / NumElts;
15031517

1518+
LLVM_DEBUG({
1519+
dbgs() << "tryStoreVector on " << TLI->getTargetNodeName(N->getOpcode())
1520+
<< ":\n";
1521+
dbgs() << " store type: " << StoreVT << "\n";
1522+
dbgs() << " total store width: " << TotalWidth << " bits\n";
1523+
dbgs() << " to type width: " << ToTypeWidth << " bits\n";
1524+
dbgs() << " element type: " << EltVT << "\n";
1525+
});
1526+
15041527
if (isSubVectorPackedInInteger(EltVT)) {
15051528
ToTypeWidth = EltVT.getSizeInBits();
15061529
EltVT = MVT::getIntegerVT(ToTypeWidth);
1530+
LLVM_DEBUG({
1531+
dbgs() << " packed integers detected:\n";
1532+
dbgs() << " to type width: " << ToTypeWidth << " (new)\n";
1533+
dbgs() << " element type: " << EltVT << " (new)\n";
1534+
});
15071535
}
15081536

15091537
assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5323,9 +5323,20 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI,
53235323
return {{NewLD, LoadChain}};
53245324
}
53255325

5326+
/// Returns a MachineMemOperand for a memory access whose memory type is \p VT.
///
/// Forwards to MachineFunction::getMachineMemOperand on the function owned by
/// \p DAG, passing the existing operand \p MMO, the pointer info
/// \p PointerInfo, and LLT(VT) as the memory type.
/// NOTE(review): presumably the result keeps MMO's remaining properties
/// (flags, alignment, ...) and only swaps pointer info and type -- confirm
/// against the MachineFunction overload being called.
static MachineMemOperand *
5327+
getMachineMemOperandForType(const SelectionDAG &DAG,
5328+
const MachineMemOperand *MMO,
5329+
const MachinePointerInfo &PointerInfo, MVT VT) {
5330+
return DAG.getMachineFunction().getMachineMemOperand(MMO, PointerInfo,
5331+
LLT(VT));
5332+
}
5333+
53265334
static SDValue PerformLoadCombine(SDNode *N,
53275335
TargetLowering::DAGCombinerInfo &DCI,
53285336
const NVPTXSubtarget &STI) {
5337+
if (DCI.DAG.getOptLevel() == CodeGenOptLevel::None)
5338+
return {};
5339+
53295340
auto *MemN = cast<MemSDNode>(N);
53305341
// only operate on vectors of f32s / i64s
53315342
if (EVT MemVT = MemN->getMemoryVT();
@@ -5406,9 +5417,13 @@ static SDValue PerformLoadCombine(SDNode *N,
54065417
// Do we have to tweak the opcode for an NVPTXISD::Load* or do we have to
54075418
// rewrite an ISD::LOAD?
54085419
std::optional<NVPTXISD::NodeType> NewOpcode;
5420+
5421+
// LoadV's are handled slightly differently in ISelDAGToDAG.
5422+
bool IsLoadV = false;
54095423
switch (N->getOpcode()) {
54105424
case NVPTXISD::LoadV2:
54115425
NewOpcode = NVPTXISD::LoadV4;
5426+
IsLoadV = true;
54125427
break;
54135428
case NVPTXISD::LoadParam:
54145429
NewOpcode = NVPTXISD::LoadParamV2;
@@ -5449,9 +5464,22 @@ static SDValue PerformLoadCombine(SDNode *N,
54495464
}
54505465
}
54515466

5467+
MVT LoadVT = MVT::f32;
5468+
MachineMemOperand *MMO = MemN->getMemOperand();
5469+
5470+
if (IsLoadV) {
5471+
// Some loads must have an operand type that matches the number of results
5472+
// and the type of each result. Because we changed a vNi64 to v(N*2)f32 we
5473+
// have to update it here. Note that LoadParam is not handled the same way
5474+
// in NVPTXISelDAGToDAG so we only do this for LoadV*.
5475+
LoadVT = MVT::getVectorVT(MVT::f32, NumElts);
5476+
MMO = getMachineMemOperandForType(DCI.DAG, MMO, MemN->getPointerInfo(),
5477+
LoadVT);
5478+
}
5479+
54525480
NewLoad = DCI.DAG.getMemIntrinsicNode(
54535481
*NewOpcode, SDLoc(N), DCI.DAG.getVTList(VTs),
5454-
SmallVector<SDValue>(N->ops()), MVT::f32, MemN->getMemOperand());
5482+
SmallVector<SDValue>(N->ops()), LoadVT, MMO);
54555483
NewChain = NewLoad.getValue(*NewChainIdx);
54565484
if (NewGlueIdx)
54575485
NewGlue = NewLoad.getValue(*NewGlueIdx);
@@ -5550,6 +5578,9 @@ static SDValue PerformStoreCombineHelper(SDNode *N,
55505578
// as the previous value will become unused and eliminated later.
55515579
return N->getOperand(0);
55525580

5581+
if (DCI.DAG.getOptLevel() == CodeGenOptLevel::None)
5582+
return {};
5583+
55535584
auto *MemN = cast<MemSDNode>(N);
55545585
if (MemN->getMemoryVT() == MVT::v2f32) {
55555586
// try to fold, and expand:
@@ -5581,13 +5612,15 @@ static SDValue PerformStoreCombineHelper(SDNode *N,
55815612
if (NewOpcode) {
55825613
// copy chain, offset from existing store
55835614
SmallVector<SDValue> NewOps = {N->getOperand(0), N->getOperand(1)};
5615+
unsigned NumElts = 0;
55845616
// gather all operands to expand
55855617
for (unsigned I = 2, E = N->getNumOperands(); I < E; ++I) {
55865618
SDValue CurrentOp = N->getOperand(I);
55875619
if (CurrentOp->getOpcode() == ISD::BUILD_VECTOR) {
55885620
assert(CurrentOp.getValueType() == MVT::v2f32);
55895621
NewOps.push_back(CurrentOp.getOperand(0));
55905622
NewOps.push_back(CurrentOp.getOperand(1));
5623+
NumElts += 2;
55915624
} else {
55925625
NewOps.clear();
55935626
break;

llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,7 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
614614
; CHECK-F16: {
615615
; CHECK-F16-NEXT: .reg .pred %p<3>;
616616
; CHECK-F16-NEXT: .reg .b32 %r<9>;
617-
; CHECK-F16-NEXT: .reg .b64 %rd<3>;
617+
; CHECK-F16-NEXT: .reg .b64 %rd<4>;
618618
; CHECK-F16-EMPTY:
619619
; CHECK-F16-NEXT: // %bb.0:
620620
; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
@@ -626,15 +626,16 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
626626
; CHECK-F16-NEXT: mov.b64 {%r5, %r6}, %rd1;
627627
; CHECK-F16-NEXT: selp.f32 %r7, %r6, %r4, %p2;
628628
; CHECK-F16-NEXT: selp.f32 %r8, %r5, %r3, %p1;
629-
; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
629+
; CHECK-F16-NEXT: mov.b64 %rd3, {%r8, %r7};
630+
; CHECK-F16-NEXT: st.param.b64 [func_retval0], %rd3;
630631
; CHECK-F16-NEXT: ret;
631632
;
632633
; CHECK-NOF16-LABEL: test_select_cc_f32_f16(
633634
; CHECK-NOF16: {
634635
; CHECK-NOF16-NEXT: .reg .pred %p<3>;
635636
; CHECK-NOF16-NEXT: .reg .b16 %rs<5>;
636637
; CHECK-NOF16-NEXT: .reg .b32 %r<13>;
637-
; CHECK-NOF16-NEXT: .reg .b64 %rd<3>;
638+
; CHECK-NOF16-NEXT: .reg .b64 %rd<4>;
638639
; CHECK-NOF16-EMPTY:
639640
; CHECK-NOF16-NEXT: // %bb.0:
640641
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
@@ -653,7 +654,8 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
653654
; CHECK-NOF16-NEXT: mov.b64 {%r9, %r10}, %rd1;
654655
; CHECK-NOF16-NEXT: selp.f32 %r11, %r10, %r8, %p2;
655656
; CHECK-NOF16-NEXT: selp.f32 %r12, %r9, %r7, %p1;
656-
; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11};
657+
; CHECK-NOF16-NEXT: mov.b64 %rd3, {%r12, %r11};
658+
; CHECK-NOF16-NEXT: st.param.b64 [func_retval0], %rd3;
657659
; CHECK-NOF16-NEXT: ret;
658660
<2 x half> %c, <2 x half> %d) #0 {
659661
%cc = fcmp une <2 x half> %c, %d
@@ -1563,13 +1565,15 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
15631565
; CHECK: {
15641566
; CHECK-NEXT: .reg .b16 %rs<3>;
15651567
; CHECK-NEXT: .reg .b32 %r<4>;
1568+
; CHECK-NEXT: .reg .b64 %rd<2>;
15661569
; CHECK-EMPTY:
15671570
; CHECK-NEXT: // %bb.0:
15681571
; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xfloat_param_0];
15691572
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
15701573
; CHECK-NEXT: cvt.f32.f16 %r2, %rs2;
15711574
; CHECK-NEXT: cvt.f32.f16 %r3, %rs1;
1572-
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2};
1575+
; CHECK-NEXT: mov.b64 %rd1, {%r3, %r2};
1576+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
15731577
; CHECK-NEXT: ret;
15741578
%r = fpext <2 x half> %a to <2 x float>
15751579
ret <2 x float> %r
@@ -2054,6 +2058,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
20542058
; CHECK-F16: {
20552059
; CHECK-F16-NEXT: .reg .b16 %rs<3>;
20562060
; CHECK-F16-NEXT: .reg .b32 %r<8>;
2061+
; CHECK-F16-NEXT: .reg .b64 %rd<2>;
20572062
; CHECK-F16-EMPTY:
20582063
; CHECK-F16-NEXT: // %bb.0:
20592064
; CHECK-F16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1];
@@ -2064,13 +2069,15 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
20642069
; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r5;
20652070
; CHECK-F16-NEXT: cvt.f32.f16 %r6, %rs2;
20662071
; CHECK-F16-NEXT: cvt.f32.f16 %r7, %rs1;
2067-
; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r7, %r6};
2072+
; CHECK-F16-NEXT: mov.b64 %rd1, {%r7, %r6};
2073+
; CHECK-F16-NEXT: st.param.b64 [func_retval0], %rd1;
20682074
; CHECK-F16-NEXT: ret;
20692075
;
20702076
; CHECK-NOF16-LABEL: test_copysign_extended(
20712077
; CHECK-NOF16: {
20722078
; CHECK-NOF16-NEXT: .reg .b16 %rs<11>;
20732079
; CHECK-NOF16-NEXT: .reg .b32 %r<5>;
2080+
; CHECK-NOF16-NEXT: .reg .b64 %rd<2>;
20742081
; CHECK-NOF16-EMPTY:
20752082
; CHECK-NOF16-NEXT: // %bb.0:
20762083
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1];
@@ -2085,7 +2092,8 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
20852092
; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8;
20862093
; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs10;
20872094
; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs7;
2088-
; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
2095+
; CHECK-NOF16-NEXT: mov.b64 %rd1, {%r4, %r3};
2096+
; CHECK-NOF16-NEXT: st.param.b64 [func_retval0], %rd1;
20892097
; CHECK-NOF16-NEXT: ret;
20902098
%r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
20912099
%xr = fpext <2 x half> %r to <2 x float>

0 commit comments

Comments
 (0)