Skip to content

Commit 06fa116

Browse files
committed
[NVPTX] add combiner rule for expanding StoreRetval vector parameters
Do this to reduce the number of packing mov instructions.
1 parent d37eb0f commit 06fa116

File tree

2 files changed

+80
-44
lines changed

2 files changed

+80
-44
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5071,26 +5071,78 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
50715071
return SDValue();
50725072
}
50735073

5074-
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
5075-
std::size_t Back) {
5074+
static SDValue PerformStoreCombineHelper(SDNode *N,
5075+
TargetLowering::DAGCombinerInfo &DCI,
5076+
std::size_t Front, std::size_t Back) {
50765077
if (all_of(N->ops().drop_front(Front).drop_back(Back),
50775078
[](const SDUse &U) { return U.get()->isUndef(); }))
50785079
// Operand 0 is the previous value in the chain. Cannot return EntryToken
50795080
// as the previous value will become unused and eliminated later.
50805081
return N->getOperand(0);
50815082

5083+
auto *MemN = cast<MemSDNode>(N);
5084+
if (MemN->getMemoryVT() == MVT::v2f32) {
5085+
// try to fold, and expand:
5086+
// c: v2f32 = BUILD_VECTOR (a: f32, b: f32)
5087+
// StoreRetval c
5088+
// -->
5089+
// StoreRetvalV2 {a, b}
5090+
// likewise for V2 -> V4 case
5091+
5092+
std::optional<NVPTXISD::NodeType> NewOpcode;
5093+
switch (N->getOpcode()) {
5094+
case NVPTXISD::StoreParam:
5095+
NewOpcode = NVPTXISD::StoreParamV2;
5096+
break;
5097+
case NVPTXISD::StoreParamV2:
5098+
NewOpcode = NVPTXISD::StoreParamV4;
5099+
break;
5100+
case NVPTXISD::StoreRetval:
5101+
NewOpcode = NVPTXISD::StoreRetvalV2;
5102+
break;
5103+
case NVPTXISD::StoreRetvalV2:
5104+
NewOpcode = NVPTXISD::StoreRetvalV4;
5105+
break;
5106+
}
5107+
5108+
if (NewOpcode) {
5109+
// copy chain, offset from existing store
5110+
SmallVector<SDValue> NewOps = {N->getOperand(0), N->getOperand(1)};
5111+
// gather all operands to expand
5112+
for (unsigned I = 2, E = N->getNumOperands(); I < E; ++I) {
5113+
SDValue CurrentOp = N->getOperand(I);
5114+
if (CurrentOp->getOpcode() == ISD::BUILD_VECTOR) {
5115+
assert(CurrentOp.getValueType() == MVT::v2f32);
5116+
NewOps.push_back(CurrentOp.getNode()->getOperand(0));
5117+
NewOps.push_back(CurrentOp.getNode()->getOperand(1));
5118+
} else {
5119+
NewOps.clear();
5120+
break;
5121+
}
5122+
}
5123+
5124+
if (!NewOps.empty()) {
5125+
return DCI.DAG.getMemIntrinsicNode(*NewOpcode, SDLoc(N), N->getVTList(),
5126+
NewOps, MVT::f32,
5127+
MemN->getMemOperand());
5128+
}
5129+
}
5130+
}
5131+
50825132
return SDValue();
50835133
}
50845134

5085-
static SDValue PerformStoreParamCombine(SDNode *N) {
5135+
static SDValue PerformStoreParamCombine(SDNode *N,
5136+
TargetLowering::DAGCombinerInfo &DCI) {
50865137
// Operands from the 3rd to the 2nd last one are the values to be stored.
50875138
// {Chain, ArgID, Offset, Val, Glue}
5088-
return PerformStoreCombineHelper(N, 3, 1);
5139+
return PerformStoreCombineHelper(N, DCI, 3, 1);
50895140
}
50905141

5091-
static SDValue PerformStoreRetvalCombine(SDNode *N) {
5142+
static SDValue PerformStoreRetvalCombine(SDNode *N,
5143+
TargetLowering::DAGCombinerInfo &DCI) {
50925144
// Operands from the 2nd to the last one are the values to be stored
5093-
return PerformStoreCombineHelper(N, 2, 0);
5145+
return PerformStoreCombineHelper(N, DCI, 2, 0);
50945146
}
50955147

50965148
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
@@ -5801,11 +5853,11 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
58015853
case NVPTXISD::StoreRetval:
58025854
case NVPTXISD::StoreRetvalV2:
58035855
case NVPTXISD::StoreRetvalV4:
5804-
return PerformStoreRetvalCombine(N);
5856+
return PerformStoreRetvalCombine(N, DCI);
58055857
case NVPTXISD::StoreParam:
58065858
case NVPTXISD::StoreParamV2:
58075859
case NVPTXISD::StoreParamV4:
5808-
return PerformStoreParamCombine(N);
5860+
return PerformStoreParamCombine(N, DCI);
58095861
case ISD::EXTRACT_VECTOR_ELT:
58105862
return PerformEXTRACTCombine(N, DCI);
58115863
case ISD::VSELECT:

llvm/test/CodeGen/NVPTX/f32x2-instructions.ll

Lines changed: 20 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,11 @@ define <2 x float> @test_ret_const() #0 {
1616
; CHECK-LABEL: test_ret_const(
1717
; CHECK: {
1818
; CHECK-NEXT: .reg .f32 %f<3>;
19-
; CHECK-NEXT: .reg .b64 %rd<2>;
2019
; CHECK-EMPTY:
2120
; CHECK-NEXT: // %bb.0:
2221
; CHECK-NEXT: mov.f32 %f1, 0f40000000;
2322
; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
24-
; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
25-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
23+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
2624
; CHECK-NEXT: ret;
2725
ret <2 x float> <float 1.0, float 2.0>
2826
}
@@ -243,7 +241,7 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
243241
; CHECK-LABEL: test_fdiv(
244242
; CHECK: {
245243
; CHECK-NEXT: .reg .f32 %f<7>;
246-
; CHECK-NEXT: .reg .b64 %rd<4>;
244+
; CHECK-NEXT: .reg .b64 %rd<3>;
247245
; CHECK-EMPTY:
248246
; CHECK-NEXT: // %bb.0:
249247
; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1];
@@ -252,8 +250,7 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
252250
; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
253251
; CHECK-NEXT: div.rn.f32 %f5, %f4, %f2;
254252
; CHECK-NEXT: div.rn.f32 %f6, %f3, %f1;
255-
; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
256-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
253+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
257254
; CHECK-NEXT: ret;
258255
%r = fdiv <2 x float> %a, %b
259256
ret <2 x float> %r
@@ -264,7 +261,7 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
264261
; CHECK: {
265262
; CHECK-NEXT: .reg .pred %p<3>;
266263
; CHECK-NEXT: .reg .f32 %f<15>;
267-
; CHECK-NEXT: .reg .b64 %rd<4>;
264+
; CHECK-NEXT: .reg .b64 %rd<3>;
268265
; CHECK-EMPTY:
269266
; CHECK-NEXT: // %bb.0:
270267
; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1];
@@ -283,8 +280,7 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
283280
; CHECK-NEXT: sub.f32 %f13, %f3, %f12;
284281
; CHECK-NEXT: testp.infinite.f32 %p2, %f1;
285282
; CHECK-NEXT: selp.f32 %f14, %f3, %f13, %p2;
286-
; CHECK-NEXT: mov.b64 %rd3, {%f14, %f9};
287-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
283+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
288284
; CHECK-NEXT: ret;
289285
%r = frem <2 x float> %a, %b
290286
ret <2 x float> %r
@@ -468,7 +464,7 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
468464
; CHECK-LABEL: test_fdiv_ftz(
469465
; CHECK: {
470466
; CHECK-NEXT: .reg .f32 %f<7>;
471-
; CHECK-NEXT: .reg .b64 %rd<4>;
467+
; CHECK-NEXT: .reg .b64 %rd<3>;
472468
; CHECK-EMPTY:
473469
; CHECK-NEXT: // %bb.0:
474470
; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1];
@@ -477,8 +473,7 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
477473
; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
478474
; CHECK-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
479475
; CHECK-NEXT: div.rn.ftz.f32 %f6, %f3, %f1;
480-
; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
481-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
476+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
482477
; CHECK-NEXT: ret;
483478
%r = fdiv <2 x float> %a, %b
484479
ret <2 x float> %r
@@ -489,7 +484,7 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
489484
; CHECK: {
490485
; CHECK-NEXT: .reg .pred %p<3>;
491486
; CHECK-NEXT: .reg .f32 %f<15>;
492-
; CHECK-NEXT: .reg .b64 %rd<4>;
487+
; CHECK-NEXT: .reg .b64 %rd<3>;
493488
; CHECK-EMPTY:
494489
; CHECK-NEXT: // %bb.0:
495490
; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1];
@@ -508,8 +503,7 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
508503
; CHECK-NEXT: sub.ftz.f32 %f13, %f3, %f12;
509504
; CHECK-NEXT: testp.infinite.f32 %p2, %f1;
510505
; CHECK-NEXT: selp.f32 %f14, %f3, %f13, %p2;
511-
; CHECK-NEXT: mov.b64 %rd3, {%f14, %f9};
512-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
506+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
513507
; CHECK-NEXT: ret;
514508
%r = frem <2 x float> %a, %b
515509
ret <2 x float> %r
@@ -699,7 +693,7 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %
699693
; CHECK: {
700694
; CHECK-NEXT: .reg .pred %p<3>;
701695
; CHECK-NEXT: .reg .f32 %f<11>;
702-
; CHECK-NEXT: .reg .b64 %rd<6>;
696+
; CHECK-NEXT: .reg .b64 %rd<5>;
703697
; CHECK-EMPTY:
704698
; CHECK-NEXT: // %bb.0:
705699
; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3];
@@ -714,8 +708,7 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %
714708
; CHECK-NEXT: mov.b64 {%f7, %f8}, %rd1;
715709
; CHECK-NEXT: selp.f32 %f9, %f8, %f6, %p2;
716710
; CHECK-NEXT: selp.f32 %f10, %f7, %f5, %p1;
717-
; CHECK-NEXT: mov.b64 %rd5, {%f10, %f9};
718-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd5;
711+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
719712
; CHECK-NEXT: ret;
720713
%cc = fcmp une <2 x float> %c, %d
721714
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
@@ -753,7 +746,7 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x
753746
; CHECK: {
754747
; CHECK-NEXT: .reg .pred %p<3>;
755748
; CHECK-NEXT: .reg .f32 %f<7>;
756-
; CHECK-NEXT: .reg .b64 %rd<4>;
749+
; CHECK-NEXT: .reg .b64 %rd<3>;
757750
; CHECK-NEXT: .reg .f64 %fd<5>;
758751
; CHECK-EMPTY:
759752
; CHECK-NEXT: // %bb.0:
@@ -767,8 +760,7 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x
767760
; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
768761
; CHECK-NEXT: selp.f32 %f5, %f4, %f2, %p2;
769762
; CHECK-NEXT: selp.f32 %f6, %f3, %f1, %p1;
770-
; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
771-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
763+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
772764
; CHECK-NEXT: ret;
773765
%cc = fcmp une <2 x double> %c, %d
774766
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
@@ -1186,14 +1178,12 @@ define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 {
11861178
; CHECK: {
11871179
; CHECK-NEXT: .reg .b32 %r<3>;
11881180
; CHECK-NEXT: .reg .f32 %f<3>;
1189-
; CHECK-NEXT: .reg .b64 %rd<2>;
11901181
; CHECK-EMPTY:
11911182
; CHECK-NEXT: // %bb.0:
11921183
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
11931184
; CHECK-NEXT: cvt.rn.f32.u32 %f1, %r2;
11941185
; CHECK-NEXT: cvt.rn.f32.u32 %f2, %r1;
1195-
; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
1196-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
1186+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
11971187
; CHECK-NEXT: ret;
11981188
%r = uitofp <2 x i32> %a to <2 x float>
11991189
ret <2 x float> %r
@@ -1203,14 +1193,13 @@ define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 {
12031193
; CHECK-LABEL: test_uitofp_2xi64(
12041194
; CHECK: {
12051195
; CHECK-NEXT: .reg .f32 %f<3>;
1206-
; CHECK-NEXT: .reg .b64 %rd<4>;
1196+
; CHECK-NEXT: .reg .b64 %rd<3>;
12071197
; CHECK-EMPTY:
12081198
; CHECK-NEXT: // %bb.0:
12091199
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
12101200
; CHECK-NEXT: cvt.rn.f32.u64 %f1, %rd2;
12111201
; CHECK-NEXT: cvt.rn.f32.u64 %f2, %rd1;
1212-
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
1213-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
1202+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
12141203
; CHECK-NEXT: ret;
12151204
%r = uitofp <2 x i64> %a to <2 x float>
12161205
ret <2 x float> %r
@@ -1221,14 +1210,12 @@ define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 {
12211210
; CHECK: {
12221211
; CHECK-NEXT: .reg .b32 %r<3>;
12231212
; CHECK-NEXT: .reg .f32 %f<3>;
1224-
; CHECK-NEXT: .reg .b64 %rd<2>;
12251213
; CHECK-EMPTY:
12261214
; CHECK-NEXT: // %bb.0:
12271215
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
12281216
; CHECK-NEXT: cvt.rn.f32.s32 %f1, %r2;
12291217
; CHECK-NEXT: cvt.rn.f32.s32 %f2, %r1;
1230-
; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
1231-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
1218+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
12321219
; CHECK-NEXT: ret;
12331220
%r = sitofp <2 x i32> %a to <2 x float>
12341221
ret <2 x float> %r
@@ -1238,14 +1225,13 @@ define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 {
12381225
; CHECK-LABEL: test_sitofp_2xi64(
12391226
; CHECK: {
12401227
; CHECK-NEXT: .reg .f32 %f<3>;
1241-
; CHECK-NEXT: .reg .b64 %rd<4>;
1228+
; CHECK-NEXT: .reg .b64 %rd<3>;
12421229
; CHECK-EMPTY:
12431230
; CHECK-NEXT: // %bb.0:
12441231
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
12451232
; CHECK-NEXT: cvt.rn.f32.s64 %f1, %rd2;
12461233
; CHECK-NEXT: cvt.rn.f32.s64 %f2, %rd1;
1247-
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
1248-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
1234+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
12491235
; CHECK-NEXT: ret;
12501236
%r = sitofp <2 x i64> %a to <2 x float>
12511237
ret <2 x float> %r
@@ -1276,15 +1262,13 @@ define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
12761262
; CHECK-LABEL: test_fptrunc_2xdouble(
12771263
; CHECK: {
12781264
; CHECK-NEXT: .reg .f32 %f<3>;
1279-
; CHECK-NEXT: .reg .b64 %rd<2>;
12801265
; CHECK-NEXT: .reg .f64 %fd<3>;
12811266
; CHECK-EMPTY:
12821267
; CHECK-NEXT: // %bb.0:
12831268
; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
12841269
; CHECK-NEXT: cvt.rn.f32.f64 %f1, %fd2;
12851270
; CHECK-NEXT: cvt.rn.f32.f64 %f2, %fd1;
1286-
; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
1287-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
1271+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
12881272
; CHECK-NEXT: ret;
12891273
%r = fptrunc <2 x double> %a to <2 x float>
12901274
ret <2 x float> %r

0 commit comments

Comments
 (0)