Skip to content

Commit 59843e1

Browse files
committed
[NVPTX] add combiner rule for expanding StoreRetval vector parameters
Do this to reduce the number of packing movs.
1 parent 0515649 commit 59843e1

File tree

2 files changed

+80
-44
lines changed

2 files changed

+80
-44
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4958,26 +4958,78 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
49584958
return SDValue();
49594959
}
49604960

4961-
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
4962-
std::size_t Back) {
/// Shared DAG combine for the StoreParam* / StoreRetval* nodes.
///
/// \param N     the store node being combined.
/// \param DCI   combiner state; its DAG is used to build the replacement node.
/// \param Front number of leading operands skipped by the all-undef check.
/// \param Back  number of trailing operands skipped by the all-undef check.
///
/// \returns operand 0 of N when every operand between Front and Back is undef;
/// a new, wider store node when the memory type is v2f32 and every operand
/// from index 2 onward is a BUILD_VECTOR; otherwise an empty SDValue.
4961+
static SDValue PerformStoreCombineHelper(SDNode *N,
4962+
TargetLowering::DAGCombinerInfo &DCI,
4963+
std::size_t Front, std::size_t Back) {
49634964
if (all_of(N->ops().drop_front(Front).drop_back(Back),
49644965
[](const SDUse &U) { return U.get()->isUndef(); }))
49654966
// Operand 0 is the previous value in the chain. Cannot return EntryToken
49664967
// as the previous value will become unused and eliminated later.
49674968
return N->getOperand(0);
49684969

4970+
auto *MemN = cast<MemSDNode>(N);
4971+
if (MemN->getMemoryVT() == MVT::v2f32) {
4972+
// try to fold, and expand:
4973+
// c: v2f32 = BUILD_VECTOR (a: f32, b: f32)
4974+
// StoreRetval c
4975+
// -->
4976+
// StoreRetvalV2 {a, b}
4977+
// likewise for V2 -> V4 case
4978+
// Map the current opcode to the variant that stores twice as many values.
// Opcodes with no case below leave NewOpcode unset, so no expansion is
// attempted for them.
4979+
std::optional<NVPTXISD::NodeType> NewOpcode;
4980+
switch (N->getOpcode()) {
4981+
case NVPTXISD::StoreParam:
4982+
NewOpcode = NVPTXISD::StoreParamV2;
4983+
break;
4984+
case NVPTXISD::StoreParamV2:
4985+
NewOpcode = NVPTXISD::StoreParamV4;
4986+
break;
4987+
case NVPTXISD::StoreRetval:
4988+
NewOpcode = NVPTXISD::StoreRetvalV2;
4989+
break;
4990+
case NVPTXISD::StoreRetvalV2:
4991+
NewOpcode = NVPTXISD::StoreRetvalV4;
4992+
break;
4993+
}
4994+
4995+
if (NewOpcode) {
4996+
// copy chain, offset from existing store
4997+
SmallVector<SDValue> NewOps = {N->getOperand(0), N->getOperand(1)};
4998+
// gather all operands to expand
// NOTE(review): this loop starts at operand 2 and runs through the last
// operand, and only operands 0 and 1 are copied verbatim; Front and Back are
// not consulted. The StoreParam caller passes Front = 3, Back = 1 and
// documents the layout {Chain, ArgID, Offset, Val, Glue}, so for those nodes
// operand 2 would be the Offset rather than a stored value and the expansion
// presumably never applies. Confirm whether the loop should instead cover
// operands [Front, getNumOperands() - Back) and forward the rest unchanged.
4999+
for (unsigned I = 2, E = N->getNumOperands(); I < E; ++I) {
5000+
SDValue CurrentOp = N->getOperand(I);
5001+
if (CurrentOp->getOpcode() == ISD::BUILD_VECTOR) {
5002+
assert(CurrentOp.getValueType() == MVT::v2f32);
// Forward the two elements of the BUILD_VECTOR as separate operands.
5003+
NewOps.push_back(CurrentOp.getNode()->getOperand(0));
5004+
NewOps.push_back(CurrentOp.getNode()->getOperand(1));
5005+
} else {
// Any operand that is not a BUILD_VECTOR aborts the expansion; the emptied
// NewOps is what the check after the loop tests for.
5006+
NewOps.clear();
5007+
break;
5008+
}
5009+
}
5010+
5011+
if (!NewOps.empty()) {
// Build the wider store with f32 as its memory type, reusing the original
// node's value types and memory operand.
5012+
return DCI.DAG.getMemIntrinsicNode(*NewOpcode, SDLoc(N), N->getVTList(),
5013+
NewOps, MVT::f32,
5014+
MemN->getMemOperand());
5015+
}
5016+
}
5017+
}
5018+
49695019
return SDValue();
49705020
}
49715021

4972-
static SDValue PerformStoreParamCombine(SDNode *N) {
/// Combine entry point for StoreParam* nodes: forwards to
/// PerformStoreCombineHelper with Front = 3 and Back = 1, matching the operand
/// layout described in the body comment.
5022+
static SDValue PerformStoreParamCombine(SDNode *N,
5023+
TargetLowering::DAGCombinerInfo &DCI) {
49735024
// Operands from the 3rd to the 2nd last one are the values to be stored.
49745025
// {Chain, ArgID, Offset, Val, Glue}
4975-
return PerformStoreCombineHelper(N, 3, 1);
5026+
return PerformStoreCombineHelper(N, DCI, 3, 1);
49765027
}
49775028

4978-
static SDValue PerformStoreRetvalCombine(SDNode *N) {
/// Combine entry point for StoreRetval* nodes: forwards to
/// PerformStoreCombineHelper with Front = 2 and Back = 0, i.e. no trailing
/// operands are skipped.
5029+
static SDValue PerformStoreRetvalCombine(SDNode *N,
5030+
TargetLowering::DAGCombinerInfo &DCI) {
49795031
// Operands from the 2nd to the last one are the values to be stored
4980-
return PerformStoreCombineHelper(N, 2, 0);
5032+
return PerformStoreCombineHelper(N, DCI, 2, 0);
49815033
}
49825034

49835035
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
@@ -5688,11 +5740,11 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
56885740
case NVPTXISD::StoreRetval:
56895741
case NVPTXISD::StoreRetvalV2:
56905742
case NVPTXISD::StoreRetvalV4:
5691-
return PerformStoreRetvalCombine(N);
5743+
return PerformStoreRetvalCombine(N, DCI);
56925744
case NVPTXISD::StoreParam:
56935745
case NVPTXISD::StoreParamV2:
56945746
case NVPTXISD::StoreParamV4:
5695-
return PerformStoreParamCombine(N);
5747+
return PerformStoreParamCombine(N, DCI);
56965748
case ISD::EXTRACT_VECTOR_ELT:
56975749
return PerformEXTRACTCombine(N, DCI);
56985750
case ISD::VSELECT:

llvm/test/CodeGen/NVPTX/f32x2-instructions.ll

Lines changed: 20 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,11 @@ define <2 x float> @test_ret_const() #0 {
1616
; CHECK-LABEL: test_ret_const(
1717
; CHECK: {
1818
; CHECK-NEXT: .reg .f32 %f<3>;
19-
; CHECK-NEXT: .reg .b64 %rd<2>;
2019
; CHECK-EMPTY:
2120
; CHECK-NEXT: // %bb.0:
2221
; CHECK-NEXT: mov.f32 %f1, 0f40000000;
2322
; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
24-
; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
25-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
23+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
2624
; CHECK-NEXT: ret;
2725
ret <2 x float> <float 1.0, float 2.0>
2826
}
@@ -243,7 +241,7 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
243241
; CHECK-LABEL: test_fdiv(
244242
; CHECK: {
245243
; CHECK-NEXT: .reg .f32 %f<7>;
246-
; CHECK-NEXT: .reg .b64 %rd<4>;
244+
; CHECK-NEXT: .reg .b64 %rd<3>;
247245
; CHECK-EMPTY:
248246
; CHECK-NEXT: // %bb.0:
249247
; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1];
@@ -252,8 +250,7 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
252250
; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
253251
; CHECK-NEXT: div.rn.f32 %f5, %f4, %f2;
254252
; CHECK-NEXT: div.rn.f32 %f6, %f3, %f1;
255-
; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
256-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
253+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
257254
; CHECK-NEXT: ret;
258255
%r = fdiv <2 x float> %a, %b
259256
ret <2 x float> %r
@@ -264,7 +261,7 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
264261
; CHECK: {
265262
; CHECK-NEXT: .reg .pred %p<3>;
266263
; CHECK-NEXT: .reg .f32 %f<15>;
267-
; CHECK-NEXT: .reg .b64 %rd<4>;
264+
; CHECK-NEXT: .reg .b64 %rd<3>;
268265
; CHECK-EMPTY:
269266
; CHECK-NEXT: // %bb.0:
270267
; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1];
@@ -283,8 +280,7 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
283280
; CHECK-NEXT: sub.f32 %f13, %f3, %f12;
284281
; CHECK-NEXT: testp.infinite.f32 %p2, %f1;
285282
; CHECK-NEXT: selp.f32 %f14, %f3, %f13, %p2;
286-
; CHECK-NEXT: mov.b64 %rd3, {%f14, %f9};
287-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
283+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
288284
; CHECK-NEXT: ret;
289285
%r = frem <2 x float> %a, %b
290286
ret <2 x float> %r
@@ -468,7 +464,7 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
468464
; CHECK-LABEL: test_fdiv_ftz(
469465
; CHECK: {
470466
; CHECK-NEXT: .reg .f32 %f<7>;
471-
; CHECK-NEXT: .reg .b64 %rd<4>;
467+
; CHECK-NEXT: .reg .b64 %rd<3>;
472468
; CHECK-EMPTY:
473469
; CHECK-NEXT: // %bb.0:
474470
; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1];
@@ -477,8 +473,7 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
477473
; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
478474
; CHECK-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
479475
; CHECK-NEXT: div.rn.ftz.f32 %f6, %f3, %f1;
480-
; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
481-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
476+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
482477
; CHECK-NEXT: ret;
483478
%r = fdiv <2 x float> %a, %b
484479
ret <2 x float> %r
@@ -489,7 +484,7 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
489484
; CHECK: {
490485
; CHECK-NEXT: .reg .pred %p<3>;
491486
; CHECK-NEXT: .reg .f32 %f<15>;
492-
; CHECK-NEXT: .reg .b64 %rd<4>;
487+
; CHECK-NEXT: .reg .b64 %rd<3>;
493488
; CHECK-EMPTY:
494489
; CHECK-NEXT: // %bb.0:
495490
; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1];
@@ -508,8 +503,7 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
508503
; CHECK-NEXT: sub.ftz.f32 %f13, %f3, %f12;
509504
; CHECK-NEXT: testp.infinite.f32 %p2, %f1;
510505
; CHECK-NEXT: selp.f32 %f14, %f3, %f13, %p2;
511-
; CHECK-NEXT: mov.b64 %rd3, {%f14, %f9};
512-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
506+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
513507
; CHECK-NEXT: ret;
514508
%r = frem <2 x float> %a, %b
515509
ret <2 x float> %r
@@ -699,7 +693,7 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %
699693
; CHECK: {
700694
; CHECK-NEXT: .reg .pred %p<3>;
701695
; CHECK-NEXT: .reg .f32 %f<11>;
702-
; CHECK-NEXT: .reg .b64 %rd<6>;
696+
; CHECK-NEXT: .reg .b64 %rd<5>;
703697
; CHECK-EMPTY:
704698
; CHECK-NEXT: // %bb.0:
705699
; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3];
@@ -714,8 +708,7 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %
714708
; CHECK-NEXT: mov.b64 {%f7, %f8}, %rd1;
715709
; CHECK-NEXT: selp.f32 %f9, %f8, %f6, %p2;
716710
; CHECK-NEXT: selp.f32 %f10, %f7, %f5, %p1;
717-
; CHECK-NEXT: mov.b64 %rd5, {%f10, %f9};
718-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd5;
711+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
719712
; CHECK-NEXT: ret;
720713
%cc = fcmp une <2 x float> %c, %d
721714
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
@@ -753,7 +746,7 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x
753746
; CHECK: {
754747
; CHECK-NEXT: .reg .pred %p<3>;
755748
; CHECK-NEXT: .reg .f32 %f<7>;
756-
; CHECK-NEXT: .reg .b64 %rd<4>;
749+
; CHECK-NEXT: .reg .b64 %rd<3>;
757750
; CHECK-NEXT: .reg .f64 %fd<5>;
758751
; CHECK-EMPTY:
759752
; CHECK-NEXT: // %bb.0:
@@ -767,8 +760,7 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x
767760
; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
768761
; CHECK-NEXT: selp.f32 %f5, %f4, %f2, %p2;
769762
; CHECK-NEXT: selp.f32 %f6, %f3, %f1, %p1;
770-
; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
771-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
763+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
772764
; CHECK-NEXT: ret;
773765
%cc = fcmp une <2 x double> %c, %d
774766
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
@@ -1186,14 +1178,12 @@ define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 {
11861178
; CHECK: {
11871179
; CHECK-NEXT: .reg .b32 %r<3>;
11881180
; CHECK-NEXT: .reg .f32 %f<3>;
1189-
; CHECK-NEXT: .reg .b64 %rd<2>;
11901181
; CHECK-EMPTY:
11911182
; CHECK-NEXT: // %bb.0:
11921183
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
11931184
; CHECK-NEXT: cvt.rn.f32.u32 %f1, %r2;
11941185
; CHECK-NEXT: cvt.rn.f32.u32 %f2, %r1;
1195-
; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
1196-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
1186+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
11971187
; CHECK-NEXT: ret;
11981188
%r = uitofp <2 x i32> %a to <2 x float>
11991189
ret <2 x float> %r
@@ -1203,14 +1193,13 @@ define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 {
12031193
; CHECK-LABEL: test_uitofp_2xi64(
12041194
; CHECK: {
12051195
; CHECK-NEXT: .reg .f32 %f<3>;
1206-
; CHECK-NEXT: .reg .b64 %rd<4>;
1196+
; CHECK-NEXT: .reg .b64 %rd<3>;
12071197
; CHECK-EMPTY:
12081198
; CHECK-NEXT: // %bb.0:
12091199
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
12101200
; CHECK-NEXT: cvt.rn.f32.u64 %f1, %rd2;
12111201
; CHECK-NEXT: cvt.rn.f32.u64 %f2, %rd1;
1212-
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
1213-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
1202+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
12141203
; CHECK-NEXT: ret;
12151204
%r = uitofp <2 x i64> %a to <2 x float>
12161205
ret <2 x float> %r
@@ -1221,14 +1210,12 @@ define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 {
12211210
; CHECK: {
12221211
; CHECK-NEXT: .reg .b32 %r<3>;
12231212
; CHECK-NEXT: .reg .f32 %f<3>;
1224-
; CHECK-NEXT: .reg .b64 %rd<2>;
12251213
; CHECK-EMPTY:
12261214
; CHECK-NEXT: // %bb.0:
12271215
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
12281216
; CHECK-NEXT: cvt.rn.f32.s32 %f1, %r2;
12291217
; CHECK-NEXT: cvt.rn.f32.s32 %f2, %r1;
1230-
; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
1231-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
1218+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
12321219
; CHECK-NEXT: ret;
12331220
%r = sitofp <2 x i32> %a to <2 x float>
12341221
ret <2 x float> %r
@@ -1238,14 +1225,13 @@ define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 {
12381225
; CHECK-LABEL: test_sitofp_2xi64(
12391226
; CHECK: {
12401227
; CHECK-NEXT: .reg .f32 %f<3>;
1241-
; CHECK-NEXT: .reg .b64 %rd<4>;
1228+
; CHECK-NEXT: .reg .b64 %rd<3>;
12421229
; CHECK-EMPTY:
12431230
; CHECK-NEXT: // %bb.0:
12441231
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
12451232
; CHECK-NEXT: cvt.rn.f32.s64 %f1, %rd2;
12461233
; CHECK-NEXT: cvt.rn.f32.s64 %f2, %rd1;
1247-
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
1248-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
1234+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
12491235
; CHECK-NEXT: ret;
12501236
%r = sitofp <2 x i64> %a to <2 x float>
12511237
ret <2 x float> %r
@@ -1276,15 +1262,13 @@ define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
12761262
; CHECK-LABEL: test_fptrunc_2xdouble(
12771263
; CHECK: {
12781264
; CHECK-NEXT: .reg .f32 %f<3>;
1279-
; CHECK-NEXT: .reg .b64 %rd<2>;
12801265
; CHECK-NEXT: .reg .f64 %fd<3>;
12811266
; CHECK-EMPTY:
12821267
; CHECK-NEXT: // %bb.0:
12831268
; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
12841269
; CHECK-NEXT: cvt.rn.f32.f64 %f1, %fd2;
12851270
; CHECK-NEXT: cvt.rn.f32.f64 %f2, %fd1;
1286-
; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
1287-
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
1271+
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
12881272
; CHECK-NEXT: ret;
12891273
%r = fptrunc <2 x double> %a to <2 x float>
12901274
ret <2 x float> %r

0 commit comments

Comments
 (0)