Skip to content

Commit 52b65cf

Browse files
committed
[NVPTX] fold v2f32 = bitcast (i64,i64,... = NVPTXISD::Load*)
Fold i64->v2f32 bitcasts on the results of a NVPTXISD::Load* op.
1 parent cea37b9 commit 52b65cf

File tree

4 files changed

+124
-63
lines changed

4 files changed

+124
-63
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1158,15 +1158,17 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11581158
return true;
11591159
}
11601160

1161-
static bool isSubVectorPackedInI32(EVT EltVT) {
1161+
static bool isSubVectorPackedInInteger(EVT EltVT) {
11621162
// Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
11631163
// total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
11641164
// vectorized loads/stores with the actual element type for i8/i16 as that
11651165
// would require v8/v16 variants that do not exist.
11661166
// In order to load/store such vectors efficiently, in Type Legalization
11671167
// we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
11681168
// lower to PTX as vectors of b32.
1169-
return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
1169+
// We also consider v2f32 as an upsized type, which may be used in packed
1170+
// (f32x2) instructions.
1171+
return Isv2x16VT(EltVT) || EltVT == MVT::v4i8 || EltVT == MVT::v2f32;
11701172
}
11711173

11721174
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
@@ -1215,8 +1217,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
12151217
return false;
12161218
}
12171219

1218-
if (isSubVectorPackedInI32(EltVT)) {
1219-
EltVT = MVT::i32;
1220+
if (isSubVectorPackedInInteger(EltVT)) {
1221+
FromTypeWidth = EltVT.getSizeInBits();
1222+
EltVT = MVT::getIntegerVT(FromTypeWidth);
12201223
FromType = NVPTX::PTXLdStInstCode::Untyped;
12211224
}
12221225

@@ -1552,8 +1555,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
15521555
return false;
15531556
}
15541557

1555-
if (isSubVectorPackedInI32(EltVT)) {
1556-
EltVT = MVT::i32;
1558+
if (isSubVectorPackedInInteger(EltVT)) {
1559+
ToTypeWidth = EltVT.getSizeInBits();
1560+
EltVT = MVT::getIntegerVT(ToTypeWidth);
15571561
ToType = NVPTX::PTXLdStInstCode::Untyped;
15581562
}
15591563

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -829,7 +829,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
829829
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
830830
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
831831
ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::FP_ROUND,
832-
ISD::TRUNCATE, ISD::LOAD});
832+
ISD::TRUNCATE, ISD::LOAD, ISD::BITCAST});
833833

834834
// setcc for f16x2 and bf16x2 needs special handling to prevent
835835
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6155,6 +6155,61 @@ static SDValue PerformTRUNCATECombine(SDNode *N,
61556155
return SDValue();
61566156
}
61576157

6158+
/// Fold `v2f32 = bitcast (i64 result of an NVPTX load)` into the load itself.
///
/// When \p N is a bitcast from i64 to v2f32 whose operand is produced by
/// NVPTXISD::LoadV2, NVPTXISD::LoadParam or NVPTXISD::LoadParamV2, and every
/// use of that load is such a bitcast, the load is re-created with each i64
/// result retyped to v2f32 and all of the bitcasts are replaced by the
/// corresponding results of the new load.
///
/// \param N   The ISD::BITCAST node being combined.
/// \param DCI Combiner state; provides the SelectionDAG and CombineTo.
/// \returns SDValue(N, 0) if the fold was performed (N has already been
///          replaced through DCI.CombineTo), or an empty SDValue if the
///          pattern did not match and nothing was changed.
static SDValue PerformBITCASTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  if (N->getValueType(0) != MVT::v2f32)
    return SDValue();

  SDValue Operand = N->getOperand(0);
  if (Operand.getValueType() != MVT::i64)
    return SDValue();

  // DAGCombiner handles bitcast(ISD::LOAD) already. For these, we'll do the
  // same thing, by changing their output values from i64 to v2f32. Then the
  // rule for combining loads (see PerformLoadCombine) may split these loads
  // further.
  if (Operand.getOpcode() != NVPTXISD::LoadV2 &&
      Operand.getOpcode() != NVPTXISD::LoadParam &&
      Operand.getOpcode() != NVPTXISD::LoadParamV2)
    return SDValue();

  // Every use of the load must be an i64 -> v2f32 bitcast; remember each
  // bitcast together with the load result number it consumes.
  SmallVector<std::pair<SDNode *, unsigned /* resno */>> OldUses;
  for (SDUse &U : Operand->uses()) {
    SDNode *User = U.getUser();
    if (!(User->getOpcode() == ISD::BITCAST &&
          User->getValueType(0) == MVT::v2f32 &&
          U.getValueType() == MVT::i64))
      return SDValue(); // unhandled pattern
    OldUses.push_back({User, U.getResNo()});
  }

  // Same result list as the old load, with each i64 retyped to v2f32.
  auto *MemN = cast<MemSDNode>(Operand);
  SmallVector<EVT> VTs;
  for (const auto &VT : Operand->values()) {
    if (VT == MVT::i64)
      VTs.push_back(MVT::v2f32);
    else
      VTs.push_back(VT);
  }

  SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
      Operand.getOpcode(), SDLoc(Operand), DCI.DAG.getVTList(VTs),
      SmallVector<SDValue>(Operand->ops()), MemN->getMemoryVT(),
      MemN->getMemOperand());

  // Replace all chain/glue uses of the old load.
  // NOTE(review): the use check above rejects any use that is not an i64
  // bitcast, so no such uses can exist at this point; kept for safety in case
  // that check is relaxed to tolerate chain/glue users.
  for (unsigned I = 0, E = Operand->getNumValues(); I != E; ++I)
    if (Operand->getValueType(I) != MVT::i64)
      DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(MemN, I),
                                        NewLoad.getValue(I));

  // Replace all bitcasts with values from the new load.
  for (auto &[BC, ResultNum] : OldUses)
    DCI.CombineTo(BC, NewLoad.getValue(ResultNum), false);

  // N uses Operand, so it is one of the bitcasts just replaced via CombineTo
  // (and may already be deleted). Return N itself so the combiner knows the
  // DAG changed and does not keep processing N as if nothing happened.
  return SDValue(N, 0);
}
6212+
61586213
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
61596214
DAGCombinerInfo &DCI) const {
61606215
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6200,6 +6255,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
62006255
return PerformFP_ROUNDCombine(N, DCI);
62016256
case ISD::TRUNCATE:
62026257
return PerformTRUNCATECombine(N, DCI);
6258+
case ISD::BITCAST:
6259+
return PerformBITCASTCombine(N, DCI);
62036260
}
62046261
return SDValue();
62056262
}

llvm/test/CodeGen/NVPTX/f32x2-instructions.ll

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,14 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
117117
define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
118118
; CHECK-LABEL: test_fadd_v4(
119119
; CHECK: {
120-
; CHECK-NEXT: .reg .b64 %rd<11>;
120+
; CHECK-NEXT: .reg .b64 %rd<7>;
121121
; CHECK-EMPTY:
122122
; CHECK-NEXT: // %bb.0:
123-
; CHECK-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_param_1];
124-
; CHECK-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_param_0];
125-
; CHECK-NEXT: add.rn.f32x2 %rd9, %rd8, %rd6;
126-
; CHECK-NEXT: add.rn.f32x2 %rd10, %rd7, %rd5;
127-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
123+
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1];
124+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0];
125+
; CHECK-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4;
126+
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3;
127+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5};
128128
; CHECK-NEXT: ret;
129129
%r = fadd <4 x float> %a, %b
130130
ret <4 x float> %r
@@ -134,19 +134,19 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
134134
; CHECK-LABEL: test_fadd_imm_0_v4(
135135
; CHECK: {
136136
; CHECK-NEXT: .reg .f32 %f<5>;
137-
; CHECK-NEXT: .reg .b64 %rd<9>;
137+
; CHECK-NEXT: .reg .b64 %rd<7>;
138138
; CHECK-EMPTY:
139139
; CHECK-NEXT: // %bb.0:
140-
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_param_0];
140+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0];
141141
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
142142
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
143-
; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
144-
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
143+
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
144+
; CHECK-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
145145
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
146146
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
147-
; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
148-
; CHECK-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
149-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
147+
; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
148+
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5;
149+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
150150
; CHECK-NEXT: ret;
151151
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
152152
ret <4 x float> %r
@@ -156,19 +156,19 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
156156
; CHECK-LABEL: test_fadd_imm_1_v4(
157157
; CHECK: {
158158
; CHECK-NEXT: .reg .f32 %f<5>;
159-
; CHECK-NEXT: .reg .b64 %rd<9>;
159+
; CHECK-NEXT: .reg .b64 %rd<7>;
160160
; CHECK-EMPTY:
161161
; CHECK-NEXT: // %bb.0:
162-
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_param_0];
162+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0];
163163
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
164164
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
165-
; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
166-
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
165+
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
166+
; CHECK-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
167167
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
168168
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
169-
; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
170-
; CHECK-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
171-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
169+
; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
170+
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5;
171+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
172172
; CHECK-NEXT: ret;
173173
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
174174
ret <4 x float> %r
@@ -340,14 +340,14 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
340340
define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
341341
; CHECK-LABEL: test_fadd_v4_ftz(
342342
; CHECK: {
343-
; CHECK-NEXT: .reg .b64 %rd<11>;
343+
; CHECK-NEXT: .reg .b64 %rd<7>;
344344
; CHECK-EMPTY:
345345
; CHECK-NEXT: // %bb.0:
346-
; CHECK-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_ftz_param_1];
347-
; CHECK-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_ftz_param_0];
348-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd9, %rd8, %rd6;
349-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd10, %rd7, %rd5;
350-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
346+
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1];
347+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0];
348+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd5, %rd2, %rd4;
349+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3;
350+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5};
351351
; CHECK-NEXT: ret;
352352
%r = fadd <4 x float> %a, %b
353353
ret <4 x float> %r
@@ -357,19 +357,19 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
357357
; CHECK-LABEL: test_fadd_imm_0_v4_ftz(
358358
; CHECK: {
359359
; CHECK-NEXT: .reg .f32 %f<5>;
360-
; CHECK-NEXT: .reg .b64 %rd<9>;
360+
; CHECK-NEXT: .reg .b64 %rd<7>;
361361
; CHECK-EMPTY:
362362
; CHECK-NEXT: // %bb.0:
363-
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_ftz_param_0];
363+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0];
364364
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
365365
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
366-
; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
367-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
366+
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
367+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
368368
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
369369
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
370-
; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
371-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
372-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
370+
; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
371+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5;
372+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
373373
; CHECK-NEXT: ret;
374374
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
375375
ret <4 x float> %r
@@ -379,19 +379,19 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
379379
; CHECK-LABEL: test_fadd_imm_1_v4_ftz(
380380
; CHECK: {
381381
; CHECK-NEXT: .reg .f32 %f<5>;
382-
; CHECK-NEXT: .reg .b64 %rd<9>;
382+
; CHECK-NEXT: .reg .b64 %rd<7>;
383383
; CHECK-EMPTY:
384384
; CHECK-NEXT: // %bb.0:
385-
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_ftz_param_0];
385+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0];
386386
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
387387
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
388-
; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
389-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
388+
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
389+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
390390
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
391391
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
392-
; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
393-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
394-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
392+
; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
393+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5;
394+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
395395
; CHECK-NEXT: ret;
396396
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
397397
ret <4 x float> %r

llvm/test/CodeGen/NVPTX/vec-param-load.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,40 +5,40 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
55

66
define <16 x float> @test_v16f32(<16 x float> %a) {
77
; CHECK-LABEL: test_v16f32(
8-
; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
9-
; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
10-
; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
11-
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
12-
; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]}
13-
; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]}
14-
; CHECK-DAG: st.param.v4.f32 [func_retval0+32], {[[V_8_11]]}
15-
; CHECK-DAG: st.param.v4.f32 [func_retval0+48], {[[V_12_15]]}
8+
; CHECK-DAG: ld.param.v2.b64 {[[V_12_15:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+48];
9+
; CHECK-DAG: ld.param.v2.b64 {[[V_8_11:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+32];
10+
; CHECK-DAG: ld.param.v2.b64 {[[V_4_7:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+16];
11+
; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0];
12+
; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
13+
; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_4_7]]}
14+
; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_8_11]]}
15+
; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_12_15]]}
1616
; CHECK: ret;
1717
ret <16 x float> %a
1818
}
1919

2020
define <8 x float> @test_v8f32(<8 x float> %a) {
2121
; CHECK-LABEL: test_v8f32(
22-
; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
23-
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
24-
; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]}
25-
; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]}
22+
; CHECK-DAG: ld.param.v2.b64 {[[V_4_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8f32_param_0+16];
23+
; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8f32_param_0];
24+
; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
25+
; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_4_7]]}
2626
; CHECK: ret;
2727
ret <8 x float> %a
2828
}
2929

3030
define <4 x float> @test_v4f32(<4 x float> %a) {
3131
; CHECK-LABEL: test_v4f32(
32-
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
33-
; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]}
32+
; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v4f32_param_0];
33+
; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
3434
; CHECK: ret;
3535
ret <4 x float> %a
3636
}
3737

3838
define <2 x float> @test_v2f32(<2 x float> %a) {
3939
; CHECK-LABEL: test_v2f32(
40-
; CHECK-DAG: ld.param.v2.f32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
41-
; CHECK-DAG: st.param.v2.f32 [func_retval0], {[[V_0_3]]}
40+
; CHECK-DAG: ld.param.b64 [[V_0_3:%rd[0-9]+]], [test_v2f32_param_0];
41+
; CHECK-DAG: st.param.b64 [func_retval0], [[V_0_3]]
4242
; CHECK: ret;
4343
ret <2 x float> %a
4444
}

0 commit comments

Comments
 (0)