Skip to content

Commit eba9b25

Browse files
committed
[NVPTX] fold v2f32 = bitcast (i64,i64,... = NVPTXISD::Load*)
Fold i64->v2f32 bitcasts on the results of an NVPTXISD::Load* op.
1 parent 600af45 commit eba9b25

File tree

4 files changed

+125
-63
lines changed

4 files changed

+125
-63
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1127,15 +1127,17 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11271127
return true;
11281128
}
11291129

1130-
static bool isSubVectorPackedInI32(EVT EltVT) {
1130+
static bool isSubVectorPackedInInteger(EVT EltVT) {
11311131
// Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
11321132
// total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
11331133
// vectorized loads/stores with the actual element type for i8/i16 as that
11341134
// would require v8/v16 variants that do not exist.
11351135
// In order to load/store such vectors efficiently, in Type Legalization
11361136
// we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
11371137
// lower to PTX as vectors of b32.
1138-
return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
1138+
// We also consider v2f32 as an upsized type, which may be used in packed
1139+
// (f32x2) instructions.
1140+
return Isv2x16VT(EltVT) || EltVT == MVT::v4i8 || EltVT == MVT::v2f32;
11391141
}
11401142

11411143
static unsigned getLoadStoreVectorNumElts(SDNode *N) {
@@ -1187,9 +1189,11 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
11871189

11881190
unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
11891191

1190-
if (isSubVectorPackedInI32(EltVT)) {
1192+
if (isSubVectorPackedInInteger(EltVT)) {
11911193
assert(ExtensionType == ISD::NON_EXTLOAD);
1192-
EltVT = MVT::i32;
1194+
FromTypeWidth = EltVT.getSizeInBits();
1195+
EltVT = MVT::getIntegerVT(FromTypeWidth);
1196+
FromType = NVPTX::PTXLdStInstCode::Untyped;
11931197
}
11941198

11951199
assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
@@ -1497,8 +1501,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
14971501
SDValue N2 = N->getOperand(NumElts + 1);
14981502
unsigned ToTypeWidth = TotalWidth / NumElts;
14991503

1500-
if (isSubVectorPackedInI32(EltVT)) {
1501-
EltVT = MVT::i32;
1504+
if (isSubVectorPackedInInteger(EltVT)) {
1505+
ToTypeWidth = EltVT.getSizeInBits();
1506+
EltVT = MVT::getIntegerVT(ToTypeWidth);
15021507
}
15031508

15041509
assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
865865
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
866866
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
867867
ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::FP_ROUND,
868-
ISD::TRUNCATE, ISD::LOAD});
868+
ISD::TRUNCATE, ISD::LOAD, ISD::BITCAST});
869869

870870
// setcc for f16x2 and bf16x2 needs special handling to prevent
871871
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6287,6 +6287,61 @@ static SDValue PerformTRUNCATECombine(SDNode *N,
62876287
return SDValue();
62886288
}
62896289

6290+
/// Fold `v2f32 = bitcast (i64 result of an NVPTXISD load)` into the load.
///
/// When \p N is a bitcast from i64 to v2f32 whose operand is produced by
/// NVPTXISD::LoadV2, NVPTXISD::LoadParam or NVPTXISD::LoadParamV2, and every
/// use of that load is such a bitcast, the load is re-created with each i64
/// result retyped to v2f32 and all of the bitcasts are replaced by the
/// corresponding result of the new load.
///
/// \param N   The ISD::BITCAST node being combined.
/// \param DCI Combiner state; provides the DAG and CombineTo.
/// \returns SDValue(N, 0) when the DAG was rewritten (the replacement has
///          already been performed through DCI.CombineTo), or an empty
///          SDValue when the pattern does not match.
static SDValue PerformBITCASTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  if (N->getValueType(0) != MVT::v2f32)
    return SDValue();

  SDValue Operand = N->getOperand(0);
  if (Operand.getValueType() != MVT::i64)
    return SDValue();

  // DAGCombiner handles bitcast(ISD::LOAD) already. For these, we'll do the
  // same thing, by changing their output values from i64 to v2f32. Then the
  // rule for combining loads (see PerformLoadCombine) may split these loads
  // further.
  const unsigned LoadOpc = Operand.getOpcode();
  if (LoadOpc != NVPTXISD::LoadV2 && LoadOpc != NVPTXISD::LoadParam &&
      LoadOpc != NVPTXISD::LoadParamV2)
    return SDValue();

  // Collect every user of the load; each one must be an i64 -> v2f32 bitcast.
  // NOTE(review): this scan rejects any use that is not such a bitcast, which
  // appears to include users of the chain/glue results. If so, the chain/glue
  // replacement loop further down never has anything to replace. If loads
  // with chain/glue users are meant to be folded too, non-i64 uses should be
  // skipped here instead of bailing out -- confirm intended behaviour.
  SmallVector<std::pair<SDNode *, unsigned /* resno */>> OldUses;
  for (SDUse &U : Operand->uses()) {
    SDNode *User = U.getUser();
    if (!(User->getOpcode() == ISD::BITCAST &&
          User->getValueType(0) == MVT::v2f32 &&
          U.getValueType() == MVT::i64))
      return SDValue(); // unhandled pattern
    OldUses.push_back({User, U.getResNo()});
  }

  // Build the result type list of the new load: i64 becomes v2f32, every
  // other result type is kept as-is.
  auto *MemN = cast<MemSDNode>(Operand);
  SmallVector<EVT> VTs;
  for (const auto &VT : Operand->values()) {
    if (VT == MVT::i64)
      VTs.push_back(MVT::v2f32);
    else
      VTs.push_back(VT);
  }

  SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
      LoadOpc, SDLoc(Operand), DCI.DAG.getVTList(VTs),
      SmallVector<SDValue>(Operand->ops()), MemN->getMemoryVT(),
      MemN->getMemOperand());

  // Replace all chain/glue uses of the old load.
  for (unsigned I = 0, E = Operand->getNumValues(); I != E; ++I)
    if (Operand->getValueType(I) != MVT::i64)
      DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(MemN, I),
                                        NewLoad.getValue(I));

  // Replace all bitcasts with values from the new load.
  for (const auto &[BC, ResultNum] : OldUses)
    DCI.CombineTo(BC, NewLoad.getValue(ResultNum), false);

  // N is one of the bitcasts that was just replaced through CombineTo. Return
  // N itself so the combiner knows the DAG changed and does not keep
  // processing the replaced node; an empty SDValue would signal "no change".
  return SDValue(N, 0);
}
}
6344+
62906345
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
62916346
DAGCombinerInfo &DCI) const {
62926347
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6332,6 +6387,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
63326387
return PerformFP_ROUNDCombine(N, DCI);
63336388
case ISD::TRUNCATE:
63346389
return PerformTRUNCATECombine(N, DCI);
6390+
case ISD::BITCAST:
6391+
return PerformBITCASTCombine(N, DCI);
63356392
}
63366393
return SDValue();
63376394
}

llvm/test/CodeGen/NVPTX/f32x2-instructions.ll

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,14 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
117117
define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
118118
; CHECK-LABEL: test_fadd_v4(
119119
; CHECK: {
120-
; CHECK-NEXT: .reg .b64 %rd<11>;
120+
; CHECK-NEXT: .reg .b64 %rd<7>;
121121
; CHECK-EMPTY:
122122
; CHECK-NEXT: // %bb.0:
123-
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_fadd_v4_param_1];
124-
; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_fadd_v4_param_0];
125-
; CHECK-NEXT: add.rn.f32x2 %rd9, %rd8, %rd6;
126-
; CHECK-NEXT: add.rn.f32x2 %rd10, %rd7, %rd5;
127-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
123+
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1];
124+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0];
125+
; CHECK-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4;
126+
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3;
127+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5};
128128
; CHECK-NEXT: ret;
129129
%r = fadd <4 x float> %a, %b
130130
ret <4 x float> %r
@@ -134,19 +134,19 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
134134
; CHECK-LABEL: test_fadd_imm_0_v4(
135135
; CHECK: {
136136
; CHECK-NEXT: .reg .b32 %r<5>;
137-
; CHECK-NEXT: .reg .b64 %rd<9>;
137+
; CHECK-NEXT: .reg .b64 %rd<7>;
138138
; CHECK-EMPTY:
139139
; CHECK-NEXT: // %bb.0:
140-
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_imm_0_v4_param_0];
140+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0];
141141
; CHECK-NEXT: mov.b32 %r1, 0f40800000;
142142
; CHECK-NEXT: mov.b32 %r2, 0f40400000;
143-
; CHECK-NEXT: mov.b64 %rd5, {%r2, %r1};
144-
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
143+
; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
144+
; CHECK-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
145145
; CHECK-NEXT: mov.b32 %r3, 0f40000000;
146146
; CHECK-NEXT: mov.b32 %r4, 0f3F800000;
147-
; CHECK-NEXT: mov.b64 %rd7, {%r4, %r3};
148-
; CHECK-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
149-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
147+
; CHECK-NEXT: mov.b64 %rd5, {%r4, %r3};
148+
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5;
149+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
150150
; CHECK-NEXT: ret;
151151
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
152152
ret <4 x float> %r
@@ -156,19 +156,19 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
156156
; CHECK-LABEL: test_fadd_imm_1_v4(
157157
; CHECK: {
158158
; CHECK-NEXT: .reg .b32 %r<5>;
159-
; CHECK-NEXT: .reg .b64 %rd<9>;
159+
; CHECK-NEXT: .reg .b64 %rd<7>;
160160
; CHECK-EMPTY:
161161
; CHECK-NEXT: // %bb.0:
162-
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_imm_1_v4_param_0];
162+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0];
163163
; CHECK-NEXT: mov.b32 %r1, 0f40800000;
164164
; CHECK-NEXT: mov.b32 %r2, 0f40400000;
165-
; CHECK-NEXT: mov.b64 %rd5, {%r2, %r1};
166-
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
165+
; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
166+
; CHECK-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
167167
; CHECK-NEXT: mov.b32 %r3, 0f40000000;
168168
; CHECK-NEXT: mov.b32 %r4, 0f3F800000;
169-
; CHECK-NEXT: mov.b64 %rd7, {%r4, %r3};
170-
; CHECK-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
171-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
169+
; CHECK-NEXT: mov.b64 %rd5, {%r4, %r3};
170+
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5;
171+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
172172
; CHECK-NEXT: ret;
173173
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
174174
ret <4 x float> %r
@@ -340,14 +340,14 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
340340
define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
341341
; CHECK-LABEL: test_fadd_v4_ftz(
342342
; CHECK: {
343-
; CHECK-NEXT: .reg .b64 %rd<11>;
343+
; CHECK-NEXT: .reg .b64 %rd<7>;
344344
; CHECK-EMPTY:
345345
; CHECK-NEXT: // %bb.0:
346-
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_fadd_v4_ftz_param_1];
347-
; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_fadd_v4_ftz_param_0];
348-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd9, %rd8, %rd6;
349-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd10, %rd7, %rd5;
350-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
346+
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1];
347+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0];
348+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd5, %rd2, %rd4;
349+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3;
350+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5};
351351
; CHECK-NEXT: ret;
352352
%r = fadd <4 x float> %a, %b
353353
ret <4 x float> %r
@@ -357,19 +357,19 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
357357
; CHECK-LABEL: test_fadd_imm_0_v4_ftz(
358358
; CHECK: {
359359
; CHECK-NEXT: .reg .b32 %r<5>;
360-
; CHECK-NEXT: .reg .b64 %rd<9>;
360+
; CHECK-NEXT: .reg .b64 %rd<7>;
361361
; CHECK-EMPTY:
362362
; CHECK-NEXT: // %bb.0:
363-
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_imm_0_v4_ftz_param_0];
363+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0];
364364
; CHECK-NEXT: mov.b32 %r1, 0f40800000;
365365
; CHECK-NEXT: mov.b32 %r2, 0f40400000;
366-
; CHECK-NEXT: mov.b64 %rd5, {%r2, %r1};
367-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
366+
; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
367+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
368368
; CHECK-NEXT: mov.b32 %r3, 0f40000000;
369369
; CHECK-NEXT: mov.b32 %r4, 0f3F800000;
370-
; CHECK-NEXT: mov.b64 %rd7, {%r4, %r3};
371-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
372-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
370+
; CHECK-NEXT: mov.b64 %rd5, {%r4, %r3};
371+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5;
372+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
373373
; CHECK-NEXT: ret;
374374
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
375375
ret <4 x float> %r
@@ -379,19 +379,19 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
379379
; CHECK-LABEL: test_fadd_imm_1_v4_ftz(
380380
; CHECK: {
381381
; CHECK-NEXT: .reg .b32 %r<5>;
382-
; CHECK-NEXT: .reg .b64 %rd<9>;
382+
; CHECK-NEXT: .reg .b64 %rd<7>;
383383
; CHECK-EMPTY:
384384
; CHECK-NEXT: // %bb.0:
385-
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_imm_1_v4_ftz_param_0];
385+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0];
386386
; CHECK-NEXT: mov.b32 %r1, 0f40800000;
387387
; CHECK-NEXT: mov.b32 %r2, 0f40400000;
388-
; CHECK-NEXT: mov.b64 %rd5, {%r2, %r1};
389-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
388+
; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1};
389+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
390390
; CHECK-NEXT: mov.b32 %r3, 0f40000000;
391391
; CHECK-NEXT: mov.b32 %r4, 0f3F800000;
392-
; CHECK-NEXT: mov.b64 %rd7, {%r4, %r3};
393-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
394-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
392+
; CHECK-NEXT: mov.b64 %rd5, {%r4, %r3};
393+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5;
394+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
395395
; CHECK-NEXT: ret;
396396
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
397397
ret <4 x float> %r

llvm/test/CodeGen/NVPTX/vec-param-load.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,40 +5,40 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
55

66
define <16 x float> @test_v16f32(<16 x float> %a) {
77
; CHECK-LABEL: test_v16f32(
8-
; CHECK-DAG: ld.param.v4.b32 {[[V_12_15:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
9-
; CHECK-DAG: ld.param.v4.b32 {[[V_8_11:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
10-
; CHECK-DAG: ld.param.v4.b32 {[[V_4_7:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
11-
; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
12-
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]}
13-
; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_4_7]]}
14-
; CHECK-DAG: st.param.v4.b32 [func_retval0+32], {[[V_8_11]]}
15-
; CHECK-DAG: st.param.v4.b32 [func_retval0+48], {[[V_12_15]]}
8+
; CHECK-DAG: ld.param.v2.b64 {[[V_12_15:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+48];
9+
; CHECK-DAG: ld.param.v2.b64 {[[V_8_11:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+32];
10+
; CHECK-DAG: ld.param.v2.b64 {[[V_4_7:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+16];
11+
; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0];
12+
; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
13+
; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_4_7]]}
14+
; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_8_11]]}
15+
; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_12_15]]}
1616
; CHECK: ret;
1717
ret <16 x float> %a
1818
}
1919

2020
define <8 x float> @test_v8f32(<8 x float> %a) {
2121
; CHECK-LABEL: test_v8f32(
22-
; CHECK-DAG: ld.param.v4.b32 {[[V_4_7:(%r[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
23-
; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
24-
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]}
25-
; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_4_7]]}
22+
; CHECK-DAG: ld.param.v2.b64 {[[V_4_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8f32_param_0+16];
23+
; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8f32_param_0];
24+
; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
25+
; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_4_7]]}
2626
; CHECK: ret;
2727
ret <8 x float> %a
2828
}
2929

3030
define <4 x float> @test_v4f32(<4 x float> %a) {
3131
; CHECK-LABEL: test_v4f32(
32-
; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
33-
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]}
32+
; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v4f32_param_0];
33+
; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
3434
; CHECK: ret;
3535
ret <4 x float> %a
3636
}
3737

3838
define <2 x float> @test_v2f32(<2 x float> %a) {
3939
; CHECK-LABEL: test_v2f32(
40-
; CHECK-DAG: ld.param.v2.b32 {[[V_0_3:(%r[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
41-
; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[V_0_3]]}
40+
; CHECK-DAG: ld.param.b64 [[V_0_3:%rd[0-9]+]], [test_v2f32_param_0];
41+
; CHECK-DAG: st.param.b64 [func_retval0], [[V_0_3]]
4242
; CHECK: ret;
4343
ret <2 x float> %a
4444
}

0 commit comments

Comments
 (0)