Skip to content

Commit fd94002

Browse files
committed
[NVPTX] use sink symbol for single-element unpacking of v2f32s
1 parent 852a4a9 commit fd94002

File tree

3 files changed

+34
-20
lines changed

3 files changed

+34
-20
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2933,9 +2933,18 @@ let hasSideEffects = false in {
29332933
(ins Int64Regs:$s),
29342934
"{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
29352935
[]>;
2936-
29372936
// PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the
29382937
// unused high/low part.
2938+
def I64toF32H_Sink : NVPTXInst<(outs Float32Regs:$high),
2939+
(ins Int64Regs:$s),
2940+
"mov.b64 {{_, $high}}, $s;",
2941+
[]>,
2942+
Requires<[hasPTX<71>]>;
2943+
def I64toF32L_Sink : NVPTXInst<(outs Float32Regs:$low),
2944+
(ins Int64Regs:$s),
2945+
"mov.b64 {{$low, _}}, $s;",
2946+
[]>,
2947+
Requires<[hasPTX<71>]>;
29392948
def I32toI16H_Sink : NVPTXInst<(outs Int16Regs:$high),
29402949
(ins Int32Regs:$s),
29412950
"mov.b32 \t{{_, $high}}, $s;",
@@ -2976,6 +2985,11 @@ foreach vt = [v2f16, v2bf16, v2i16] in {
29762985
def : Pat<(extractelt vt:$src, 1), (I32toI16H $src)>;
29772986
}
29782987

2988+
def : Pat<(extractelt v2f32:$src, 0),
2989+
(I64toF32L_Sink $src)>, Requires<[hasPTX<71>]>;
2990+
def : Pat<(extractelt v2f32:$src, 1),
2991+
(I64toF32H_Sink $src)>, Requires<[hasPTX<71>]>;
2992+
29792993
def : Pat<(extractelt v2f32:$src, 0),
29802994
(I64toF32L $src)>;
29812995
def : Pat<(extractelt v2f32:$src, 1),

llvm/test/CodeGen/NVPTX/f32x2-instructions.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ define float @test_extract_0(<2 x float> %a) #0 {
3333
; CHECK-EMPTY:
3434
; CHECK-NEXT: // %bb.0:
3535
; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0];
36-
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
36+
; CHECK-NEXT: mov.b64 {%r1, _}, %rd1;
3737
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
3838
; CHECK-NEXT: ret;
3939
%e = extractelement <2 x float> %a, i32 0
@@ -48,7 +48,7 @@ define float @test_extract_1(<2 x float> %a) #0 {
4848
; CHECK-EMPTY:
4949
; CHECK-NEXT: // %bb.0:
5050
; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0];
51-
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
51+
; CHECK-NEXT: mov.b64 {_, %r1}, %rd1;
5252
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
5353
; CHECK-NEXT: ret;
5454
%e = extractelement <2 x float> %a, i32 1

llvm/test/CodeGen/NVPTX/ldg-invariant.ll

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -128,14 +128,14 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
128128
define float @ld_global_v2f32(ptr addrspace(1) %ptr) {
129129
; CHECK-LABEL: ld_global_v2f32(
130130
; CHECK: {
131-
; CHECK-NEXT: .reg .b32 %f<4>;
131+
; CHECK-NEXT: .reg .b32 %r<4>;
132132
; CHECK-NEXT: .reg .b64 %rd<2>;
133133
; CHECK-EMPTY:
134134
; CHECK-NEXT: // %bb.0:
135135
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v2f32_param_0];
136-
; CHECK-NEXT: ld.global.nc.v2.b32 {%f1, %f2}, [%rd1];
137-
; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2;
138-
; CHECK-NEXT: st.param.b32 [func_retval0], %f3;
136+
; CHECK-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd1];
137+
; CHECK-NEXT: add.rn.f32 %r3, %r1, %r2;
138+
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
139139
; CHECK-NEXT: ret;
140140
%a = load <2 x float>, ptr addrspace(1) %ptr, !invariant.load !0
141141
%v1 = extractelement <2 x float> %a, i32 0
@@ -147,16 +147,16 @@ define float @ld_global_v2f32(ptr addrspace(1) %ptr) {
147147
define float @ld_global_v4f32(ptr addrspace(1) %ptr) {
148148
; CHECK-LABEL: ld_global_v4f32(
149149
; CHECK: {
150-
; CHECK-NEXT: .reg .b32 %f<8>;
150+
; CHECK-NEXT: .reg .b32 %r<8>;
151151
; CHECK-NEXT: .reg .b64 %rd<2>;
152152
; CHECK-EMPTY:
153153
; CHECK-NEXT: // %bb.0:
154154
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v4f32_param_0];
155-
; CHECK-NEXT: ld.global.nc.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
156-
; CHECK-NEXT: add.rn.f32 %f5, %f1, %f2;
157-
; CHECK-NEXT: add.rn.f32 %f6, %f3, %f4;
158-
; CHECK-NEXT: add.rn.f32 %f7, %f5, %f6;
159-
; CHECK-NEXT: st.param.b32 [func_retval0], %f7;
155+
; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
156+
; CHECK-NEXT: add.rn.f32 %r5, %r1, %r2;
157+
; CHECK-NEXT: add.rn.f32 %r6, %r3, %r4;
158+
; CHECK-NEXT: add.rn.f32 %r7, %r5, %r6;
159+
; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
160160
; CHECK-NEXT: ret;
161161
%a = load <4 x float>, ptr addrspace(1) %ptr, !invariant.load !0
162162
%v1 = extractelement <4 x float> %a, i32 0
@@ -172,17 +172,17 @@ define float @ld_global_v4f32(ptr addrspace(1) %ptr) {
172172
define float @ld_global_v8f32(ptr addrspace(1) %ptr) {
173173
; CHECK-LABEL: ld_global_v8f32(
174174
; CHECK: {
175-
; CHECK-NEXT: .reg .b32 %f<12>;
175+
; CHECK-NEXT: .reg .b32 %r<12>;
176176
; CHECK-NEXT: .reg .b64 %rd<2>;
177177
; CHECK-EMPTY:
178178
; CHECK-NEXT: // %bb.0:
179179
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v8f32_param_0];
180-
; CHECK-NEXT: ld.global.nc.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1+16];
181-
; CHECK-NEXT: ld.global.nc.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1];
182-
; CHECK-NEXT: add.rn.f32 %f9, %f5, %f7;
183-
; CHECK-NEXT: add.rn.f32 %f10, %f1, %f3;
184-
; CHECK-NEXT: add.rn.f32 %f11, %f9, %f10;
185-
; CHECK-NEXT: st.param.b32 [func_retval0], %f11;
180+
; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
181+
; CHECK-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
182+
; CHECK-NEXT: add.rn.f32 %r9, %r5, %r7;
183+
; CHECK-NEXT: add.rn.f32 %r10, %r1, %r3;
184+
; CHECK-NEXT: add.rn.f32 %r11, %r9, %r10;
185+
; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
186186
; CHECK-NEXT: ret;
187187
%a = load <8 x float>, ptr addrspace(1) %ptr, !invariant.load !0
188188
%v1 = extractelement <8 x float> %a, i32 0

0 commit comments

Comments
 (0)