Skip to content

Commit 520da79

Browse files
committed
[NVPTX] add coverage for v2f32 in ldg-invariant and fp-contract
For fp-contract: test folding of fma.f32x2, and bump the SM version to 100. For ldg-invariant: test proper splitting of loads on vectors of f32.
1 parent 4c67342 commit 520da79

File tree

2 files changed

+96
-4
lines changed

2 files changed

+96
-4
lines changed

llvm/test/CodeGen/NVPTX/fp-contract.ll

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
2-
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
3-
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
4-
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
1+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s --check-prefix=FAST
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefix=DEFAULT
3+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch=sm_100 %}
4+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch=sm_100 %}
55

66
target triple = "nvptx64-unknown-cuda"
77

@@ -33,3 +33,25 @@ define float @t1(float %a, float %b) {
3333
%v1 = fadd float %a, %b
3434
ret float %v1
3535
}
36+
37+
;; t0_v2: a <2 x float> multiply whose result feeds an add. The fast-contract
;; run expects the pair to fold into one packed fma; the other run expects a
;; separate packed mul and add, each with an explicit .rn rounding modifier.
;; FAST-LABEL: @t0_v2
38+
;; DEFAULT-LABEL: @t0_v2
39+
define <2 x float> @t0_v2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
40+
;; FAST: fma.rn.f32x2
41+
;; DEFAULT: mul.rn.f32x2
42+
;; DEFAULT: add.rn.f32x2
43+
%prod = fmul <2 x float> %a, %b
44+
%sum = fadd <2 x float> %prod, %c
45+
ret <2 x float> %sum
46+
}
47+
48+
;; FAST-LABEL: @t1_v2
49+
;; DEFAULT-LABEL: @t1_v2
50+
define <2 x float> @t1_v2(<2 x float> %a, <2 x float> %b) {
51+
;; We cannot form an fma here, but without fp-contract=fast make sure we
52+
;; explicitly emit the packed add.rn.f32x2 so ptxas cannot fuse it with anything.
53+
;; FAST: add.f32x2
54+
;; DEFAULT: add.rn.f32x2
55+
%v1 = fadd <2 x float> %a, %b
56+
ret <2 x float> %v1
57+
}

llvm/test/CodeGen/NVPTX/ldg-invariant.ll

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,76 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
127127
ret half %sum
128128
}
129129

130+
; Invariant <2 x float> load from addrspace(1): the checks expect one
; ld.global.nc.v2.f32, followed by a scalar add of the two lanes.
define float @ld_global_v2f32(ptr addrspace(1) %ptr) {
131+
; CHECK-LABEL: ld_global_v2f32(
132+
; CHECK: {
133+
; CHECK-NEXT: .reg .f32 %f<4>;
134+
; CHECK-NEXT: .reg .b64 %rd<2>;
135+
; CHECK-EMPTY:
136+
; CHECK-NEXT: // %bb.0:
137+
; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v2f32_param_0];
138+
; CHECK-NEXT: ld.global.nc.v2.f32 {%f1, %f2}, [%rd1];
139+
; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2;
140+
; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
141+
; CHECK-NEXT: ret;
142+
%vec = load <2 x float>, ptr addrspace(1) %ptr, !invariant.load !0
143+
%e0 = extractelement <2 x float> %vec, i32 0
144+
%e1 = extractelement <2 x float> %vec, i32 1
145+
%total = fadd float %e0, %e1
146+
ret float %total
147+
}
148+
149+
; Invariant <4 x float> load from addrspace(1): the checks expect one
; ld.global.nc.v4.f32, with the four lanes reduced by pairwise scalar adds.
define float @ld_global_v4f32(ptr addrspace(1) %ptr) {
150+
; CHECK-LABEL: ld_global_v4f32(
151+
; CHECK: {
152+
; CHECK-NEXT: .reg .f32 %f<8>;
153+
; CHECK-NEXT: .reg .b64 %rd<2>;
154+
; CHECK-EMPTY:
155+
; CHECK-NEXT: // %bb.0:
156+
; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v4f32_param_0];
157+
; CHECK-NEXT: ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
158+
; CHECK-NEXT: add.rn.f32 %f5, %f1, %f2;
159+
; CHECK-NEXT: add.rn.f32 %f6, %f3, %f4;
160+
; CHECK-NEXT: add.rn.f32 %f7, %f5, %f6;
161+
; CHECK-NEXT: st.param.f32 [func_retval0], %f7;
162+
; CHECK-NEXT: ret;
163+
%vec = load <4 x float>, ptr addrspace(1) %ptr, !invariant.load !0
164+
%e0 = extractelement <4 x float> %vec, i32 0
165+
%e1 = extractelement <4 x float> %vec, i32 1
166+
%e2 = extractelement <4 x float> %vec, i32 2
167+
%e3 = extractelement <4 x float> %vec, i32 3
168+
%lo = fadd float %e0, %e1
169+
%hi = fadd float %e2, %e3
170+
%total = fadd float %lo, %hi
171+
ret float %total
172+
}
173+
174+
; Invariant <8 x float> load from addrspace(1): the checks expect it to be split
; into two ld.global.nc.v4.f32 loads (offsets 16 and 0). Only the even lanes
; 0, 2, 4 and 6 are extracted and summed.
define float @ld_global_v8f32(ptr addrspace(1) %ptr) {
175+
; CHECK-LABEL: ld_global_v8f32(
176+
; CHECK: {
177+
; CHECK-NEXT: .reg .f32 %f<12>;
178+
; CHECK-NEXT: .reg .b64 %rd<2>;
179+
; CHECK-EMPTY:
180+
; CHECK-NEXT: // %bb.0:
181+
; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v8f32_param_0];
182+
; CHECK-NEXT: ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1+16];
183+
; CHECK-NEXT: ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1];
184+
; CHECK-NEXT: add.rn.f32 %f9, %f5, %f7;
185+
; CHECK-NEXT: add.rn.f32 %f10, %f1, %f3;
186+
; CHECK-NEXT: add.rn.f32 %f11, %f9, %f10;
187+
; CHECK-NEXT: st.param.f32 [func_retval0], %f11;
188+
; CHECK-NEXT: ret;
189+
%vec = load <8 x float>, ptr addrspace(1) %ptr, !invariant.load !0
190+
%e0 = extractelement <8 x float> %vec, i32 0
191+
%e2 = extractelement <8 x float> %vec, i32 2
192+
%e4 = extractelement <8 x float> %vec, i32 4
193+
%e6 = extractelement <8 x float> %vec, i32 6
194+
%lo = fadd float %e0, %e2
195+
%hi = fadd float %e4, %e6
196+
%total = fadd float %lo, %hi
197+
ret float %total
198+
}
199+
130200
define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
131201
; CHECK-LABEL: ld_global_v8i8(
132202
; CHECK: {

0 commit comments

Comments
 (0)