[NVPTX] add coverage for v2f32 in ldg-invariant and fp-contract

Prince781 · Prince781 · commit 520da7984feb · 2025-03-14T19:00:33.000-07:00
for fp-contract:
- test folding of fma.f32x2
- bump SM version to 100

for ldg-invariant:
- test proper splitting of loads on vectors of f32
diff --git a/llvm/test/CodeGen/NVPTX/fp-contract.ll b/llvm/test/CodeGen/NVPTX/fp-contract.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefix=DEFAULT
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch=sm_100 %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch=sm_100 %}
 
 target triple = "nvptx64-unknown-cuda"
 
@@ -33,3 +33,25 @@ define float @t1(float %a, float %b) {
   %v1 = fadd float %a, %b
   ret float %v1
 }
+
+;; FAST-LABEL: @t0_v2
+;; DEFAULT-LABEL: @t0_v2
+define <2 x float> @t0_v2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+;; FAST: fma.rn.f32x2
+;; DEFAULT: mul.rn.f32x2
+;; DEFAULT: add.rn.f32x2
+  %v0 = fmul <2 x float> %a, %b
+  %v1 = fadd <2 x float> %v0, %c
+  ret <2 x float> %v1
+}
+
+;; FAST-LABEL: @t1_v2
+;; DEFAULT-LABEL: @t1_v2
+define <2 x float> @t1_v2(<2 x float> %a, <2 x float> %b) {
+;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
+;; to prevent ptxas from fusing this with anything else.
+;; FAST: add.f32
+;; DEFAULT: add.rn.f32
+  %v1 = fadd <2 x float> %a, %b
+  ret <2 x float> %v1
+}
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -127,6 +127,76 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
   ret half %sum
 }
 
+define float @ld_global_v2f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: ld_global_v2f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<4>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2f32_param_0];
+; CHECK-NEXT:    ld.global.nc.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT:    ret;
+  %a = load <2 x float>, ptr addrspace(1) %ptr, !invariant.load !0
+  %v1 = extractelement <2 x float> %a, i32 0
+  %v2 = extractelement <2 x float> %a, i32 1
+  %sum = fadd float %v1, %v2
+  ret float %sum
+}
+
+define float @ld_global_v4f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: ld_global_v4f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<8>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4f32_param_0];
+; CHECK-NEXT:    ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f5, %f1, %f2;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, %f4;
+; CHECK-NEXT:    add.rn.f32 %f7, %f5, %f6;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f7;
+; CHECK-NEXT:    ret;
+  %a = load <4 x float>, ptr addrspace(1) %ptr, !invariant.load !0
+  %v1 = extractelement <4 x float> %a, i32 0
+  %v2 = extractelement <4 x float> %a, i32 1
+  %v3 = extractelement <4 x float> %a, i32 2
+  %v4 = extractelement <4 x float> %a, i32 3
+  %sum1 = fadd float %v1, %v2
+  %sum2 = fadd float %v3, %v4
+  %sum = fadd float %sum1, %sum2
+  ret float %sum
+}
+
+define float @ld_global_v8f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: ld_global_v8f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<12>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8f32_param_0];
+; CHECK-NEXT:    ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1+16];
+; CHECK-NEXT:    ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f9, %f5, %f7;
+; CHECK-NEXT:    add.rn.f32 %f10, %f1, %f3;
+; CHECK-NEXT:    add.rn.f32 %f11, %f9, %f10;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f11;
+; CHECK-NEXT:    ret;
+  %a = load <8 x float>, ptr addrspace(1) %ptr, !invariant.load !0
+  %v1 = extractelement <8 x float> %a, i32 0
+  %v2 = extractelement <8 x float> %a, i32 2
+  %v3 = extractelement <8 x float> %a, i32 4
+  %v4 = extractelement <8 x float> %a, i32 6
+  %sum1 = fadd float %v1, %v2
+  %sum2 = fadd float %v3, %v4
+  %sum = fadd float %sum1, %sum2
+  ret float %sum
+}
+
 define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: ld_global_v8i8(
 ; CHECK:       {