Skip to content

[AArch64] Keep floating-point conversion in SIMD #147707

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

guy-david
Copy link
Contributor

@guy-david guy-david commented Jul 9, 2025

Stores can be issued faster if the conversion result is kept in the SIMD/FP registers.
The HasOneUse predicate guards against emitting two floating-point conversions when the converted value has other users, for example if some arithmetic is also done on it. Another approach would be to inspect the user instructions during lowering, but I don't see that type of check in the lowering very often, so I settled for TableGen.

@llvmbot
Copy link
Member

llvmbot commented Jul 9, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Guy David (guy-david)

Changes

Stores can be issued faster if the conversion result is kept in the SIMD/FP registers.
The HasOneUse predicate guards against emitting two floating-point conversions when the converted value has other users, for example if some arithmetic is also done on it. Another approach would be to inspect the user instructions during lowering, but I don't see that pattern very often, so I settled for TableGen.


Full diff: https://github.com/llvm/llvm-project/pull/147707.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+42)
  • (added) llvm/test/CodeGen/AArch64/store-float-conversion.ll (+117)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 811877ffacedb..68a1f41535680 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6598,6 +6598,16 @@ def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
 def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
           (FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
 
+let HasOneUse = 1 in {
+def fp_to_uint_oneuse : PatFrag<(ops node:$src0), (fp_to_uint $src0)>;
+def fp_to_sint_oneuse : PatFrag<(ops node:$src0), (fp_to_sint $src0)>;
+}
+
+def ignore_assertzext : PatFrag<
+  (ops node:$src),
+  (assertzext node:$src)
+>;
+
 // Some float -> int -> float conversion patterns for which we want to keep the
 // int values in FP registers using the corresponding NEON instructions to
 // avoid more costly int <-> fp register transfers.
@@ -6632,6 +6642,38 @@ def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
 def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
           (UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
 
+// A float -> int conversion used only by a store is kept in the SIMD/FP
+// registers and stored from there to avoid an expensive fpr -> gpr transfer.
+let AddedComplexity = 19 in {
+// f32 -> i32
+def : Pat<(store (ignore_assertzext (i32 (fp_to_uint_oneuse f32:$src))), GPR64sp:$Rn),
+          (STRSui (FCVTZUv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>;
+def : Pat<(store (ignore_assertzext (i32 (fp_to_sint_oneuse f32:$src))), GPR64sp:$Rn),
+          (STRSui (FCVTZSv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>;
+
+// f64 -> i64
+def : Pat<(store (ignore_assertzext (i64 (fp_to_uint_oneuse f64:$src))), GPR64sp:$Rn),
+          (STRDui (FCVTZUv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>;
+def : Pat<(store (ignore_assertzext (i64 (fp_to_sint_oneuse f64:$src))), GPR64sp:$Rn),
+          (STRDui (FCVTZSv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>;
+
+// f32 -> i8
+def : Pat<(truncstorei8 (ignore_assertzext (i32 (fp_to_uint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+          (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), bsub)),
+                  GPR64sp:$Rn, (i64 0))>;
+def : Pat<(truncstorei8 (ignore_assertzext (i32 (fp_to_sint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+          (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), bsub)),
+                  GPR64sp:$Rn, (i64 0))>;
+
+// f32 -> i16: convert within FPR32, then store its hsub subregister (STRHui).
+def : Pat<(truncstorei16 (ignore_assertzext (i32 (fp_to_uint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+          (STRHui (f16 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), hsub)),
+                  GPR64sp:$Rn, (i64 0))>;
+def : Pat<(truncstorei16 (ignore_assertzext (i32 (fp_to_sint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+          (STRHui (f16 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), hsub)),
+                  GPR64sp:$Rn, (i64 0))>;
+}
+
 // fp16: integer extraction from vector must be at least 32-bits to be legal.
 // Actual extraction result is then an in-reg sign-extension of lower 16-bits.
 let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
new file mode 100644
index 0000000000000..ca12fcb1dcc1b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
+
+define void @f32_to_u8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu s0, s0
+; CHECK-NEXT:    str b0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptoui float %f to i32
+  %trunc = trunc i32 %conv to i8
+  store i8 %trunc, ptr %dst
+  ret void
+}
+
+define void @f32_to_s8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs s0, s0
+; CHECK-NEXT:    str b0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi float %f to i32
+  %trunc = trunc i32 %conv to i8
+  store i8 %trunc, ptr %dst
+  ret void
+}
+
+define void @f32_to_u16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu s0, s0
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptoui float %f to i32
+  %trunc = trunc i32 %conv to i16
+  store i16 %trunc, ptr %dst
+  ret void
+}
+
+define void @f32_to_s16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs s0, s0
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi float %f to i32
+  %trunc = trunc i32 %conv to i16
+  store i16 %trunc, ptr %dst
+  ret void
+}
+
+define void @f32_to_u32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu s0, s0
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptoui float %f to i32
+  store i32 %conv, ptr %dst
+  ret void
+}
+
+define void @f32_to_s32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs s0, s0
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi float %f to i32
+  store i32 %conv, ptr %dst
+  ret void
+}
+
+define void @f64_to_u64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu d0, d0
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptoui double %d to i64
+  store i64 %conv, ptr %dst
+  ret void
+}
+
+define void @f64_to_s64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs d0, d0
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi double %d to i64
+  store i64 %conv, ptr %dst
+  ret void
+}
+
+define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_i32_multiple_uses:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs w8, s0
+; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    strb w8, [x9]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi float %f to i32
+  %trunc = trunc i32 %conv to i8
+  store i8 %trunc, ptr %dst
+  ret i32 %conv
+}

Stores can be issued faster if the result is kept in the SIMD/FP registers.
@guy-david guy-david force-pushed the users/guy-david/aarch64-n2i-keep-in-simd branch from 6b54790 to c73288a Compare July 9, 2025 15:02
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants