-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[AArch64] Keep floating-point conversion in SIMD #147707
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Guy David (guy-david) ChangesStores can be issued faster if the result is kept in the SIMD/FP registers. Full diff: https://github.com/llvm/llvm-project/pull/147707.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 811877ffacedb..68a1f41535680 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6598,6 +6598,16 @@ def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
(FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+let HasOneUse = 1 in {
+def fp_to_uint_oneuse : PatFrag<(ops node:$src0), (fp_to_uint $src0)>;
+def fp_to_sint_oneuse : PatFrag<(ops node:$src0), (fp_to_sint $src0)>;
+}
+
+def ignore_assertzext : PatFrag<
+ (ops node:$src),
+ (assertzext node:$src)
+>;
+
// Some float -> int -> float conversion patterns for which we want to keep the
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
@@ -6632,6 +6642,38 @@ def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
(UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
+// float -> int conversion followed by a store should use the value in the first
+// lane to avoid expensive fpr -> gpr transfers.
+let AddedComplexity = 19 in {
+// f32 -> i32
+def : Pat<(store (ignore_assertzext (i32 (fp_to_uint_oneuse f32:$src))), GPR64sp:$Rn),
+ (STRSui (FCVTZUv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>;
+def : Pat<(store (ignore_assertzext (i32 (fp_to_sint_oneuse f32:$src))), GPR64sp:$Rn),
+ (STRSui (FCVTZSv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>;
+
+// f64 -> i64
+def : Pat<(store (ignore_assertzext (i64 (fp_to_uint_oneuse f64:$src))), GPR64sp:$Rn),
+ (STRDui (FCVTZUv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>;
+def : Pat<(store (ignore_assertzext (i64 (fp_to_sint_oneuse f64:$src))), GPR64sp:$Rn),
+ (STRDui (FCVTZSv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>;
+
+// f32 -> i8
+def : Pat<(truncstorei8 (ignore_assertzext (i32 (fp_to_uint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+ (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), bsub)),
+ GPR64sp:$Rn, (i64 0))>;
+def : Pat<(truncstorei8 (ignore_assertzext (i32 (fp_to_sint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+ (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), bsub)),
+ GPR64sp:$Rn, (i64 0))>;
+
+// f32 -> i16
+def : Pat<(truncstorei16 (ignore_assertzext (i32 (fp_to_uint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+ (STRHui (f16 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), hsub)),
+ GPR64sp:$Rn, (i64 0))>;
+def : Pat<(truncstorei16 (ignore_assertzext (i32 (fp_to_sint_oneuse (f32 FPR32:$src)))), GPR64sp:$Rn),
+ (STRHui (f16 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), hsub)),
+ GPR64sp:$Rn, (i64 0))>;
+}
+
// fp16: integer extraction from vector must be at least 32-bits to be legal.
// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
new file mode 100644
index 0000000000000..ca12fcb1dcc1b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
+
+define void @f32_to_u8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_s8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_u16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ %trunc = trunc i32 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_s16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i16
+ store i16 %trunc, ptr %dst
+ ret void
+}
+
+define void @f32_to_u32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui float %f to i32
+ store i32 %conv, ptr %dst
+ ret void
+}
+
+define void @f32_to_s32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ store i32 %conv, ptr %dst
+ ret void
+}
+
+define void @f64_to_u64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptoui double %d to i64
+ store i64 %conv, ptr %dst
+ ret void
+}
+
+define void @f64_to_s64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi double %d to i64
+ store i64 %conv, ptr %dst
+ ret void
+}
+
+define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_i32_multiple_uses:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs w8, s0
+; CHECK-NEXT: mov x9, x0
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: strb w8, [x9]
+; CHECK-NEXT: ret
+entry:
+ %conv = fptosi float %f to i32
+ %trunc = trunc i32 %conv to i8
+ store i8 %trunc, ptr %dst
+ ret i32 %conv
+}
|
Stores can be issued faster if the result is kept in the SIMD/FP registers.
6b54790
to
c73288a
Compare
Stores can be issued faster if the result is kept in the SIMD/FP registers.
The
HasOneUse
guards against creating two floating point conversions, if for example there's some arithmetic done on the converted value as well. Another approach would be to inspect the user instructions during lowering, but I don't see that type of check in the lowering too often, so I settled for tablegen.