Skip to content

[AArch64] Keep floating-point conversion in SIMD #147707

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -6598,6 +6598,18 @@ def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
(FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;

// Variants of fp_to_uint / fp_to_sint with HasOneUse set, so the fragment is
// only matched when the conversion result has a single user.
def fp_to_uint_oneuse : PatFrag<(ops node:$src), (fp_to_uint $src)> {
  let HasOneUse = 1;
}
def fp_to_sint_oneuse : PatFrag<(ops node:$src), (fp_to_sint $src)> {
  let HasOneUse = 1;
}

// Matches a store performed through `storeop` where the stored value is
// either used as-is or is wrapped in an assertzext node.
class StoreMaybeAssertZext<PatFrag storeop>
    : PatFrags<(ops node:$val, node:$ptr),
               [(storeop node:$val, node:$ptr),
                (storeop (assertzext node:$val), node:$ptr)]>;

// Truncating i8 / i16 stores that tolerate an optional assertzext on the value.
def truncstorei8_maybe_assertzext
    : StoreMaybeAssertZext<truncstorei8>;
def truncstorei16_maybe_assertzext
    : StoreMaybeAssertZext<truncstorei16>;

// Some float -> int -> float conversion patterns for which we want to keep the
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
Expand Down Expand Up @@ -6632,6 +6644,38 @@ def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
(UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;

// float -> int conversion followed by a store should use the value in the first
// lane to avoid expensive fpr -> gpr transfers: the conversion is selected to
// the scalar SIMD form of FCVTZ[SU] and the result is stored directly from the
// FP/SIMD register. The *_oneuse fragments restrict this to conversions whose
// only user is the store.
//
// These patterns emit the scalar SIMD FCVTZ[SU]v1i32 / v1i64 instructions, so
// they are gated on the same predicate as the neighbouring NEON conversion
// patterns instead of being selectable unconditionally.
// NOTE(review): confirm this is the predicate guarding the FCVTZ[SU]v1*
// instruction definitions.
let Predicates = [HasNEONandIsSME2p2StreamingSafe], AddedComplexity = 19 in {
  // f32 -> i32
  def : Pat<(store (i32 (fp_to_uint_oneuse f32:$src)), GPR64sp:$Rn),
            (STRSui (FCVTZUv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>;
  def : Pat<(store (i32 (fp_to_sint_oneuse f32:$src)), GPR64sp:$Rn),
            (STRSui (FCVTZSv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>;

  // f64 -> i64
  def : Pat<(store (i64 (fp_to_uint_oneuse f64:$src)), GPR64sp:$Rn),
            (STRDui (FCVTZUv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>;
  def : Pat<(store (i64 (fp_to_sint_oneuse f64:$src)), GPR64sp:$Rn),
            (STRDui (FCVTZSv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>;

  // f32 -> i8: convert to a 32-bit result, then store only its bsub
  // sub-register.
  def : Pat<(truncstorei8_maybe_assertzext
                (i32 (fp_to_uint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn),
            (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), bsub)),
                    GPR64sp:$Rn, (i64 0))>;
  def : Pat<(truncstorei8_maybe_assertzext
                (i32 (fp_to_sint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn),
            (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), bsub)),
                    GPR64sp:$Rn, (i64 0))>;

  // f32 -> i16: convert to a 32-bit result, then store only its hsub
  // sub-register.
  def : Pat<(truncstorei16_maybe_assertzext
                (i32 (fp_to_uint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn),
            (STRHui (f16 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), hsub)),
                    GPR64sp:$Rn, (i64 0))>;
  def : Pat<(truncstorei16_maybe_assertzext
                (i32 (fp_to_sint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn),
            (STRHui (f16 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), hsub)),
                    GPR64sp:$Rn, (i64 0))>;
}

// fp16: integer extraction from vector must be at least 32-bits to be legal.
// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
Expand Down
117 changes: 117 additions & 0 deletions llvm/test/CodeGen/AArch64/store-float-conversion.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s

; Unsigned f32 -> i32 conversion truncated to i8 and stored: the expected code
; converts within the FP register (fcvtzu s0, s0) and stores the byte from b0.
define void @f32_to_u8(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_u8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu s0, s0
; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptoui float %f to i32
%trunc = trunc i32 %conv to i8
store i8 %trunc, ptr %dst
ret void
}

; Signed f32 -> i32 conversion truncated to i8 and stored: the expected code
; converts within the FP register (fcvtzs s0, s0) and stores the byte from b0.
define void @f32_to_s8(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_s8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs s0, s0
; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptosi float %f to i32
%trunc = trunc i32 %conv to i8
store i8 %trunc, ptr %dst
ret void
}

; Unsigned f32 -> i32 conversion truncated to i16 and stored: the expected code
; converts within the FP register (fcvtzu s0, s0) and stores the halfword
; from h0.
define void @f32_to_u16(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_u16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu s0, s0
; CHECK-NEXT: str h0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptoui float %f to i32
%trunc = trunc i32 %conv to i16
store i16 %trunc, ptr %dst
ret void
}

; Signed f32 -> i32 conversion truncated to i16 and stored: the expected code
; converts within the FP register (fcvtzs s0, s0) and stores the halfword
; from h0.
define void @f32_to_s16(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_s16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs s0, s0
; CHECK-NEXT: str h0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptosi float %f to i32
%trunc = trunc i32 %conv to i16
store i16 %trunc, ptr %dst
ret void
}

; Unsigned f32 -> i32 conversion stored directly: the expected code converts
; within the FP register (fcvtzu s0, s0) and stores the word from s0.
define void @f32_to_u32(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_u32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu s0, s0
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptoui float %f to i32
store i32 %conv, ptr %dst
ret void
}

; Signed f32 -> i32 conversion stored directly: the expected code converts
; within the FP register (fcvtzs s0, s0) and stores the word from s0.
define void @f32_to_s32(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs s0, s0
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptosi float %f to i32
store i32 %conv, ptr %dst
ret void
}

; Unsigned f64 -> i64 conversion stored directly: the expected code converts
; within the FP register (fcvtzu d0, d0) and stores the doubleword from d0.
define void @f64_to_u64(double %d, ptr %dst) {
; CHECK-LABEL: f64_to_u64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu d0, d0
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptoui double %d to i64
store i64 %conv, ptr %dst
ret void
}

; Signed f64 -> i64 conversion stored directly: the expected code converts
; within the FP register (fcvtzs d0, d0) and stores the doubleword from d0.
define void @f64_to_s64(double %d, ptr %dst) {
; CHECK-LABEL: f64_to_s64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs d0, d0
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptosi double %d to i64
store i64 %conv, ptr %dst
ret void
}

; Negative test: the conversion result is both truncated/stored and returned,
; so it has more than one use. The expected code keeps the GPR form
; (fcvtzs w8, s0 followed by strb) rather than storing from an FP register.
define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_i32_multiple_uses:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs w8, s0
; CHECK-NEXT: mov x9, x0
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: strb w8, [x9]
; CHECK-NEXT: ret
entry:
%conv = fptosi float %f to i32
%trunc = trunc i32 %conv to i8
store i8 %trunc, ptr %dst
ret i32 %conv
}
38 changes: 19 additions & 19 deletions llvm/test/CodeGen/AArch64/tbl-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcvtzs w11, s2
; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: fcsel s3, s0, s4, mi
; CHECK-NEXT: subs w10, w10, #1
; CHECK-NEXT: strb w11, [x9]
; CHECK-NEXT: fcvtzs w12, s3
; CHECK-NEXT: strb w12, [x9, #1]
; CHECK-NEXT: str b2, [x9]
; CHECK-NEXT: fcvtzs s3, s3
; CHECK-NEXT: stur b3, [x9, #1]
; CHECK-NEXT: add x9, x9, #2
; CHECK-NEXT: b.ne .LBB1_6
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
Expand Down Expand Up @@ -395,19 +395,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: ldr s3, [x8, #8]
; CHECK-NEXT: fcvtzs w11, s2
; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: add x8, x8, #12
; CHECK-NEXT: fcsel s4, s0, s4, mi
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: strb w11, [x9]
; CHECK-NEXT: str b2, [x9]
; CHECK-NEXT: fcsel s5, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcvtzs w12, s4
; CHECK-NEXT: fcvtzs s4, s4
; CHECK-NEXT: fcsel s3, s0, s5, mi
; CHECK-NEXT: subs w10, w10, #1
; CHECK-NEXT: strb w12, [x9, #1]
; CHECK-NEXT: fcvtzs w13, s3
; CHECK-NEXT: strb w13, [x9, #2]
; CHECK-NEXT: stur b4, [x9, #1]
; CHECK-NEXT: fcvtzs s3, s3
; CHECK-NEXT: stur b3, [x9, #2]
; CHECK-NEXT: add x9, x9, #3
; CHECK-NEXT: b.ne .LBB2_8
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
Expand Down Expand Up @@ -563,26 +563,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcvtzs w11, s2
; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: ldp s3, s5, [x8, #8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: fcsel s4, s0, s4, mi
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: strb w11, [x9]
; CHECK-NEXT: fcvtzs w12, s4
; CHECK-NEXT: str b2, [x9]
; CHECK-NEXT: fcvtzs s4, s4
; CHECK-NEXT: fcsel s6, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcsel s3, s0, s6, mi
; CHECK-NEXT: fcmp s5, s1
; CHECK-NEXT: strb w12, [x9, #1]
; CHECK-NEXT: stur b4, [x9, #1]
; CHECK-NEXT: fcsel s6, s1, s5, gt
; CHECK-NEXT: fcmp s5, #0.0
; CHECK-NEXT: fcvtzs w13, s3
; CHECK-NEXT: fcsel s2, s0, s6, mi
; CHECK-NEXT: fcvtzs s3, s3
; CHECK-NEXT: fcsel s5, s0, s6, mi
; CHECK-NEXT: subs w10, w10, #1
; CHECK-NEXT: strb w13, [x9, #2]
; CHECK-NEXT: fcvtzs w14, s2
; CHECK-NEXT: strb w14, [x9, #3]
; CHECK-NEXT: stur b3, [x9, #2]
; CHECK-NEXT: fcvtzs s5, s5
; CHECK-NEXT: stur b5, [x9, #3]
; CHECK-NEXT: add x9, x9, #4
; CHECK-NEXT: b.ne .LBB3_6
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup
Expand Down