diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 811877ffacedb..e3c2bae918658 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6598,6 +6598,18 @@ def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))), def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), (FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>; +let HasOneUse = 1 in { +def fp_to_uint_oneuse : PatFrag<(ops node:$src0), (fp_to_uint $src0)>; +def fp_to_sint_oneuse : PatFrag<(ops node:$src0), (fp_to_sint $src0)>; +} + +class StoreMaybeAssertZext : PatFrags<(ops node:$val, node:$ptr), + [(op node:$val, node:$ptr), + (op (assertzext node:$val), node:$ptr)]>; + +def truncstorei8_maybe_assertzext : StoreMaybeAssertZext; +def truncstorei16_maybe_assertzext : StoreMaybeAssertZext; + // Some float -> int -> float conversion patterns for which we want to keep the // int values in FP registers using the corresponding NEON instructions to // avoid more costly int <-> fp register transfers. @@ -6632,6 +6644,38 @@ def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))), def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))), (UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>; +// float -> int conversion followed by a store should use the value in the first +// lane to avoid expensive fpr -> gpr transfers. +let AddedComplexity = 19 in { +// f32 -> i32 +def : Pat<(store (i32 (fp_to_uint_oneuse f32:$src)), GPR64sp:$Rn), + (STRSui (FCVTZUv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>; +def : Pat<(store (i32 (fp_to_sint_oneuse f32:$src)), GPR64sp:$Rn), + (STRSui (FCVTZSv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>; + +// f64 -> i64 +def : Pat<(store (i64 (fp_to_uint_oneuse f64:$src)), GPR64sp:$Rn), + (STRDui (FCVTZUv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>; +def : Pat<(store (i64 (fp_to_sint_oneuse f64:$src)), GPR64sp:$Rn), + (STRDui (FCVTZSv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>; + +// f32 -> i8 +def : Pat<(truncstorei8_maybe_assertzext (i32 (fp_to_uint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn), + (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), bsub)), + GPR64sp:$Rn, (i64 0))>; +def : Pat<(truncstorei8_maybe_assertzext (i32 (fp_to_sint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn), + (STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), bsub)), + GPR64sp:$Rn, (i64 0))>; + +// f32 -> i16 +def : Pat<(truncstorei16_maybe_assertzext (i32 (fp_to_uint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn), + (STRHui (f16 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), hsub)), + GPR64sp:$Rn, (i64 0))>; +def : Pat<(truncstorei16_maybe_assertzext (i32 (fp_to_sint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn), + (STRHui (f16 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), hsub)), + GPR64sp:$Rn, (i64 0))>; +} + // fp16: integer extraction from vector must be at least 32-bits to be legal. // Actual extraction result is then an in-reg sign-extension of lower 16-bits. let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in { diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll new file mode 100644 index 0000000000000..ca12fcb1dcc1b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s + +define void @f32_to_u8(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_u8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu s0, s0 +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret +entry: + %conv = fptoui float %f to i32 + %trunc = trunc i32 %conv to i8 + store i8 %trunc, ptr %dst + ret void +} + +define void @f32_to_s8(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs s0, s0 +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret +entry: + %conv = fptosi float %f to i32 + %trunc = trunc i32 %conv to i8 + store i8 %trunc, ptr %dst + ret void +} + +define void @f32_to_u16(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_u16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu s0, s0 +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret +entry: + %conv = fptoui float %f to i32 + %trunc = trunc i32 %conv to i16 + store i16 %trunc, ptr %dst + ret void +} + +define void @f32_to_s16(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs s0, s0 +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret +entry: + %conv = fptosi float %f to i32 + %trunc = trunc i32 %conv to i16 + store i16 %trunc, ptr %dst + ret void +} + +define void @f32_to_u32(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_u32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu s0, s0 +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret +entry: + %conv = fptoui float %f to i32 + store i32 %conv, ptr %dst + ret void +} + +define void @f32_to_s32(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs s0, s0 +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret +entry: + %conv = fptosi float %f to i32 + store i32 %conv, ptr %dst + ret void +} + +define void @f64_to_u64(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_u64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu d0, d0 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +entry: + %conv = fptoui double %d to i64 + store i64 %conv, ptr %dst + ret void +} + +define void @f64_to_s64(double %d, ptr %dst) { +; CHECK-LABEL: f64_to_s64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +entry: + %conv = fptosi double %d to i64 + store i64 %conv, ptr %dst + ret void +} + +define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) { +; CHECK-LABEL: f32_to_i32_multiple_uses: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs w8, s0 +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: strb w8, [x9] +; CHECK-NEXT: ret +entry: + %conv = fptosi float %f to i32 + %trunc = trunc i32 %conv to i8 + store i8 %trunc, ptr %dst + ret i32 %conv +} diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll index aa0a163b96ac8..fe4e651f2955b 100644 --- a/llvm/test/CodeGen/AArch64/tbl-loops.ll +++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll @@ -178,12 +178,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: fcmp s3, s1 ; CHECK-NEXT: fcsel s4, s1, s3, gt ; CHECK-NEXT: fcmp s3, #0.0 -; CHECK-NEXT: fcvtzs w11, s2 +; CHECK-NEXT: fcvtzs s2, s2 ; CHECK-NEXT: fcsel s3, s0, s4, mi ; CHECK-NEXT: subs w10, w10, #1 -; CHECK-NEXT: strb w11, [x9] -; CHECK-NEXT: fcvtzs w12, s3 -; CHECK-NEXT: strb w12, [x9, #1] +; CHECK-NEXT: str b2, [x9] +; CHECK-NEXT: fcvtzs s3, s3 +; CHECK-NEXT: stur b3, [x9, #1] ; CHECK-NEXT: add x9, x9, #2 ; CHECK-NEXT: b.ne .LBB1_6 ; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup @@ -395,19 +395,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: fcsel s4, s1, s3, gt ; CHECK-NEXT: fcmp s3, #0.0 ; CHECK-NEXT: ldr s3, [x8, #8] -; CHECK-NEXT: fcvtzs w11, s2 +; CHECK-NEXT: fcvtzs s2, s2 ; CHECK-NEXT: add x8, x8, #12 ; CHECK-NEXT: fcsel s4, s0, s4, mi ; CHECK-NEXT: fcmp s3, s1 -; CHECK-NEXT: strb w11, [x9] +; CHECK-NEXT: str b2, [x9] ; CHECK-NEXT: fcsel s5, s1, s3, gt ; CHECK-NEXT: fcmp s3, #0.0 -; CHECK-NEXT: fcvtzs w12, s4 +; CHECK-NEXT: fcvtzs s4, s4 ; CHECK-NEXT: fcsel s3, s0, s5, mi ; CHECK-NEXT: subs w10, w10, #1 -; CHECK-NEXT: strb w12, [x9, #1] -; CHECK-NEXT: fcvtzs w13, s3 -; CHECK-NEXT: strb w13, [x9, #2] +; CHECK-NEXT: stur b4, [x9, #1] +; CHECK-NEXT: fcvtzs s3, s3 +; CHECK-NEXT: stur b3, [x9, #2] ; CHECK-NEXT: add x9, x9, #3 ; CHECK-NEXT: b.ne .LBB2_8 ; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup @@ -563,26 +563,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: fcmp s3, s1 ; CHECK-NEXT: fcsel s4, s1, s3, gt ; CHECK-NEXT: fcmp s3, #0.0 -; CHECK-NEXT: fcvtzs w11, s2 +; CHECK-NEXT: fcvtzs s2, s2 ; CHECK-NEXT: ldp s3, s5, [x8, #8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: fcsel s4, s0, s4, mi ; CHECK-NEXT: fcmp s3, s1 -; CHECK-NEXT: strb w11, [x9] -; CHECK-NEXT: fcvtzs w12, s4 +; CHECK-NEXT: str b2, [x9] +; CHECK-NEXT: fcvtzs s4, s4 ; CHECK-NEXT: fcsel s6, s1, s3, gt ; CHECK-NEXT: fcmp s3, #0.0 ; CHECK-NEXT: fcsel s3, s0, s6, mi ; CHECK-NEXT: fcmp s5, s1 -; CHECK-NEXT: strb w12, [x9, #1] +; CHECK-NEXT: stur b4, [x9, #1] ; CHECK-NEXT: fcsel s6, s1, s5, gt ; CHECK-NEXT: fcmp s5, #0.0 -; CHECK-NEXT: fcvtzs w13, s3 -; CHECK-NEXT: fcsel s2, s0, s6, mi +; CHECK-NEXT: fcvtzs s3, s3 +; CHECK-NEXT: fcsel s5, s0, s6, mi ; CHECK-NEXT: subs w10, w10, #1 -; CHECK-NEXT: strb w13, [x9, #2] -; CHECK-NEXT: fcvtzs w14, s2 -; CHECK-NEXT: strb w14, [x9, #3] +; CHECK-NEXT: stur b3, [x9, #2] +; CHECK-NEXT: fcvtzs s5, s5 +; CHECK-NEXT: stur b5, [x9, #3] ; CHECK-NEXT: add x9, x9, #4 ; CHECK-NEXT: b.ne .LBB3_6 ; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup