Skip to content

Commit 54e0bfd

Browse files
committed
[AArch64] Keep floating-point conversion in SIMD
Stores can be issued faster if the result is kept in the SIMD/FP registers.
1 parent a7a7e95 commit 54e0bfd

File tree

3 files changed

+180
-19
lines changed

3 files changed

+180
-19
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6598,6 +6598,18 @@ def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
65986598
def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
65996599
(FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
66006600

6601+
// Single-use variants of fp_to_uint / fp_to_sint. With HasOneUse set, these
// fragments are only meant to match when the conversion result has exactly one
// user, so a pattern built on them does not apply to a conversion whose value
// is also needed elsewhere.
let HasOneUse = 1 in {
6602+
def fp_to_uint_oneuse : PatFrag<(ops node:$src0), (fp_to_uint $src0)>;
6603+
def fp_to_sint_oneuse : PatFrag<(ops node:$src0), (fp_to_sint $src0)>;
6604+
}
6605+
6606+
// Wraps a store fragment `op` so that it matches in two forms: `op` applied
// directly to $val, or `op` applied to $val wrapped in an assertzext node.
// $ptr is passed through unchanged in both alternatives.
class StoreMaybeAssertZext<PatFrag op> : PatFrags<(ops node:$val, node:$ptr),
6607+
[(op node:$val, node:$ptr),
6608+
(op (assertzext node:$val), node:$ptr)]>;
6609+
6610+
// Instantiations for the 8-bit and 16-bit truncating stores.
def truncstorei8_maybe_assertzext : StoreMaybeAssertZext<truncstorei8>;
6611+
def truncstorei16_maybe_assertzext : StoreMaybeAssertZext<truncstorei16>;
6612+
66016613
// Some float -> int -> float conversion patterns for which we want to keep the
66026614
// int values in FP registers using the corresponding NEON instructions to
66036615
// avoid more costly int <-> fp register transfers.
@@ -6632,6 +6644,38 @@ def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
66326644
def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
66336645
(UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
66346646

6647+
// float -> int conversion followed by a store should use the value in the first
6648+
// lane to avoid expensive fpr -> gpr transfers.
6649+
// All patterns below match a store (or truncating store) whose value operand is
// a single-use fp_to_uint / fp_to_sint, and select a scalar FCVTZU/FCVTZS
// (v1i32 / v1i64 form) followed by an FP-register store with immediate 0.
// AddedComplexity makes these patterns preferred over competing matches.
// NOTE(review): the reason for the specific value 19 is not visible here -
// confirm against the complexity of the patterns it has to outrank.
let AddedComplexity = 19 in {
6650+
// f32 -> i32
6651+
def : Pat<(store (i32 (fp_to_uint_oneuse f32:$src)), GPR64sp:$Rn),
6652+
(STRSui (FCVTZUv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>;
6653+
def : Pat<(store (i32 (fp_to_sint_oneuse f32:$src)), GPR64sp:$Rn),
6654+
(STRSui (FCVTZSv1i32 f32:$src), GPR64sp:$Rn, (i64 0))>;
6655+
6656+
// f64 -> i64
6657+
def : Pat<(store (i64 (fp_to_uint_oneuse f64:$src)), GPR64sp:$Rn),
6658+
(STRDui (FCVTZUv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>;
6659+
def : Pat<(store (i64 (fp_to_sint_oneuse f64:$src)), GPR64sp:$Rn),
6660+
(STRDui (FCVTZSv1i64 f64:$src), GPR64sp:$Rn, (i64 0))>;
6661+
6662+
// f32 -> i8
// The conversion still produces a 32-bit result; only its bsub sub-register is
// extracted (typed as aarch64mfp8) and handed to STRBui.
6663+
def : Pat<(truncstorei8_maybe_assertzext (i32 (fp_to_uint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn),
6664+
(STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), bsub)),
6665+
GPR64sp:$Rn, (i64 0))>;
6666+
def : Pat<(truncstorei8_maybe_assertzext (i32 (fp_to_sint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn),
6667+
(STRBui (aarch64mfp8 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), bsub)),
6668+
GPR64sp:$Rn, (i64 0))>;
6669+
6670+
// f32 -> i16
// Same scheme as the i8 case, using the hsub sub-register (typed as f16) and
// STRHui.
6671+
def : Pat<(truncstorei16_maybe_assertzext (i32 (fp_to_uint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn),
6672+
(STRHui (f16 (EXTRACT_SUBREG (FCVTZUv1i32 (f32 FPR32:$src)), hsub)),
6673+
GPR64sp:$Rn, (i64 0))>;
6674+
def : Pat<(truncstorei16_maybe_assertzext (i32 (fp_to_sint_oneuse (f32 FPR32:$src))), GPR64sp:$Rn),
6675+
(STRHui (f16 (EXTRACT_SUBREG (FCVTZSv1i32 (f32 FPR32:$src)), hsub)),
6676+
GPR64sp:$Rn, (i64 0))>;
6677+
}
6678+
66356679
// fp16: integer extraction from vector must be at least 32-bits to be legal.
66366680
// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
66376681
let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
3+
4+
; Unsigned float -> i32 conversion, truncated to i8 and stored. The expected
; assembly converts within the FP register (s0 -> s0) and stores the byte from
; b0, with no general-purpose register involved.
define void @f32_to_u8(float %f, ptr %dst) {
5+
; CHECK-LABEL: f32_to_u8:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: fcvtzu s0, s0
8+
; CHECK-NEXT: str b0, [x0]
9+
; CHECK-NEXT: ret
10+
entry:
11+
%conv = fptoui float %f to i32
12+
%trunc = trunc i32 %conv to i8
13+
store i8 %trunc, ptr %dst
14+
ret void
15+
}
16+
17+
; Signed float -> i32 conversion, truncated to i8 and stored. The expected
; assembly converts within the FP register (s0 -> s0) and stores the byte from
; b0, with no general-purpose register involved.
define void @f32_to_s8(float %f, ptr %dst) {
18+
; CHECK-LABEL: f32_to_s8:
19+
; CHECK: // %bb.0: // %entry
20+
; CHECK-NEXT: fcvtzs s0, s0
21+
; CHECK-NEXT: str b0, [x0]
22+
; CHECK-NEXT: ret
23+
entry:
24+
%conv = fptosi float %f to i32
25+
%trunc = trunc i32 %conv to i8
26+
store i8 %trunc, ptr %dst
27+
ret void
28+
}
29+
30+
; Unsigned float -> i32 conversion, truncated to i16 and stored. The expected
; assembly converts within the FP register (s0 -> s0) and stores the halfword
; from h0, with no general-purpose register involved.
define void @f32_to_u16(float %f, ptr %dst) {
31+
; CHECK-LABEL: f32_to_u16:
32+
; CHECK: // %bb.0: // %entry
33+
; CHECK-NEXT: fcvtzu s0, s0
34+
; CHECK-NEXT: str h0, [x0]
35+
; CHECK-NEXT: ret
36+
entry:
37+
%conv = fptoui float %f to i32
38+
%trunc = trunc i32 %conv to i16
39+
store i16 %trunc, ptr %dst
40+
ret void
41+
}
42+
43+
; Signed float -> i32 conversion, truncated to i16 and stored. The expected
; assembly converts within the FP register (s0 -> s0) and stores the halfword
; from h0, with no general-purpose register involved.
define void @f32_to_s16(float %f, ptr %dst) {
44+
; CHECK-LABEL: f32_to_s16:
45+
; CHECK: // %bb.0: // %entry
46+
; CHECK-NEXT: fcvtzs s0, s0
47+
; CHECK-NEXT: str h0, [x0]
48+
; CHECK-NEXT: ret
49+
entry:
50+
%conv = fptosi float %f to i32
51+
%trunc = trunc i32 %conv to i16
52+
store i16 %trunc, ptr %dst
53+
ret void
54+
}
55+
56+
; Unsigned float -> i32 conversion stored directly. The expected assembly
; converts within the FP register (s0 -> s0) and stores s0, with no
; general-purpose register involved.
define void @f32_to_u32(float %f, ptr %dst) {
57+
; CHECK-LABEL: f32_to_u32:
58+
; CHECK: // %bb.0: // %entry
59+
; CHECK-NEXT: fcvtzu s0, s0
60+
; CHECK-NEXT: str s0, [x0]
61+
; CHECK-NEXT: ret
62+
entry:
63+
%conv = fptoui float %f to i32
64+
store i32 %conv, ptr %dst
65+
ret void
66+
}
67+
68+
; Signed float -> i32 conversion stored directly. The expected assembly
; converts within the FP register (s0 -> s0) and stores s0, with no
; general-purpose register involved.
define void @f32_to_s32(float %f, ptr %dst) {
69+
; CHECK-LABEL: f32_to_s32:
70+
; CHECK: // %bb.0: // %entry
71+
; CHECK-NEXT: fcvtzs s0, s0
72+
; CHECK-NEXT: str s0, [x0]
73+
; CHECK-NEXT: ret
74+
entry:
75+
%conv = fptosi float %f to i32
76+
store i32 %conv, ptr %dst
77+
ret void
78+
}
79+
80+
; Unsigned double -> i64 conversion stored directly. The expected assembly
; converts within the FP register (d0 -> d0) and stores d0, with no
; general-purpose register involved.
define void @f64_to_u64(double %d, ptr %dst) {
81+
; CHECK-LABEL: f64_to_u64:
82+
; CHECK: // %bb.0: // %entry
83+
; CHECK-NEXT: fcvtzu d0, d0
84+
; CHECK-NEXT: str d0, [x0]
85+
; CHECK-NEXT: ret
86+
entry:
87+
%conv = fptoui double %d to i64
88+
store i64 %conv, ptr %dst
89+
ret void
90+
}
91+
92+
; Signed double -> i64 conversion stored directly. The expected assembly
; converts within the FP register (d0 -> d0) and stores d0, with no
; general-purpose register involved.
define void @f64_to_s64(double %d, ptr %dst) {
93+
; CHECK-LABEL: f64_to_s64:
94+
; CHECK: // %bb.0: // %entry
95+
; CHECK-NEXT: fcvtzs d0, d0
96+
; CHECK-NEXT: str d0, [x0]
97+
; CHECK-NEXT: ret
98+
entry:
99+
%conv = fptosi double %d to i64
100+
store i64 %conv, ptr %dst
101+
ret void
102+
}
103+
104+
; Negative test: the converted value is both truncated-and-stored and returned,
; so it has more than one use. The expected assembly keeps the conversion in a
; general-purpose register (w8) and uses an integer byte store (strb) instead
; of an FP-register store.
define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
105+
; CHECK-LABEL: f32_to_i32_multiple_uses:
106+
; CHECK: // %bb.0: // %entry
107+
; CHECK-NEXT: fcvtzs w8, s0
108+
; CHECK-NEXT: mov x9, x0
109+
; CHECK-NEXT: mov w0, w8
110+
; CHECK-NEXT: strb w8, [x9]
111+
; CHECK-NEXT: ret
112+
entry:
113+
%conv = fptosi float %f to i32
114+
%trunc = trunc i32 %conv to i8
115+
store i8 %trunc, ptr %dst
116+
ret i32 %conv
117+
}

llvm/test/CodeGen/AArch64/tbl-loops.ll

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -178,12 +178,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
178178
; CHECK-NEXT: fcmp s3, s1
179179
; CHECK-NEXT: fcsel s4, s1, s3, gt
180180
; CHECK-NEXT: fcmp s3, #0.0
181-
; CHECK-NEXT: fcvtzs w11, s2
181+
; CHECK-NEXT: fcvtzs s2, s2
182182
; CHECK-NEXT: fcsel s3, s0, s4, mi
183183
; CHECK-NEXT: subs w10, w10, #1
184-
; CHECK-NEXT: strb w11, [x9]
185-
; CHECK-NEXT: fcvtzs w12, s3
186-
; CHECK-NEXT: strb w12, [x9, #1]
184+
; CHECK-NEXT: str b2, [x9]
185+
; CHECK-NEXT: fcvtzs s3, s3
186+
; CHECK-NEXT: stur b3, [x9, #1]
187187
; CHECK-NEXT: add x9, x9, #2
188188
; CHECK-NEXT: b.ne .LBB1_6
189189
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
@@ -395,19 +395,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
395395
; CHECK-NEXT: fcsel s4, s1, s3, gt
396396
; CHECK-NEXT: fcmp s3, #0.0
397397
; CHECK-NEXT: ldr s3, [x8, #8]
398-
; CHECK-NEXT: fcvtzs w11, s2
398+
; CHECK-NEXT: fcvtzs s2, s2
399399
; CHECK-NEXT: add x8, x8, #12
400400
; CHECK-NEXT: fcsel s4, s0, s4, mi
401401
; CHECK-NEXT: fcmp s3, s1
402-
; CHECK-NEXT: strb w11, [x9]
402+
; CHECK-NEXT: str b2, [x9]
403403
; CHECK-NEXT: fcsel s5, s1, s3, gt
404404
; CHECK-NEXT: fcmp s3, #0.0
405-
; CHECK-NEXT: fcvtzs w12, s4
405+
; CHECK-NEXT: fcvtzs s4, s4
406406
; CHECK-NEXT: fcsel s3, s0, s5, mi
407407
; CHECK-NEXT: subs w10, w10, #1
408-
; CHECK-NEXT: strb w12, [x9, #1]
409-
; CHECK-NEXT: fcvtzs w13, s3
410-
; CHECK-NEXT: strb w13, [x9, #2]
408+
; CHECK-NEXT: stur b4, [x9, #1]
409+
; CHECK-NEXT: fcvtzs s3, s3
410+
; CHECK-NEXT: stur b3, [x9, #2]
411411
; CHECK-NEXT: add x9, x9, #3
412412
; CHECK-NEXT: b.ne .LBB2_8
413413
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
@@ -563,26 +563,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
563563
; CHECK-NEXT: fcmp s3, s1
564564
; CHECK-NEXT: fcsel s4, s1, s3, gt
565565
; CHECK-NEXT: fcmp s3, #0.0
566-
; CHECK-NEXT: fcvtzs w11, s2
566+
; CHECK-NEXT: fcvtzs s2, s2
567567
; CHECK-NEXT: ldp s3, s5, [x8, #8]
568568
; CHECK-NEXT: add x8, x8, #16
569569
; CHECK-NEXT: fcsel s4, s0, s4, mi
570570
; CHECK-NEXT: fcmp s3, s1
571-
; CHECK-NEXT: strb w11, [x9]
572-
; CHECK-NEXT: fcvtzs w12, s4
571+
; CHECK-NEXT: str b2, [x9]
572+
; CHECK-NEXT: fcvtzs s4, s4
573573
; CHECK-NEXT: fcsel s6, s1, s3, gt
574574
; CHECK-NEXT: fcmp s3, #0.0
575575
; CHECK-NEXT: fcsel s3, s0, s6, mi
576576
; CHECK-NEXT: fcmp s5, s1
577-
; CHECK-NEXT: strb w12, [x9, #1]
577+
; CHECK-NEXT: stur b4, [x9, #1]
578578
; CHECK-NEXT: fcsel s6, s1, s5, gt
579579
; CHECK-NEXT: fcmp s5, #0.0
580-
; CHECK-NEXT: fcvtzs w13, s3
581-
; CHECK-NEXT: fcsel s2, s0, s6, mi
580+
; CHECK-NEXT: fcvtzs s3, s3
581+
; CHECK-NEXT: fcsel s5, s0, s6, mi
582582
; CHECK-NEXT: subs w10, w10, #1
583-
; CHECK-NEXT: strb w13, [x9, #2]
584-
; CHECK-NEXT: fcvtzs w14, s2
585-
; CHECK-NEXT: strb w14, [x9, #3]
583+
; CHECK-NEXT: stur b3, [x9, #2]
584+
; CHECK-NEXT: fcvtzs s5, s5
585+
; CHECK-NEXT: stur b5, [x9, #3]
586586
; CHECK-NEXT: add x9, x9, #4
587587
; CHECK-NEXT: b.ne .LBB3_6
588588
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup

0 commit comments

Comments
 (0)