Skip to content

Commit 232953f

Browse files
overmighty authored and davemgreen committed
[AArch64] Add pattern for SQDML*Lv1i32_indexed
There was no pattern to fold into these instructions. This patch adds the pattern obtained from the following ACLE intrinsics so that they generate sqdmlal/sqdmlsl instructions instead of separate sqdmull and sqadd/sqsub instructions:

- vqdmlalh_s16, vqdmlslh_s16
- vqdmlalh_lane_s16, vqdmlalh_laneq_s16, vqdmlslh_lane_s16, vqdmlslh_laneq_s16 (when the lane index is 0)

It also modifies the result of the existing pattern for the latter, when the lane index is not 0, to use the v1i32_indexed instructions instead of the v4i16_indexed ones.

Fixes llvm#49997.

Differential Revision: https://reviews.llvm.org/D131700
1 parent d999348 commit 232953f

File tree

2 files changed

+93
-16
lines changed

2 files changed

+93
-16
lines changed

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 25 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -8847,20 +8847,6 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
88478847
let Inst{20} = idx{0};
88488848
}
88498849

8850-
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
8851-
// intermediate EXTRACT_SUBREG would be untyped.
8852-
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
8853-
(i32 (vector_extract (v4i32
8854-
(int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
8855-
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
8856-
VectorIndexH:$idx)))),
8857-
(i64 0))))),
8858-
(EXTRACT_SUBREG
8859-
(!cast<Instruction>(NAME # v4i16_indexed)
8860-
(SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
8861-
V128_lo:$Rm, VectorIndexH:$idx),
8862-
ssub)>;
8863-
88648850
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
88658851
V128, V128,
88668852
V128_lo, VectorIndexH,
@@ -8914,6 +8900,31 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
89148900
let Inst{20} = idx{0};
89158901
}
89168902

8903+
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
8904+
(i32 (vector_extract
8905+
(v4i32 (int_aarch64_neon_sqdmull
8906+
(v4i16 V64:$Rn),
8907+
(v4i16 V64:$Rm))),
8908+
(i64 0))))),
8909+
(!cast<Instruction>(NAME # v1i32_indexed)
8910+
FPR32Op:$Rd,
8911+
(EXTRACT_SUBREG V64:$Rn, hsub),
8912+
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
8913+
(i64 0))>;
8914+
8915+
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
8916+
(i32 (vector_extract
8917+
(v4i32 (int_aarch64_neon_sqdmull
8918+
(v4i16 V64:$Rn),
8919+
(v4i16 (AArch64duplane16
8920+
(v8i16 V128_lo:$Rm),
8921+
VectorIndexH:$idx)))),
8922+
(i64 0))))),
8923+
(!cast<Instruction>(NAME # v1i32_indexed)
8924+
FPR32Op:$Rd,
8925+
(EXTRACT_SUBREG V64:$Rn, hsub),
8926+
V128_lo:$Rm,
8927+
VectorIndexH:$idx)>;
89178928

89188929
def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
89198930
FPR64Op, FPR32Op, V128, VectorIndexS,

llvm/test/CodeGen/AArch64/arm64-vmul.ll

Lines changed: 68 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1619,7 +1619,7 @@ define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
16191619
; CHECK-NEXT: fmov s1, w1
16201620
; CHECK-NEXT: fmov s2, w0
16211621
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1622-
; CHECK-NEXT: sqdmlal.4s v2, v1, v0[1]
1622+
; CHECK-NEXT: sqdmlal.h s2, h1, v0[1]
16231623
; CHECK-NEXT: fmov w0, s2
16241624
; CHECK-NEXT: ret
16251625
%lhs = insertelement <4 x i16> undef, i16 %B, i32 0
@@ -1637,7 +1637,7 @@ define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
16371637
; CHECK-NEXT: fmov s1, w1
16381638
; CHECK-NEXT: fmov s2, w0
16391639
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1640-
; CHECK-NEXT: sqdmlsl.4s v2, v1, v0[1]
1640+
; CHECK-NEXT: sqdmlsl.h s2, h1, v0[1]
16411641
; CHECK-NEXT: fmov w0, s2
16421642
; CHECK-NEXT: ret
16431643
%lhs = insertelement <4 x i16> undef, i16 %B, i32 0
@@ -1649,6 +1649,38 @@ define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
16491649
}
16501650
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
16511651

1652+
define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
1653+
; CHECK-LABEL: sqadd_lane1_sqdmull4s:
1654+
; CHECK: // %bb.0:
1655+
; CHECK-NEXT: sqdmull.4s v0, v0, v1
1656+
; CHECK-NEXT: fmov s1, w0
1657+
; CHECK-NEXT: mov.s w8, v0[1]
1658+
; CHECK-NEXT: fmov s0, w8
1659+
; CHECK-NEXT: sqadd s0, s1, s0
1660+
; CHECK-NEXT: fmov w0, s0
1661+
; CHECK-NEXT: ret
1662+
%prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
1663+
%prod = extractelement <4 x i32> %prod.vec, i32 1
1664+
%res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
1665+
ret i32 %res
1666+
}
1667+
1668+
define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
1669+
; CHECK-LABEL: sqsub_lane1_sqdmull4s:
1670+
; CHECK: // %bb.0:
1671+
; CHECK-NEXT: sqdmull.4s v0, v0, v1
1672+
; CHECK-NEXT: fmov s1, w0
1673+
; CHECK-NEXT: mov.s w8, v0[1]
1674+
; CHECK-NEXT: fmov s0, w8
1675+
; CHECK-NEXT: sqsub s0, s1, s0
1676+
; CHECK-NEXT: fmov w0, s0
1677+
; CHECK-NEXT: ret
1678+
%prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
1679+
%prod = extractelement <4 x i32> %prod.vec, i32 1
1680+
%res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
1681+
ret i32 %res
1682+
}
1683+
16521684
define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
16531685
; CHECK-LABEL: sqdmlal_lane_1d:
16541686
; CHECK: // %bb.0:
@@ -2894,6 +2926,23 @@ define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind
28942926
ret <1 x double> %prod
28952927
}
28962928

2929+
define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
2930+
; CHECK-LABEL: sqdmlal_s:
2931+
; CHECK: // %bb.0:
2932+
; CHECK-NEXT: fmov s0, w1
2933+
; CHECK-NEXT: fmov s1, w0
2934+
; CHECK-NEXT: fmov s2, w2
2935+
; CHECK-NEXT: sqdmlal.h s2, h1, v0[0]
2936+
; CHECK-NEXT: fmov w0, s2
2937+
; CHECK-NEXT: ret
2938+
%tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
2939+
%tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
2940+
%tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
2941+
%tmp4 = extractelement <4 x i32> %tmp3, i64 0
2942+
%tmp5 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %C, i32 %tmp4)
2943+
ret i32 %tmp5
2944+
}
2945+
28972946
define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
28982947
; CHECK-LABEL: sqdmlal_d:
28992948
; CHECK: // %bb.0:
@@ -2908,6 +2957,23 @@ define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
29082957
ret i64 %tmp5
29092958
}
29102959

2960+
define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
2961+
; CHECK-LABEL: sqdmlsl_s:
2962+
; CHECK: // %bb.0:
2963+
; CHECK-NEXT: fmov s0, w1
2964+
; CHECK-NEXT: fmov s1, w0
2965+
; CHECK-NEXT: fmov s2, w2
2966+
; CHECK-NEXT: sqdmlsl.h s2, h1, v0[0]
2967+
; CHECK-NEXT: fmov w0, s2
2968+
; CHECK-NEXT: ret
2969+
%tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
2970+
%tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
2971+
%tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
2972+
%tmp4 = extractelement <4 x i32> %tmp3, i64 0
2973+
%tmp5 = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %C, i32 %tmp4)
2974+
ret i32 %tmp5
2975+
}
2976+
29112977
define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
29122978
; CHECK-LABEL: sqdmlsl_d:
29132979
; CHECK: // %bb.0:

0 commit comments

Comments
 (0)