@@ -1619,7 +1619,7 @@ define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1619
1619
; CHECK-NEXT: fmov s1, w1
1620
1620
; CHECK-NEXT: fmov s2, w0
1621
1621
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1622
- ; CHECK-NEXT: sqdmlal.4s v2, v1 , v0[1]
1622
+ ; CHECK-NEXT: sqdmlal.h s2, h1 , v0[1]
1623
1623
; CHECK-NEXT: fmov w0, s2
1624
1624
; CHECK-NEXT: ret
1625
1625
%lhs = insertelement <4 x i16 > undef , i16 %B , i32 0
@@ -1637,7 +1637,7 @@ define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1637
1637
; CHECK-NEXT: fmov s1, w1
1638
1638
; CHECK-NEXT: fmov s2, w0
1639
1639
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1640
- ; CHECK-NEXT: sqdmlsl.4s v2, v1 , v0[1]
1640
+ ; CHECK-NEXT: sqdmlsl.h s2, h1 , v0[1]
1641
1641
; CHECK-NEXT: fmov w0, s2
1642
1642
; CHECK-NEXT: ret
1643
1643
%lhs = insertelement <4 x i16 > undef , i16 %B , i32 0
@@ -1649,6 +1649,38 @@ define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
1649
1649
}
1650
1650
declare i32 @llvm.aarch64.neon.sqsub.i32 (i32 , i32 )
1651
1651
1652
; Test: scalar sqadd of %A with lane 1 of a full four-lane sqdmull product.
; Both <4 x i16> arguments are passed to the multiply unmodified, and the
; extract index is 1. The expected output listed below keeps three separate
; steps: a vector sqdmull.4s, a lane move through w8, and a scalar sqadd.
; No sqdmlal instruction appears in it.
; NOTE(review): presumably a negative test that keeps the scalar sqdmlal
; selection from firing on a non-zero lane of a genuine vector multiply;
; confirm against the backend pattern this test accompanies.
+ define i32 @sqadd_lane1_sqdmull4s (i32 %A , <4 x i16 > %B , <4 x i16 > %C ) nounwind {
1653
+ ; CHECK-LABEL: sqadd_lane1_sqdmull4s:
1654
+ ; CHECK: // %bb.0:
1655
+ ; CHECK-NEXT: sqdmull.4s v0, v0, v1
1656
+ ; CHECK-NEXT: fmov s1, w0
1657
+ ; CHECK-NEXT: mov.s w8, v0[1]
1658
+ ; CHECK-NEXT: fmov s0, w8
1659
+ ; CHECK-NEXT: sqadd s0, s1, s0
1660
+ ; CHECK-NEXT: fmov w0, s0
1661
+ ; CHECK-NEXT: ret
1662
; Multiply all four lanes, then use only lane 1 of the product as the addend.
+ %prod.vec = call <4 x i32 > @llvm.aarch64.neon.sqdmull.v4i32 (<4 x i16 > %B , <4 x i16 > %C )
1663
+ %prod = extractelement <4 x i32 > %prod.vec , i32 1
1664
; %A is the first operand of the saturating add, the extracted lane the second.
+ %res = call i32 @llvm.aarch64.neon.sqadd.i32 (i32 %A , i32 %prod )
1665
+ ret i32 %res
1666
+ }
1667
+
1668
; Test: scalar sqsub of lane 1 of a full four-lane sqdmull product from %A.
; Both <4 x i16> arguments are passed to the multiply unmodified, and the
; extract index is 1. The expected output listed below keeps three separate
; steps: a vector sqdmull.4s, a lane move through w8, and a scalar sqsub.
; No sqdmlsl instruction appears in it.
; NOTE(review): presumably a negative test that keeps the scalar sqdmlsl
; selection from firing on a non-zero lane of a genuine vector multiply;
; confirm against the backend pattern this test accompanies.
+ define i32 @sqsub_lane1_sqdmull4s (i32 %A , <4 x i16 > %B , <4 x i16 > %C ) nounwind {
1669
+ ; CHECK-LABEL: sqsub_lane1_sqdmull4s:
1670
+ ; CHECK: // %bb.0:
1671
+ ; CHECK-NEXT: sqdmull.4s v0, v0, v1
1672
+ ; CHECK-NEXT: fmov s1, w0
1673
+ ; CHECK-NEXT: mov.s w8, v0[1]
1674
+ ; CHECK-NEXT: fmov s0, w8
1675
+ ; CHECK-NEXT: sqsub s0, s1, s0
1676
+ ; CHECK-NEXT: fmov w0, s0
1677
+ ; CHECK-NEXT: ret
1678
; Multiply all four lanes, then use only lane 1 of the product as the subtrahend.
+ %prod.vec = call <4 x i32 > @llvm.aarch64.neon.sqdmull.v4i32 (<4 x i16 > %B , <4 x i16 > %C )
1679
+ %prod = extractelement <4 x i32 > %prod.vec , i32 1
1680
; %A is the first operand of the saturating subtract, the extracted lane the second.
+ %res = call i32 @llvm.aarch64.neon.sqsub.i32 (i32 %A , i32 %prod )
1681
+ ret i32 %res
1682
+ }
1683
+
1652
1684
define i64 @sqdmlal_lane_1d (i64 %A , i32 %B , <2 x i32 > %C ) nounwind {
1653
1685
; CHECK-LABEL: sqdmlal_lane_1d:
1654
1686
; CHECK: // %bb.0:
@@ -2894,6 +2926,23 @@ define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind
2894
2926
ret <1 x double > %prod
2895
2927
}
2896
2928
2929
; Test: i16 x i16 sqdmull whose lane-0 result is accumulated into %C via sqadd.
; %A and %B are each inserted into lane 0 of an otherwise-undef <4 x i16>,
; the two vectors go through sqdmull.v4i32, and lane 0 of the product is
; combined with %C by sqadd.i32. The expected output listed below is a single
; sqdmlal.h on s/h registers with a v0[0] lane operand, surrounded only by
; fmov instructions that move the arguments in from w0-w2 and the result
; back out to w0.
+ define i32 @sqdmlal_s (i16 %A , i16 %B , i32 %C ) nounwind {
2930
+ ; CHECK-LABEL: sqdmlal_s:
2931
+ ; CHECK: // %bb.0:
2932
+ ; CHECK-NEXT: fmov s0, w1
2933
+ ; CHECK-NEXT: fmov s1, w0
2934
+ ; CHECK-NEXT: fmov s2, w2
2935
+ ; CHECK-NEXT: sqdmlal.h s2, h1, v0[0]
2936
+ ; CHECK-NEXT: fmov w0, s2
2937
+ ; CHECK-NEXT: ret
2938
; Only lane 0 of each input vector is defined; the remaining lanes stay undef.
+ %tmp1 = insertelement <4 x i16 > undef , i16 %A , i64 0
2939
+ %tmp2 = insertelement <4 x i16 > undef , i16 %B , i64 0
2940
+ %tmp3 = tail call <4 x i32 > @llvm.aarch64.neon.sqdmull.v4i32 (<4 x i16 > %tmp1 , <4 x i16 > %tmp2 )
2941
; Lane 0 is the only product lane computed from defined inputs.
+ %tmp4 = extractelement <4 x i32 > %tmp3 , i64 0
2942
; The accumulator %C is the first sqadd operand, the product the second.
+ %tmp5 = tail call i32 @llvm.aarch64.neon.sqadd.i32 (i32 %C , i32 %tmp4 )
2943
+ ret i32 %tmp5
2944
+ }
2945
+
2897
2946
define i64 @sqdmlal_d (i32 %A , i32 %B , i64 %C ) nounwind {
2898
2947
; CHECK-LABEL: sqdmlal_d:
2899
2948
; CHECK: // %bb.0:
@@ -2908,6 +2957,23 @@ define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
2908
2957
ret i64 %tmp5
2909
2958
}
2910
2959
2960
; Test: i16 x i16 sqdmull whose lane-0 result is subtracted from %C via sqsub.
; %A and %B are each inserted into lane 0 of an otherwise-undef <4 x i16>,
; the two vectors go through sqdmull.v4i32, and lane 0 of the product is
; combined with %C by sqsub.i32. The expected output listed below is a single
; sqdmlsl.h on s/h registers with a v0[0] lane operand, surrounded only by
; fmov instructions that move the arguments in from w0-w2 and the result
; back out to w0.
+ define i32 @sqdmlsl_s (i16 %A , i16 %B , i32 %C ) nounwind {
2961
+ ; CHECK-LABEL: sqdmlsl_s:
2962
+ ; CHECK: // %bb.0:
2963
+ ; CHECK-NEXT: fmov s0, w1
2964
+ ; CHECK-NEXT: fmov s1, w0
2965
+ ; CHECK-NEXT: fmov s2, w2
2966
+ ; CHECK-NEXT: sqdmlsl.h s2, h1, v0[0]
2967
+ ; CHECK-NEXT: fmov w0, s2
2968
+ ; CHECK-NEXT: ret
2969
; Only lane 0 of each input vector is defined; the remaining lanes stay undef.
+ %tmp1 = insertelement <4 x i16 > undef , i16 %A , i64 0
2970
+ %tmp2 = insertelement <4 x i16 > undef , i16 %B , i64 0
2971
+ %tmp3 = tail call <4 x i32 > @llvm.aarch64.neon.sqdmull.v4i32 (<4 x i16 > %tmp1 , <4 x i16 > %tmp2 )
2972
; Lane 0 is the only product lane computed from defined inputs.
+ %tmp4 = extractelement <4 x i32 > %tmp3 , i64 0
2973
; The accumulator %C is the first sqsub operand, the product the second.
+ %tmp5 = tail call i32 @llvm.aarch64.neon.sqsub.i32 (i32 %C , i32 %tmp4 )
2974
+ ret i32 %tmp5
2975
+ }
2976
+
2911
2977
define i64 @sqdmlsl_d (i32 %A , i32 %B , i64 %C ) nounwind {
2912
2978
; CHECK-LABEL: sqdmlsl_d:
2913
2979
; CHECK: // %bb.0:
0 commit comments