[DAGCombiner] Fold ty1 extract_vector(ty2 splat(V)) -> ty1 splat(V)

sdesmalen-arm · sdesmalen-arm · commit ec462325178a · 2022-02-09T14:30:01.000Z
This seems like an obvious fold, which leads to a few improvements. Reviewed By: david-arm Differential Revision: https://reviews.llvm.org/D118920
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21109,6 +21109,11 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
     }
   }
 
+  // ty1 extract_vector(ty2 splat(V))) -> ty1 splat(V)
+  if (V.getOpcode() == ISD::SPLAT_VECTOR)
+    if (DAG.isConstantValueOfAnyType(V.getOperand(0)) || V.hasOneUse())
+      return DAG.getSplatVector(NVT, SDLoc(N), V.getOperand(0));
+
   // Try to move vector bitcast after extract_subv by scaling extraction index:
   // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
   if (V.getOpcode() == ISD::BITCAST &&
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -424,6 +424,44 @@ entry:
   ret <4 x i32> %out
 }
 
+;
+; Extract fixed-width vector from a scalable vector splat.
+;
+
+define <2 x float> @extract_v2f32_nxv4f32_splat(float %f) {
+; CHECK-LABEL: extract_v2f32_nxv4f32_splat:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 4 x float> poison, float %f, i32 0
+  %splat = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  %ext = call <2 x float> @llvm.experimental.vector.extract.v2f32.nxv4f32(<vscale x 4 x float> %splat, i64 0)
+  ret <2 x float> %ext
+}
+
+define <2 x float> @extract_v2f32_nxv4f32_splat_const() {
+; CHECK-LABEL: extract_v2f32_nxv4f32_splat_const:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov v0.2s, #1.00000000
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 4 x float> poison, float 1.0, i32 0
+  %splat = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  %ext = call <2 x float> @llvm.experimental.vector.extract.v2f32.nxv4f32(<vscale x 4 x float> %splat, i64 0)
+  ret <2 x float> %ext
+}
+
+define <4 x i32> @extract_v4i32_nxv8i32_splat_const() {
+; CHECK-LABEL: extract_v4i32_nxv8i32_splat_const:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 8 x i32> poison, i32 1, i32 0
+  %splat = shufflevector <vscale x 8 x i32> %ins, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %ext = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv8i32(<vscale x 8 x i32> %splat, i64 0)
+  ret <4 x i32> %ext
+}
+
 attributes #0 = { vscale_range(2,2) }
 attributes #1 = { vscale_range(8,8) }
 
@@ -442,3 +480,5 @@ declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv4i8(<vscale x 4 x i
 declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv2i8(<vscale x 2 x i8>, i64)
 
 declare <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64>, i64)
+declare <2 x float> @llvm.experimental.vector.extract.v2f32.nxv4f32(<vscale x 4 x float>, i64)
+declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv8i32(<vscale x 8 x i32>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
@@ -1014,3 +1014,63 @@ define <vscale x 4 x bfloat> @extract_nxv4bf16_nxv16bf16_12(<vscale x 16 x bfloa
 
 declare <vscale x 4 x bfloat> @llvm.experimental.vector.extract.nxv4bf16.nxv16bf16(<vscale x 16 x bfloat>, i64)
 
+
+;
+; Extract from a splat
+;
+define <vscale x 2 x float> @extract_nxv2f32_nxv4f32_splat(float %f) {
+; CHECK-LABEL: extract_nxv2f32_nxv4f32_splat:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 4 x float> poison, float %f, i32 0
+  %splat = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  %ext = call <vscale x 2 x float> @llvm.experimental.vector.extract.nxv2f32.nxv4f32(<vscale x 4 x float> %splat, i64 0)
+  ret <vscale x 2 x float> %ext
+}
+
+define <vscale x 2 x float> @extract_nxv2f32_nxv4f32_splat_const() {
+; CHECK-LABEL: extract_nxv2f32_nxv4f32_splat_const:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z0.s, #1.00000000
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 4 x float> poison, float 1.0, i32 0
+  %splat = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  %ext = call <vscale x 2 x float> @llvm.experimental.vector.extract.nxv2f32.nxv4f32(<vscale x 4 x float> %splat, i64 0)
+  ret <vscale x 2 x float> %ext
+}
+
+define <vscale x 4 x i32> @extract_nxv4i32_nxv8i32_splat_const() {
+; CHECK-LABEL: extract_nxv4i32_nxv8i32_splat_const:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #1 // =0x1
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 8 x i32> poison, i32 1, i32 0
+  %splat = shufflevector <vscale x 8 x i32> %ins, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+  %ext = call <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %splat, i64 0)
+  ret <vscale x 4 x i32> %ext
+}
+
+define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_all_ones() {
+; CHECK-LABEL: extract_nxv2i1_nxv16i1_all_ones:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 16 x i1> poison, i1 1, i32 0
+  %splat = shufflevector <vscale x 16 x i1> %ins, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %ext = call <vscale x 2 x i1> @llvm.experimental.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1> %splat, i64 0)
+  ret <vscale x 2 x i1> %ext
+}
+
+define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_all_zero() {
+; CHECK-LABEL: extract_nxv2i1_nxv16i1_all_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p0.b
+; CHECK-NEXT:    ret
+  %ext = call <vscale x 2 x i1> @llvm.experimental.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1> zeroinitializer, i64 0)
+  ret <vscale x 2 x i1> %ext
+}
+
+declare <vscale x 2 x float> @llvm.experimental.vector.extract.nxv2f32.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 4 x i32> @llvm.experimental.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -554,10 +554,7 @@ define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_into_zero(<vscale x 4 x i1> %sv
 ; CHECK-LABEL: insert_nxv16i1_nxv4i1_into_zero:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    pfalse p1.b
-; CHECK-NEXT:    punpklo p2.h, p1.b
-; CHECK-NEXT:    punpkhi p1.h, p1.b
-; CHECK-NEXT:    punpkhi p2.h, p2.b
-; CHECK-NEXT:    uzp1 p0.h, p0.h, p2.h
+; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
 ; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
 ; CHECK-NEXT:    ret
   %v0 = call <vscale x 16 x i1> @llvm.experimental.vector.insert.nx16i1.nxv4i1(<vscale x 16 x i1> zeroinitializer, <vscale x 4 x i1> %sv, i64 0)
diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
@@ -134,11 +134,10 @@ define void @store_nxv6f32(<vscale x 6 x float>* %out) {
 ; CHECK-LABEL: store_nxv6f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov z0.s, #1.00000000
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
-; CHECK-NEXT:    st1w { z1.d }, p1, [x0, #2, mul vl]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p1, [x0]
 ; CHECK-NEXT:    ret
   %ins = insertelement <vscale x 6 x float> undef, float 1.0, i32 0
   %splat = shufflevector <vscale x 6 x float> %ins, <vscale x 6 x float> undef, <vscale x 6 x i32> zeroinitializer
@@ -150,11 +149,10 @@ define void @store_nxv12f16(<vscale x 12 x half>* %out) {
 ; CHECK-LABEL: store_nxv12f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov z0.h, #1.00000000
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    uunpklo z1.s, z0.h
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
-; CHECK-NEXT:    st1h { z1.s }, p1, [x0, #2, mul vl]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p1, [x0]
 ; CHECK-NEXT:    ret
   %ins = insertelement <vscale x 12 x half> undef, half 1.0, i32 0
   %splat = shufflevector <vscale x 12 x half> %ins, <vscale x 12 x half> undef, <vscale x 12 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -1572,33 +1572,26 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
   ret <vscale x 32 x i32> %v
 }
 
-; FIXME: We don't catch this as unmasked.
-
 define <vscale x 32 x i32> @vadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vadd_vi_nxv32i32_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a4, a1, 2
-; CHECK-NEXT:    vsetvli a3, zero, e8, m4, ta, mu
-; CHECK-NEXT:    vmset.m v24
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, mu
 ; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vslidedown.vx v0, v24, a4
-; CHECK-NEXT:    bltu a0, a3, .LBB119_2
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:    bltu a0, a1, .LBB119_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB119_2:
+; CHECK-NEXT:    li a3, 0
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    sub a1, a0, a1
+; CHECK-NEXT:    vadd.vi v8, v8, -1
 ; CHECK-NEXT:    bltu a0, a1, .LBB119_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:  .LBB119_4:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, mu
+; CHECK-NEXT:    vadd.vi v16, v16, -1
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
   %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer

Original file line number	Diff line number	Diff line change
`@@ -21109,6 +21109,11 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {`
`21109`	`21109`	`}`
`21110`	`21110`	`}`
`21111`	`21111`
	`21112`	`+ // ty1 extract_vector(ty2 splat(V))) -> ty1 splat(V)`
	`21113`	`+ if (V.getOpcode() == ISD::SPLAT_VECTOR)`
	`21114`	`+ if (DAG.isConstantValueOfAnyType(V.getOperand(0)) \|\| V.hasOneUse())`
	`21115`	`+ return DAG.getSplatVector(NVT, SDLoc(N), V.getOperand(0));`
	`21116`	`+`
`21112`	`21117`	`// Try to move vector bitcast after extract_subv by scaling extraction index:`
`21113`	`21118`	`// extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')`
`21114`	`21119`	`if (V.getOpcode() == ISD::BITCAST &&`