Skip to content

Commit cee8b25

Browse files
committed
[AArch64][SVE] Remove Redundant aarch64.sve.convert.to.svbool
Generated code resulted in redundant aarch64.sve.convert.to.svbool calls for AArch64 binary operations. Narrow the more precise operands instead of widening the less precise operands. Differential Revision: https://reviews.llvm.org/D116730
1 parent 524150f commit cee8b25

File tree

2 files changed

+205
-0
lines changed

2 files changed

+205
-0
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,12 +416,76 @@ static Optional<Instruction *> processPhiNode(InstCombiner &IC,
416416
return IC.replaceInstUsesWith(II, NPN);
417417
}
418418

419+
// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
420+
// => (binop (pred) (from_svbool _) (from_svbool _))
421+
//
422+
// The above transformation eliminates a `to_svbool` in the predicate
423+
// operand of bitwise operation `binop` by narrowing the vector width of
424+
// the operation. For example, it would convert a `<vscale x 16 x i1>
425+
// and` into a `<vscale x 4 x i1> and`. This is profitable because
426+
// to_svbool must zero the new lanes during widening, whereas
427+
// from_svbool is free.
428+
static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC,
429+
IntrinsicInst &II) {
430+
auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
431+
if (!BinOp)
432+
return None;
433+
434+
auto IntrinsicID = BinOp->getIntrinsicID();
435+
switch (IntrinsicID) {
436+
case Intrinsic::aarch64_sve_and_z:
437+
case Intrinsic::aarch64_sve_bic_z:
438+
case Intrinsic::aarch64_sve_eor_z:
439+
case Intrinsic::aarch64_sve_nand_z:
440+
case Intrinsic::aarch64_sve_nor_z:
441+
case Intrinsic::aarch64_sve_orn_z:
442+
case Intrinsic::aarch64_sve_orr_z:
443+
break;
444+
default:
445+
return None;
446+
}
447+
448+
auto BinOpPred = BinOp->getOperand(0);
449+
auto BinOpOp1 = BinOp->getOperand(1);
450+
auto BinOpOp2 = BinOp->getOperand(2);
451+
452+
auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
453+
if (!PredIntr ||
454+
PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
455+
return None;
456+
457+
auto PredOp = PredIntr->getOperand(0);
458+
auto PredOpTy = cast<VectorType>(PredOp->getType());
459+
if (PredOpTy != II.getType())
460+
return None;
461+
462+
IRBuilder<> Builder(II.getContext());
463+
Builder.SetInsertPoint(&II);
464+
465+
SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
466+
auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
467+
Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
468+
NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
469+
if (BinOpOp1 == BinOpOp2)
470+
NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
471+
else
472+
NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
473+
Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
474+
475+
auto NarrowedBinOp =
476+
Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
477+
return IC.replaceInstUsesWith(II, NarrowedBinOp);
478+
}
479+
419480
static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
420481
IntrinsicInst &II) {
421482
// If the reinterpret instruction operand is a PHI Node
422483
if (isa<PHINode>(II.getArgOperand(0)))
423484
return processPhiNode(IC, II);
424485

486+
if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
487+
return BinOpCombine;
488+
425489
SmallVector<Instruction *, 32> CandidatesForRemoval;
426490
Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
427491

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -S -instcombine < %s | FileCheck %s
3+
4+
target triple = "aarch64-unknown-linux-gnu"
5+
6+
; and.z with an nxv4i1 predicate widened via to_svbool and two distinct
; operands: expect the operands narrowed to nxv4i1 and the conversions on
; the predicate/result folded away.
define <vscale x 4 x i1> @try_combine_svbool_binop_and_0(<vscale x 4 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c) {
; CHECK-LABEL: @try_combine_svbool_binop_and_0(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[C:%.*]])
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.and.z.nxv4i1(<vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i1> [[TMP2]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[TMP3]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c)
%t3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %t2)
ret <vscale x 4 x i1> %t3
}
18+
19+
; and.z at nxv8i1 with both binop operands equal (%b, %b): the combine
; should emit only one from_svbool and use it for both operands.
define <vscale x 8 x i1> @try_combine_svbool_binop_and_1(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_and_1(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.and.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
30+
31+
; Same as and_1 but at nxv4i1: the binop is narrowed to the nxv4i1 width of
; the original predicate.
define <vscale x 4 x i1> @try_combine_svbool_binop_and_2(<vscale x 4 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_and_2(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.and.z.nxv4i1(<vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %t2)
ret <vscale x 4 x i1> %t3
}
42+
43+
; Same as and_1 but at nxv2i1, the narrowest predicate element width.
define <vscale x 2 x i1> @try_combine_svbool_binop_and_3(<vscale x 2 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_and_3(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.and.z.nxv2i1(<vscale x 2 x i1> [[A:%.*]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 2 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %t2)
ret <vscale x 2 x i1> %t3
}
54+
55+
; bic.z variant: the combine applies to each intrinsic in the supported
; predicate-binop set, not just and.z.
define <vscale x 8 x i1> @try_combine_svbool_binop_bic(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_bic(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.bic.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.bic.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
66+
67+
; eor.z variant of the narrowing combine.
define <vscale x 8 x i1> @try_combine_svbool_binop_eor(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_eor(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.eor.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.eor.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
78+
79+
; nand.z variant of the narrowing combine.
define <vscale x 8 x i1> @try_combine_svbool_binop_nand(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_nand(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.nand.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.nand.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
90+
91+
; nor.z variant of the narrowing combine.
define <vscale x 8 x i1> @try_combine_svbool_binop_nor(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_nor(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.nor.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.nor.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
102+
103+
; orn.z variant of the narrowing combine.
define <vscale x 8 x i1> @try_combine_svbool_binop_orn(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_orn(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.orn.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.orn.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
114+
115+
; orr.z variant of the narrowing combine.
define <vscale x 8 x i1> @try_combine_svbool_binop_orr(<vscale x 8 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: @try_combine_svbool_binop_orr(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[B:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.orr.z.nxv8i1(<vscale x 8 x i1> [[A:%.*]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i1> [[TMP1]])
; CHECK-NEXT: ret <vscale x 8 x i1> [[TMP2]]
;
%t1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%t2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.orr.z.nxv16i1(<vscale x 16 x i1> %t1, <vscale x 16 x i1> %b, <vscale x 16 x i1> %b)
%t3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %t2)
ret <vscale x 8 x i1> %t3
}
126+
127+
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
128+
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
129+
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
130+
declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
131+
declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
132+
declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
133+
declare <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
134+
declare <vscale x 16 x i1> @llvm.aarch64.sve.bic.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
135+
declare <vscale x 16 x i1> @llvm.aarch64.sve.eor.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
136+
declare <vscale x 16 x i1> @llvm.aarch64.sve.nand.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
137+
declare <vscale x 16 x i1> @llvm.aarch64.sve.nor.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
138+
declare <vscale x 16 x i1> @llvm.aarch64.sve.orn.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
139+
declare <vscale x 16 x i1> @llvm.aarch64.sve.orr.z.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
140+
141+
attributes #0 = { "target-features"="+sve" }

0 commit comments

Comments
 (0)