[InstCombine] fold mul with masked low bit operand to trunc+select

rotateright · rotateright · commit 3f33d67d8a5c · 2022-06-05T20:07:18.000-04:00
https://alive2.llvm.org/ce/z/o7rQ5q This shows an extra instruction in some cases, but that is caused by an existing canonicalization of trunc -> and+icmp. Codegen should be better for any target where a multiply is more costly than the most simple ALU op. This ends up producing the requested x86 asm from issue #55618, but it's not the same IR. We are missing a canonicalization from the negate+mask pattern to the trunc+select created here.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -390,6 +390,12 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
     return SelectInst::Create(IsNeg, Y, ConstantInt::getNullValue(Ty));
   }
 
+  // (and X, 1) * Y --> (trunc X) ? Y : 0
+  if (match(&I, m_c_BinOp(m_OneUse(m_And(m_Value(X), m_One())), m_Value(Y)))) {
+    Value *Tr = Builder.CreateTrunc(X, CmpInst::makeCmpResultType(Ty));
+    return SelectInst::Create(Tr, Y, ConstantInt::getNullValue(Ty));
+  }
+
   // ((ashr X, 31) | 1) * X --> abs(X)
   // X * ((ashr X, 31) | 1) --> abs(X)
   if (match(&I, m_c_BinOp(m_Or(m_AShr(m_Value(X),
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul-and.ll b/llvm/test/Transforms/InstCombine/icmp-mul-and.ll
@@ -267,10 +267,11 @@ define i1 @pr51551_neg1(i32 %x, i32 %y) {
 
 define i1 @pr51551_neg2(i32 %x, i32 %y) {
 ; CHECK-LABEL: @pr51551_neg2(
-; CHECK-NEXT:    [[T0:%.*]] = and i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i32 [[T0]], [[X:%.*]]
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[MUL]], 7
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[Y:%.*]], 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    [[X_OP:%.*]] = and i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[X_OP]], 0
+; CHECK-NEXT:    [[CMP:%.*]] = select i1 [[DOTNOT]], i1 true, i1 [[CMP1]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %t0 = and i32 %y, -7
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll
@@ -749,16 +749,11 @@ define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) {
   ret i1 %r
 }
 
-; negative test - no leading zeros for 's'
-; TODO: If analysis was generalized for sign bits, we could reduce this to false.
+; no leading zeros for 's', but we reduce this with other transforms
 
 define i1 @mul_of_bool_no_lz_other_op(i32 %x, i8 %y) {
 ; CHECK-LABEL: @mul_of_bool_no_lz_other_op(
-; CHECK-NEXT:    [[B:%.*]] = and i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[S:%.*]] = sext i8 [[Y:%.*]] to i32
-; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[B]], [[S]]
-; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[M]], 127
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %b = and i32 %x, 1
   %s = sext i8 %y to i32
diff --git a/llvm/test/Transforms/InstCombine/mul-masked-bits.ll b/llvm/test/Transforms/InstCombine/mul-masked-bits.ll
@@ -80,8 +80,8 @@ define <4 x i32> @combine_mul_self_demandedbits_vector(<4 x i32> %x) {
 
 define i8 @one_demanded_bit(i8 %x) {
 ; CHECK-LABEL: @one_demanded_bit(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i8 [[X:%.*]], 6
-; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP1]], -65
+; CHECK-NEXT:    [[M:%.*]] = shl i8 [[X:%.*]], 6
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[M]], -65
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %m = mul i8 %x, 192  ; 0b1100_0000
@@ -91,8 +91,8 @@ define i8 @one_demanded_bit(i8 %x) {
 
 define <2 x i8> @one_demanded_bit_splat(<2 x i8> %x) {
 ; CHECK-LABEL: @one_demanded_bit_splat(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i8> [[X:%.*]], <i8 5, i8 5>
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[TMP1]], <i8 32, i8 32>
+; CHECK-NEXT:    [[M:%.*]] = shl <2 x i8> [[X:%.*]], <i8 5, i8 5>
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[M]], <i8 32, i8 32>
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %m = mul <2 x i8> %x, <i8 160, i8 160> ; 0b1010_0000
@@ -201,9 +201,10 @@ define i64 @scalar_mul_bit_x0_y0_uses(i64 %x, i64 %y) {
 ; Negative test
 define i64 @scalar_mul_bit_x0_y1(i64 %x, i64 %y) {
 ; CHECK-LABEL: @scalar_mul_bit_x0_y1(
-; CHECK-NEXT:    [[AND1:%.*]] = and i64 [[X:%.*]], 1
 ; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[Y:%.*]], 2
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[AND1]], [[AND2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[DOTNOT]], i64 0, i64 [[AND2]]
 ; CHECK-NEXT:    ret i64 [[MUL]]
 ;
   %and1 = and i64 %x, 1
@@ -214,9 +215,10 @@ define i64 @scalar_mul_bit_x0_y1(i64 %x, i64 %y) {
 
 define i64 @scalar_mul_bit_x0_yC(i64 %x, i64 %y, i64 %c) {
 ; CHECK-LABEL: @scalar_mul_bit_x0_yC(
-; CHECK-NEXT:    [[AND1:%.*]] = and i64 [[X:%.*]], 1
 ; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[Y:%.*]], [[C:%.*]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i64 [[AND1]], [[AND2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[DOTNOT]], i64 0, i64 [[AND2]]
 ; CHECK-NEXT:    ret i64 [[MUL]]
 ;
   %and1 = and i64 %x, 1
diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll
@@ -466,8 +466,9 @@ define <2 x i32> @signbit_mul_vec_commute(<2 x i32> %a, <2 x i32> %b) {
 
 define i32 @lowbit_mul(i32 %a, i32 %b) {
 ; CHECK-LABEL: @lowbit_mul(
-; CHECK-NEXT:    [[D:%.*]] = and i32 [[A:%.*]], 1
-; CHECK-NEXT:    [[E:%.*]] = mul nuw i32 [[D]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[A:%.*]], 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    [[E:%.*]] = select i1 [[DOTNOT]], i32 0, i32 [[B:%.*]]
 ; CHECK-NEXT:    ret i32 [[E]]
 ;
   %d = and i32 %a, 1
@@ -480,8 +481,8 @@ define i32 @lowbit_mul(i32 %a, i32 %b) {
 define <2 x i17> @lowbit_mul_commute(<2 x i17> %a, <2 x i17> %p) {
 ; CHECK-LABEL: @lowbit_mul_commute(
 ; CHECK-NEXT:    [[B:%.*]] = xor <2 x i17> [[P:%.*]], <i17 42, i17 43>
-; CHECK-NEXT:    [[D:%.*]] = and <2 x i17> [[A:%.*]], <i17 1, i17 1>
-; CHECK-NEXT:    [[E:%.*]] = mul nuw <2 x i17> [[B]], [[D]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i17> [[A:%.*]] to <2 x i1>
+; CHECK-NEXT:    [[E:%.*]] = select <2 x i1> [[TMP1]], <2 x i17> [[B]], <2 x i17> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i17> [[E]]
 ;
   %b = xor <2 x i17> %p, <i17 42, i17 43> ; thwart complexity-based canonicalization
@@ -490,6 +491,8 @@ define <2 x i17> @lowbit_mul_commute(<2 x i17> %a, <2 x i17> %p) {
   ret <2 x i17> %e
 }
 
+; negative test - extra use
+
 define i32 @lowbit_mul_use(i32 %a, i32 %b) {
 ; CHECK-LABEL: @lowbit_mul_use(
 ; CHECK-NEXT:    [[D:%.*]] = and i32 [[A:%.*]], 1
@@ -503,6 +506,8 @@ define i32 @lowbit_mul_use(i32 %a, i32 %b) {
   ret i32 %e
 }
 
+; negative test - wrong mask
+
 define i32 @not_lowbit_mul(i32 %a, i32 %b) {
 ; CHECK-LABEL: @not_lowbit_mul(
 ; CHECK-NEXT:    [[D:%.*]] = and i32 [[A:%.*]], 2
diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll
@@ -1499,8 +1499,8 @@ define i32 @mul_no_common_bits_const_op(i32 %p) {
 
 define <2 x i12> @mul_no_common_bits_commute(<2 x i12> %p) {
 ; CHECK-LABEL: @mul_no_common_bits_commute(
-; CHECK-NEXT:    [[X:%.*]] = and <2 x i12> [[P:%.*]], <i12 1, i12 1>
-; CHECK-NEXT:    [[R:%.*]] = mul nuw nsw <2 x i12> [[X]], <i12 15, i12 17>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i12> [[P:%.*]] to <2 x i1>
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[TMP1]], <2 x i12> <i12 15, i12 17>, <2 x i12> zeroinitializer
 ; CHECK-NEXT:    ret <2 x i12> [[R]]
 ;
   %x = and <2 x i12> %p, <i12 1, i12 1>