[AArch64] Combine ISD::AND into AArch64ISD::ANDS

davemgreen · davemgreen · commit 0b6df40c52ac · 2022-03-17T09:44:11.000Z
If we already have an AArch64ISD::ANDS node with identical operands, we can merge any ISD::AND into it, reducing the instruction count by calculating the value and the flags in a single operation. This code is taken from the X86 backend, and could also handle AArch64ISD::ADDS and AArch64ISD::SUBS, but I couldn't find any test cases where it came up. Differential Revision: https://reviews.llvm.org/D118584
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17351,6 +17351,31 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+// Combines for the flag-setting (S) forms of generic opcodes, e.g. merging an
+// ISD::AND into an identical AArch64ISD::ANDS. NOTE: This could be used for
+// ADDS and SUBS too, if we can find test cases.
+static SDValue performANDSCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  const SDValue Ops[] = {N->getOperand(0), N->getOperand(1)};
+  const EVT VT = N->getValueType(0);
+
+  // Flags are used: fold any existing ISD::AND of the same operands into
+  // result 0 of this node, then return an empty SDValue.
+  if (N->hasAnyUseOfValue(1)) {
+    if (SDNode *PlainAnd =
+            DAG.getNodeIfExists(ISD::AND, DAG.getVTList(VT), Ops))
+      DCI.CombineTo(PlainAnd, SDValue(N, 0));
+    return SDValue();
+  }
+
+  // Flags are unused: fall back to the generic opcode and supply an i32 zero
+  // constant as a stand-in for the flag result that nothing reads.
+  SDLoc DL(N);
+  SDValue Res = DAG.getNode(ISD::AND, DL, VT, Ops[0], Ops[1]);
+  return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
+}
+
 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
   // setcc_merge_zero pred
   //   (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
@@ -18415,6 +18440,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performTBZCombine(N, DCI, DAG);
   case AArch64ISD::CSEL:
     return performCSELCombine(N, DCI, DAG);
+  case AArch64ISD::ANDS:
+    return performANDSCombine(N, DCI);
   case AArch64ISD::DUP:
     return performPostLD1Combine(N, DCI, false);
   case AArch64ISD::NVCAST:
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -126,8 +126,7 @@ do.end:                                           ; preds = %4
 define i64 @test_and1(i64 %x, i64 %y) {
 ; CHECK-LABEL: test_and1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x3
-; CHECK-NEXT:    tst x0, #0x3
+; CHECK-NEXT:    ands x8, x0, #0x3
 ; CHECK-NEXT:    csel x0, x8, x1, eq
 ; CHECK-NEXT:    ret
   %a = and i64 %x, 3
@@ -151,22 +150,20 @@ define i64 @test_and2(i64 %x, i64 %y) {
 define i64 @test_and3(i64 %x, i64 %y) {
 ; CHECK-LABEL: test_and3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    and x21, x0, #0x3
 ; CHECK-NEXT:    mov x0, xzr
 ; CHECK-NEXT:    mov x19, x1
 ; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    tst x20, #0x3
-; CHECK-NEXT:    csel x0, x21, x19, eq
+; CHECK-NEXT:    ands x8, x20, #0x3
+; CHECK-NEXT:    csel x0, x8, x19, eq
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %a = and i64 %x, 3
   %b = call i64 @callee(i64 0)
@@ -185,8 +182,7 @@ define i64 @test_and_4(i64 %x, i64 %y) {
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    ands x0, x0, #0x3
 ; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    tst x19, #0x3
-; CHECK-NEXT:    and x8, x19, #0x3
+; CHECK-NEXT:    ands x8, x19, #0x3
 ; CHECK-NEXT:    csel x0, x8, x0, eq
 ; CHECK-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret