[AArch64] Use SVE2 bit-sel instructions for some binary patterns. #147544
Conversation
We can use NBSL/BSL2N to implement the following operations via the corresponding identities:

* EON(a, b) = BSL2N(a, a, b)
* NAND(a, b) = NBSL(a, b, b) = NBSL(b, a, a)
* NOR(a, b) = NBSL(a, b, a) = NBSL(b, a, b)
* ORN(a, b) = BSL2N(a, b, a)

These operations are currently lowered into at least two instructions because we don't have dedicated Neon/SVE instructions for them. With the appropriate pattern of NBSL/BSL2N we can lower them in a single instruction.

P.S. We can also use NBSL to implement an unpredicated NOT(a) = NBSL(a, a, a). However, because of the tied register constraint, this may not always be profitable.
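The identities fall out of the bitwise semantics of the ternary instructions. A minimal C sketch checking them, assuming the architectural per-bit definitions BSL2N(op1, op2, op3) = (op1 & op3) | (~op2 & ~op3) and NBSL(op1, op2, op3) = ~((op1 & op3) | (op2 & ~op3)):

```c
#include <assert.h>
#include <stdint.h>

/* Assumed per-bit semantics of the SVE2 ternary bit-select ops:
   BSL2N: dst = (op1 & op3) | (~op2 & ~op3)
   NBSL:  dst = ~((op1 & op3) | (op2 & ~op3))                    */
static uint64_t bsl2n(uint64_t a, uint64_t b, uint64_t c) {
  return (a & c) | (~b & ~c);
}
static uint64_t nbsl(uint64_t a, uint64_t b, uint64_t c) {
  return ~((a & c) | (b & ~c));
}

int main(void) {
  uint64_t a = 0xA5A5F00F12345678ULL, b = 0x0FF0C3C39ABCDEF0ULL;
  assert(bsl2n(a, a, b) == ~(a ^ b)); /* EON  */
  assert(nbsl(a, b, b)  == ~(a & b)); /* NAND */
  assert(nbsl(a, b, a)  == ~(a | b)); /* NOR  */
  assert(bsl2n(a, b, a) == (a | ~b)); /* ORN  */
  assert(nbsl(a, a, a)  == ~a);       /* NOT; not patterned here, since the
                                         destructive (tied) first operand can
                                         force an extra copy after regalloc. */
  return 0;
}
```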
@llvm/pr-subscribers-backend-aarch64
Author: Ricardo Jesus (rj-jesus)
Full diff: https://github.com/llvm/llvm-project/pull/147544.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 261df563bb2a9..8f02fc0b647ac 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4034,6 +4034,36 @@ let Predicates = [HasSVE2_or_SME] in {
defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", AArch64bsl2n>;
defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", AArch64nbsl>;
+ multiclass binary_bitwise<ValueType VT, SDPatternOperator InOp, SDPatternOperator OutOp> {
+ def : Pat<(InOp VT:$op1, VT:$op2), (OutOp $op1, $op2)>;
+
+ def : Pat<(SVEType<VT>.DSub (InOp V64:$op1, V64:$op2)),
+ (EXTRACT_SUBREG (OutOp (INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.DSub $op1), dsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.DSub $op2), dsub)), dsub)>;
+
+ def : Pat<(SVEType<VT>.ZSub (InOp V128:$op1, V128:$op2)),
+ (EXTRACT_SUBREG (OutOp (INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.ZSub $op1), zsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.ZSub $op2), zsub)), zsub)>;
+ }
+
+ foreach VT = [nxv16i8, nxv8i16, nxv4i32, nxv2i64] in {
+ // EON (a, b) = BSL2N (a, a, b) = BSL2N (b, b, a)
+ defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (vnot (xor node:$op1, node:$op2))>,
+ OutPatFrag<(ops node:$op1, node:$op2), (BSL2N_ZZZZ $op1, $op1, $op2)>>;
+
+ // NAND (a, b) = NBSL (a, b, b) = NBSL (b, a, a)
+ defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (vnot (and node:$op1, node:$op2))>,
+ OutPatFrag<(ops node:$op1, node:$op2), (NBSL_ZZZZ $op2, $op1, $op1)>>;
+
+ // NOR (a, b) = NBSL (a, b, a) = NBSL (b, a, b)
+ defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (vnot (or node:$op1, node:$op2))>,
+ OutPatFrag<(ops node:$op1, node:$op2), (NBSL_ZZZZ $op2, $op1, $op2)>>;
+
+ // ORN (a, b) = BSL2N (a, b, a)
+ defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (or node:$op1, (vnot node:$op2))>,
+ OutPatFrag<(ops node:$op1, node:$op2), (BSL2N_ZZZZ $op1, $op2, $op1)>>;
+ }
+
// SVE2 bitwise xor and rotate right by immediate
defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;
diff --git a/llvm/test/CodeGen/AArch64/bsl.ll b/llvm/test/CodeGen/AArch64/bsl.ll
index 5a270bc71cfc1..df6b6f75b8935 100644
--- a/llvm/test/CodeGen/AArch64/bsl.ll
+++ b/llvm/test/CodeGen/AArch64/bsl.ll
@@ -431,3 +431,94 @@ define <4 x i8> @bsl2n_v4i8(<4 x i8> %0, <4 x i8> %1, <4 x i8> %2) {
%7 = or <4 x i8> %4, %6
ret <4 x i8> %7
}
+
+; NOT (a) has a dedicated instruction (MVN).
+define <2 x i64> @not_q(<2 x i64> %0) #0 {
+; NEON-LABEL: not_q:
+; NEON: // %bb.0:
+; NEON-NEXT: mvn v0.16b, v0.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: not_q:
+; SVE2: // %bb.0:
+; SVE2-NEXT: mvn v0.16b, v0.16b
+; SVE2-NEXT: ret
+ %2 = xor <2 x i64> %0, splat (i64 -1)
+ ret <2 x i64> %2
+}
+
+; NAND (a, b) = NBSL (a, b, b) = NBSL (b, a, a).
+define <2 x i64> @nand_q(<2 x i64> %0, <2 x i64> %1) #0 {
+; NEON-LABEL: nand_q:
+; NEON: // %bb.0:
+; NEON-NEXT: and v0.16b, v1.16b, v0.16b
+; NEON-NEXT: mvn v0.16b, v0.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: nand_q:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z1.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %3 = and <2 x i64> %1, %0
+ %4 = xor <2 x i64> %3, splat (i64 -1)
+ ret <2 x i64> %4
+}
+
+; NOR (a, b) = NBSL (a, b, a) = NBSL (b, a, b).
+define <2 x i64> @nor_q(<2 x i64> %0, <2 x i64> %1) #0 {
+; NEON-LABEL: nor_q:
+; NEON: // %bb.0:
+; NEON-NEXT: orr v0.16b, v1.16b, v0.16b
+; NEON-NEXT: mvn v0.16b, v0.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: nor_q:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z0.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %3 = or <2 x i64> %1, %0
+ %4 = xor <2 x i64> %3, splat (i64 -1)
+ ret <2 x i64> %4
+}
+
+; EON (a, b) = BSL2N (a, a, b) = BSL2N (b, b, a).
+define <2 x i64> @eon_q(<2 x i64> %0, <2 x i64> %1) #0 {
+; NEON-LABEL: eon_q:
+; NEON: // %bb.0:
+; NEON-NEXT: eor v0.16b, v0.16b, v1.16b
+; NEON-NEXT: mvn v0.16b, v0.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: eon_q:
+; SVE2: // %bb.0:
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: bsl2n z0.d, z0.d, z0.d, z1.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %3 = xor <2 x i64> %0, %1
+ %4 = xor <2 x i64> %3, splat (i64 -1)
+ ret <2 x i64> %4
+}
+
+; ORN (a, b) has a dedicated instruction (ORN).
+define <2 x i64> @orn_q(<2 x i64> %0, <2 x i64> %1) #0 {
+; NEON-LABEL: orn_q:
+; NEON: // %bb.0:
+; NEON-NEXT: orn v0.16b, v0.16b, v1.16b
+; NEON-NEXT: ret
+;
+; SVE2-LABEL: orn_q:
+; SVE2: // %bb.0:
+; SVE2-NEXT: orn v0.16b, v0.16b, v1.16b
+; SVE2-NEXT: ret
+ %3 = xor <2 x i64> %1, splat (i64 -1)
+ %4 = or <2 x i64> %0, %3
+ ret <2 x i64> %4
+}
diff --git a/llvm/test/CodeGen/AArch64/eor3.ll b/llvm/test/CodeGen/AArch64/eor3.ll
index b89d9d608575c..eccd09131b525 100644
--- a/llvm/test/CodeGen/AArch64/eor3.ll
+++ b/llvm/test/CodeGen/AArch64/eor3.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
-; RUN: llc -mtriple=aarch64 -mattr=+sha3,+sve2 < %s | FileCheck --check-prefix=SHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3,+sve2 < %s | FileCheck --check-prefix=SHA3-SVE2 %s
define <16 x i8> @eor3_16x8_left(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; SHA3-LABEL: eor3_16x8_left:
@@ -24,6 +24,11 @@ define <16 x i8> @eor3_16x8_left(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: mov v0.16b, v2.16b
; SVE2-NEXT: ret
+;
+; SHA3-SVE2-LABEL: eor3_16x8_left:
+; SHA3-SVE2: // %bb.0:
+; SHA3-SVE2-NEXT: eor3 v0.16b, v0.16b, v1.16b, v2.16b
+; SHA3-SVE2-NEXT: ret
%4 = xor <16 x i8> %0, %1
%5 = xor <16 x i8> %2, %4
ret <16 x i8> %5
@@ -49,6 +54,11 @@ define <16 x i8> @eor3_16x8_right(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
; SVE2-NEXT: mov v0.16b, v1.16b
; SVE2-NEXT: ret
+;
+; SHA3-SVE2-LABEL: eor3_16x8_right:
+; SHA3-SVE2: // %bb.0:
+; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT: ret
%4 = xor <16 x i8> %1, %2
%5 = xor <16 x i8> %4, %0
ret <16 x i8> %5
@@ -74,6 +84,11 @@ define <8 x i16> @eor3_8x16_left(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) {
; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: mov v0.16b, v2.16b
; SVE2-NEXT: ret
+;
+; SHA3-SVE2-LABEL: eor3_8x16_left:
+; SHA3-SVE2: // %bb.0:
+; SHA3-SVE2-NEXT: eor3 v0.16b, v0.16b, v1.16b, v2.16b
+; SHA3-SVE2-NEXT: ret
%4 = xor <8 x i16> %0, %1
%5 = xor <8 x i16> %2, %4
ret <8 x i16> %5
@@ -99,6 +114,11 @@ define <8 x i16> @eor3_8x16_right(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) {
; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
; SVE2-NEXT: mov v0.16b, v1.16b
; SVE2-NEXT: ret
+;
+; SHA3-SVE2-LABEL: eor3_8x16_right:
+; SHA3-SVE2: // %bb.0:
+; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT: ret
%4 = xor <8 x i16> %1, %2
%5 = xor <8 x i16> %4, %0
ret <8 x i16> %5
@@ -124,6 +144,11 @@ define <4 x i32> @eor3_4x32_left(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) {
; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: mov v0.16b, v2.16b
; SVE2-NEXT: ret
+;
+; SHA3-SVE2-LABEL: eor3_4x32_left:
+; SHA3-SVE2: // %bb.0:
+; SHA3-SVE2-NEXT: eor3 v0.16b, v0.16b, v1.16b, v2.16b
+; SHA3-SVE2-NEXT: ret
%4 = xor <4 x i32> %0, %1
%5 = xor <4 x i32> %2, %4
ret <4 x i32> %5
@@ -149,6 +174,11 @@ define <4 x i32> @eor3_4x32_right(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) {
; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
; SVE2-NEXT: mov v0.16b, v1.16b
; SVE2-NEXT: ret
+;
+; SHA3-SVE2-LABEL: eor3_4x32_right:
+; SHA3-SVE2: // %bb.0:
+; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT: ret
%4 = xor <4 x i32> %1, %2
%5 = xor <4 x i32> %4, %0
ret <4 x i32> %5
@@ -174,6 +204,11 @@ define <2 x i64> @eor3_2x64_left(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: mov v0.16b, v2.16b
; SVE2-NEXT: ret
+;
+; SHA3-SVE2-LABEL: eor3_2x64_left:
+; SHA3-SVE2: // %bb.0:
+; SHA3-SVE2-NEXT: eor3 v0.16b, v0.16b, v1.16b, v2.16b
+; SHA3-SVE2-NEXT: ret
%4 = xor <2 x i64> %0, %1
%5 = xor <2 x i64> %2, %4
ret <2 x i64> %5
@@ -199,6 +234,11 @@ define <2 x i64> @eor3_2x64_right(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
; SVE2-NEXT: mov v0.16b, v1.16b
; SVE2-NEXT: ret
+;
+; SHA3-SVE2-LABEL: eor3_2x64_right:
+; SHA3-SVE2: // %bb.0:
+; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT: ret
%4 = xor <2 x i64> %1, %2
%5 = xor <2 x i64> %4, %0
ret <2 x i64> %5
@@ -219,9 +259,19 @@ define <2 x i64> @eor3_vnot(<2 x i64> %0, <2 x i64> %1) {
;
; SVE2-LABEL: eor3_vnot:
; SVE2: // %bb.0:
-; SVE2-NEXT: eor v0.16b, v0.16b, v1.16b
-; SVE2-NEXT: mvn v0.16b, v0.16b
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: bsl2n z0.d, z0.d, z0.d, z1.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT: ret
+;
+; SHA3-SVE2-LABEL: eor3_vnot:
+; SHA3-SVE2: // %bb.0:
+; SHA3-SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SHA3-SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SHA3-SVE2-NEXT: bsl2n z0.d, z0.d, z0.d, z1.d
+; SHA3-SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SHA3-SVE2-NEXT: ret
%3 = xor <2 x i64> %0, <i64 -1, i64 -1>
%4 = xor <2 x i64> %3, %1
ret <2 x i64> %4
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
index 30ec2de2bd9cc..9a78726c450d1 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
@@ -322,11 +322,9 @@ entry:
define <vscale x 4 x i32> @ornot_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
; CHECK-LABEL: ornot_v4i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z3.s, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: bsl2n z1.d, z1.d, z2.d, z1.d
; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
-; CHECK-NEXT: eor z2.d, z2.d, z3.d
-; CHECK-NEXT: orr z1.d, z1.d, z2.d
; CHECK-NEXT: mov z0.s, p0/m, z1.s
; CHECK-NEXT: ret
entry:
@@ -340,11 +338,9 @@ entry:
define <vscale x 8 x i16> @ornot_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
; CHECK-LABEL: ornot_v8i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z3.h, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: bsl2n z1.d, z1.d, z2.d, z1.d
; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
-; CHECK-NEXT: eor z2.d, z2.d, z3.d
-; CHECK-NEXT: orr z1.d, z1.d, z2.d
; CHECK-NEXT: mov z0.h, p0/m, z1.h
; CHECK-NEXT: ret
entry:
@@ -358,11 +354,9 @@ entry:
define <vscale x 16 x i8> @ornot_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
; CHECK-LABEL: ornot_v16i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z3.b, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: bsl2n z1.d, z1.d, z2.d, z1.d
; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: eor z2.d, z2.d, z3.d
-; CHECK-NEXT: orr z1.d, z1.d, z2.d
; CHECK-NEXT: mov z0.b, p0/m, z1.b
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 8aedeac18f64a..6cfe66eb8e633 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -312,3 +312,60 @@ entry:
%t3 = xor <vscale x 4 x i32> %t2, %b
ret <vscale x 4 x i32> %t3
}
+
+; NOT (a) = NBSL (a, a, a).
+; We don't have a pattern for this right now because the tied register
+; constraint can lead to worse code gen.
+define <vscale x 2 x i64> @not(<vscale x 2 x i64> %0) #0 {
+; CHECK-LABEL: not:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %2 = xor <vscale x 2 x i64> %0, splat (i64 -1)
+ ret <vscale x 2 x i64> %2
+}
+
+; NAND (a, b) = NBSL (a, b, b) = NBSL (b, a, a).
+define <vscale x 2 x i64> @nand(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: nand:
+; CHECK: // %bb.0:
+; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z1.d
+; CHECK-NEXT: ret
+ %3 = and <vscale x 2 x i64> %1, %0
+ %4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
+ ret <vscale x 2 x i64> %4
+}
+
+; NOR (a, b) = NBSL (a, b, a) = NBSL (b, a, b).
+define <vscale x 2 x i64> @nor(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: nor:
+; CHECK: // %bb.0:
+; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z0.d
+; CHECK-NEXT: ret
+ %3 = or <vscale x 2 x i64> %1, %0
+ %4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
+ ret <vscale x 2 x i64> %4
+}
+
+; EON (a, b) = BSL2N (a, a, b) = BSL2N (b, b, a).
+define <vscale x 2 x i64> @eon(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: eon:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl2n z0.d, z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+ %3 = xor <vscale x 2 x i64> %0, %1
+ %4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
+ ret <vscale x 2 x i64> %4
+}
+
+; ORN (a, b) = BSL2N (a, b, a).
+define <vscale x 2 x i64> @orn(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: orn:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl2n z0.d, z0.d, z1.d, z0.d
+; CHECK-NEXT: ret
+ %3 = xor <vscale x 2 x i64> %1, splat (i64 -1)
+ %4 = or <vscale x 2 x i64> %0, %3
+ ret <vscale x 2 x i64> %4
+}
(Review thread on the new binary_bitwise foreach block in llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td, quoted from the diff above.)
Is it not possible to just update the existing AArch64*bsl* PatFrags? I guess it's not possible to construct the third operand?
It wasn't obvious to me how that could be done while propagating the order of the operands into the *BSL* instructions, but if there's an example you could point me to, I'd be happy to give it a try.
That's ok. I realised my mistake as soon as I wrote the comment but felt committed :)
I'll investigate more next time I'm spring cleaning the patterns.
I think we may end up wanting to introduce some pseudo instructions so that we can pick the operands based on the result of register allocation, but for today I doubt we'll ever be worse off, so let's see how it plays out.