[AArch64] Use SVE2 bit-sel instructions for some binary patterns. #147544

Status: Open · wants to merge 2 commits into main
30 changes: 30 additions & 0 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4034,6 +4034,36 @@ let Predicates = [HasSVE2_or_SME] in {
defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", AArch64bsl2n>;
defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", AArch64nbsl>;

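// Map a binary bitwise operation onto an SVE2 ternary instruction. The first
// pattern handles scalable vectors directly; the other two handle 64-bit and
// 128-bit fixed-length vectors by widening the operands into Z registers and
// extracting the result from the destination Z register.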
multiclass binary_bitwise<ValueType VT, SDPatternOperator InOp, SDPatternOperator OutOp> {
def : Pat<(InOp VT:$op1, VT:$op2), (OutOp $op1, $op2)>;

def : Pat<(SVEType<VT>.DSub (InOp V64:$op1, V64:$op2)),
(EXTRACT_SUBREG (OutOp (INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.DSub $op1), dsub),
(INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.DSub $op2), dsub)), dsub)>;

def : Pat<(SVEType<VT>.ZSub (InOp V128:$op1, V128:$op2)),
(EXTRACT_SUBREG (OutOp (INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.ZSub $op1), zsub),
(INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.ZSub $op2), zsub)), zsub)>;
}

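// The selections below follow from the semantics of the destructive
// (Zdn, Zm, Zk) forms of the two instructions:
//   BSL2N: Zdn = (Zdn & Zk) | (~Zm & ~Zk)
//   NBSL:  Zdn = ~((Zdn & Zk) | (Zm & ~Zk))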
foreach VT = [nxv16i8, nxv8i16, nxv4i32, nxv2i64] in {
// EON (a, b) = BSL2N (a, a, b) = BSL2N (b, b, a)
defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (vnot (xor node:$op1, node:$op2))>,
OutPatFrag<(ops node:$op1, node:$op2), (BSL2N_ZZZZ $op1, $op1, $op2)>>;

// NAND (a, b) = NBSL (a, b, b) = NBSL (b, a, a)
defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (vnot (and node:$op1, node:$op2))>,
OutPatFrag<(ops node:$op1, node:$op2), (NBSL_ZZZZ $op2, $op1, $op1)>>;

// NOR (a, b) = NBSL (a, b, a) = NBSL (b, a, b)
defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (vnot (or node:$op1, node:$op2))>,
OutPatFrag<(ops node:$op1, node:$op2), (NBSL_ZZZZ $op2, $op1, $op2)>>;

// ORN (a, b) = BSL2N (a, b, a)
defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (or node:$op1, (vnot node:$op2))>,
OutPatFrag<(ops node:$op1, node:$op2), (BSL2N_ZZZZ $op1, $op2, $op1)>>;
}
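
As a sanity check on the identities used above, the destructive (Zdn, Zm, Zk) semantics of both instructions can be modelled bit-parallel in plain C and the four equivalences asserted directly. This is a minimal illustrative sketch, not part of the patch: the helper names are invented, and the semantics are transcribed from the Arm ARM descriptions of BSL2N and NBSL.

#include <assert.h>
#include <stdint.h>

/* Bit-parallel models of the destructive SVE2 ternary ops (Zdn, Zm, Zk). */
static uint64_t bsl2n(uint64_t dn, uint64_t m, uint64_t k) {
  return (dn & k) | (~m & ~k);
}

static uint64_t nbsl(uint64_t dn, uint64_t m, uint64_t k) {
  return ~((dn & k) | (m & ~k));
}

int main(void) {
  uint64_t a = 0xF0F0A5A5C3C30FF0ull, b = 0x1234FEDC5678AB90ull;
  assert(bsl2n(a, a, b) == ~(a ^ b)); /* EON  (a, b) = BSL2N (a, a, b) */
  assert(nbsl(b, a, a) == ~(a & b));  /* NAND (a, b) = NBSL (b, a, a)  */
  assert(nbsl(b, a, b) == ~(a | b));  /* NOR  (a, b) = NBSL (b, a, b)  */
  assert(bsl2n(a, b, a) == (a | ~b)); /* ORN  (a, b) = BSL2N (a, b, a) */
  return 0;
}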
paulwalker-arm (Collaborator) commented on lines +4049 to +4065 on Jul 8, 2025:
Is it not possible to just update the existing AArch64*bsl* PatFrags? I guess it's not possible to construct the third operand?

The PR author (Contributor) replied:

It wasn't obvious to me how that could be done while still propagating the operand order into the *BSL* instructions, but if there's an example you could point me to, I'd be happy to give it a try.

paulwalker-arm replied:

That's ok. I realised my mistake as soon as I wrote the comment but felt committed :)

I'll investigate more next time I'm spring cleaning the patterns.


// SVE2 bitwise xor and rotate right by immediate
defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;

91 changes: 91 additions & 0 deletions llvm/test/CodeGen/AArch64/bsl.ll
@@ -431,3 +431,94 @@ define <4 x i8> @bsl2n_v4i8(<4 x i8> %0, <4 x i8> %1, <4 x i8> %2) {
%7 = or <4 x i8> %4, %6
ret <4 x i8> %7
}

; NOT (a) has a dedicated instruction (MVN).
define <2 x i64> @not_q(<2 x i64> %0) #0 {
; NEON-LABEL: not_q:
; NEON: // %bb.0:
; NEON-NEXT: mvn v0.16b, v0.16b
; NEON-NEXT: ret
;
; SVE2-LABEL: not_q:
; SVE2: // %bb.0:
; SVE2-NEXT: mvn v0.16b, v0.16b
; SVE2-NEXT: ret
%2 = xor <2 x i64> %0, splat (i64 -1)
ret <2 x i64> %2
}

; NAND (a, b) = NBSL (a, b, b) = NBSL (b, a, a).
define <2 x i64> @nand_q(<2 x i64> %0, <2 x i64> %1) #0 {
; NEON-LABEL: nand_q:
; NEON: // %bb.0:
; NEON-NEXT: and v0.16b, v1.16b, v0.16b
; NEON-NEXT: mvn v0.16b, v0.16b
; NEON-NEXT: ret
;
; SVE2-LABEL: nand_q:
; SVE2: // %bb.0:
; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z1.d
; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT: ret
%3 = and <2 x i64> %1, %0
%4 = xor <2 x i64> %3, splat (i64 -1)
ret <2 x i64> %4
}

; NOR (a, b) = NBSL (a, b, a) = NBSL (b, a, b).
define <2 x i64> @nor_q(<2 x i64> %0, <2 x i64> %1) #0 {
; NEON-LABEL: nor_q:
; NEON: // %bb.0:
; NEON-NEXT: orr v0.16b, v1.16b, v0.16b
; NEON-NEXT: mvn v0.16b, v0.16b
; NEON-NEXT: ret
;
; SVE2-LABEL: nor_q:
; SVE2: // %bb.0:
; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT: nbsl z0.d, z0.d, z1.d, z0.d
; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT: ret
%3 = or <2 x i64> %1, %0
%4 = xor <2 x i64> %3, splat (i64 -1)
ret <2 x i64> %4
}

; EON (a, b) = BSL2N (a, a, b) = BSL2N (b, b, a).
define <2 x i64> @eon_q(<2 x i64> %0, <2 x i64> %1) #0 {
; NEON-LABEL: eon_q:
; NEON: // %bb.0:
; NEON-NEXT: eor v0.16b, v0.16b, v1.16b
; NEON-NEXT: mvn v0.16b, v0.16b
; NEON-NEXT: ret
;
; SVE2-LABEL: eon_q:
; SVE2: // %bb.0:
; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT: bsl2n z0.d, z0.d, z0.d, z1.d
; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT: ret
%3 = xor <2 x i64> %0, %1
%4 = xor <2 x i64> %3, splat (i64 -1)
ret <2 x i64> %4
}

; ORN (a, b) has a dedicated instruction (ORN).
define <2 x i64> @orn_q(<2 x i64> %0, <2 x i64> %1) #0 {
; NEON-LABEL: orn_q:
; NEON: // %bb.0:
; NEON-NEXT: orn v0.16b, v0.16b, v1.16b
; NEON-NEXT: ret
;
; SVE2-LABEL: orn_q:
; SVE2: // %bb.0:
; SVE2-NEXT: orn v0.16b, v0.16b, v1.16b
; SVE2-NEXT: ret
%3 = xor <2 x i64> %1, splat (i64 -1)
%4 = or <2 x i64> %0, %3
ret <2 x i64> %4
}
56 changes: 53 additions & 3 deletions llvm/test/CodeGen/AArch64/eor3.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
-; RUN: llc -mtriple=aarch64 -mattr=+sha3,+sve2 < %s | FileCheck --check-prefix=SHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3,+sve2 < %s | FileCheck --check-prefix=SHA3-SVE2 %s

define <16 x i8> @eor3_16x8_left(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; SHA3-LABEL: eor3_16x8_left:
@@ -24,6 +24,11 @@ define <16 x i8> @eor3_16x8_left(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: mov v0.16b, v2.16b
; SVE2-NEXT: ret
;
; SHA3-SVE2-LABEL: eor3_16x8_left:
; SHA3-SVE2: // %bb.0:
; SHA3-SVE2-NEXT: eor3 v0.16b, v0.16b, v1.16b, v2.16b
; SHA3-SVE2-NEXT: ret
%4 = xor <16 x i8> %0, %1
%5 = xor <16 x i8> %2, %4
ret <16 x i8> %5
@@ -49,6 +54,11 @@ define <16 x i8> @eor3_16x8_right(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
; SVE2-NEXT: mov v0.16b, v1.16b
; SVE2-NEXT: ret
;
; SHA3-SVE2-LABEL: eor3_16x8_right:
; SHA3-SVE2: // %bb.0:
; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b
; SHA3-SVE2-NEXT: ret
%4 = xor <16 x i8> %1, %2
%5 = xor <16 x i8> %4, %0
ret <16 x i8> %5
@@ -74,6 +84,11 @@ define <8 x i16> @eor3_8x16_left(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) {
; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: mov v0.16b, v2.16b
; SVE2-NEXT: ret
;
; SHA3-SVE2-LABEL: eor3_8x16_left:
; SHA3-SVE2: // %bb.0:
; SHA3-SVE2-NEXT: eor3 v0.16b, v0.16b, v1.16b, v2.16b
; SHA3-SVE2-NEXT: ret
%4 = xor <8 x i16> %0, %1
%5 = xor <8 x i16> %2, %4
ret <8 x i16> %5
@@ -99,6 +114,11 @@ define <8 x i16> @eor3_8x16_right(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) {
; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
; SVE2-NEXT: mov v0.16b, v1.16b
; SVE2-NEXT: ret
;
; SHA3-SVE2-LABEL: eor3_8x16_right:
; SHA3-SVE2: // %bb.0:
; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b
; SHA3-SVE2-NEXT: ret
%4 = xor <8 x i16> %1, %2
%5 = xor <8 x i16> %4, %0
ret <8 x i16> %5
@@ -124,6 +144,11 @@ define <4 x i32> @eor3_4x32_left(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) {
; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: mov v0.16b, v2.16b
; SVE2-NEXT: ret
;
; SHA3-SVE2-LABEL: eor3_4x32_left:
; SHA3-SVE2: // %bb.0:
; SHA3-SVE2-NEXT: eor3 v0.16b, v0.16b, v1.16b, v2.16b
; SHA3-SVE2-NEXT: ret
%4 = xor <4 x i32> %0, %1
%5 = xor <4 x i32> %2, %4
ret <4 x i32> %5
@@ -149,6 +174,11 @@ define <4 x i32> @eor3_4x32_right(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) {
; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
; SVE2-NEXT: mov v0.16b, v1.16b
; SVE2-NEXT: ret
;
; SHA3-SVE2-LABEL: eor3_4x32_right:
; SHA3-SVE2: // %bb.0:
; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b
; SHA3-SVE2-NEXT: ret
%4 = xor <4 x i32> %1, %2
%5 = xor <4 x i32> %4, %0
ret <4 x i32> %5
@@ -174,6 +204,11 @@ define <2 x i64> @eor3_2x64_left(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
; SVE2-NEXT: eor3 z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: mov v0.16b, v2.16b
; SVE2-NEXT: ret
;
; SHA3-SVE2-LABEL: eor3_2x64_left:
; SHA3-SVE2: // %bb.0:
; SHA3-SVE2-NEXT: eor3 v0.16b, v0.16b, v1.16b, v2.16b
; SHA3-SVE2-NEXT: ret
%4 = xor <2 x i64> %0, %1
%5 = xor <2 x i64> %2, %4
ret <2 x i64> %5
@@ -199,6 +234,11 @@ define <2 x i64> @eor3_2x64_right(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
; SVE2-NEXT: eor3 z1.d, z1.d, z2.d, z0.d
; SVE2-NEXT: mov v0.16b, v1.16b
; SVE2-NEXT: ret
;
; SHA3-SVE2-LABEL: eor3_2x64_right:
; SHA3-SVE2: // %bb.0:
; SHA3-SVE2-NEXT: eor3 v0.16b, v1.16b, v2.16b, v0.16b
; SHA3-SVE2-NEXT: ret
%4 = xor <2 x i64> %1, %2
%5 = xor <2 x i64> %4, %0
ret <2 x i64> %5
@@ -219,9 +259,19 @@ define <2 x i64> @eor3_vnot(<2 x i64> %0, <2 x i64> %1) {
;
; SVE2-LABEL: eor3_vnot:
; SVE2: // %bb.0:
-; SVE2-NEXT: eor v0.16b, v0.16b, v1.16b
-; SVE2-NEXT: mvn v0.16b, v0.16b
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT: bsl2n z0.d, z0.d, z0.d, z1.d
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT: ret
;
; SHA3-SVE2-LABEL: eor3_vnot:
; SHA3-SVE2: // %bb.0:
; SHA3-SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
; SHA3-SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
; SHA3-SVE2-NEXT: bsl2n z0.d, z0.d, z0.d, z1.d
; SHA3-SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
; SHA3-SVE2-NEXT: ret
%3 = xor <2 x i64> %0, <i64 -1, i64 -1>
%4 = xor <2 x i64> %3, %1
ret <2 x i64> %4
12 changes: 3 additions & 9 deletions llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
@@ -322,11 +322,9 @@ entry:
define <vscale x 4 x i32> @ornot_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
; CHECK-LABEL: ornot_v4i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z3.s, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: bsl2n z1.d, z1.d, z2.d, z1.d
; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
-; CHECK-NEXT: eor z2.d, z2.d, z3.d
-; CHECK-NEXT: orr z1.d, z1.d, z2.d
; CHECK-NEXT: mov z0.s, p0/m, z1.s
; CHECK-NEXT: ret
entry:
@@ -340,11 +338,9 @@
define <vscale x 8 x i16> @ornot_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
; CHECK-LABEL: ornot_v8i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z3.h, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: bsl2n z1.d, z1.d, z2.d, z1.d
; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
-; CHECK-NEXT: eor z2.d, z2.d, z3.d
-; CHECK-NEXT: orr z1.d, z1.d, z2.d
; CHECK-NEXT: mov z0.h, p0/m, z1.h
; CHECK-NEXT: ret
entry:
@@ -358,11 +354,9 @@
define <vscale x 16 x i8> @ornot_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
; CHECK-LABEL: ornot_v16i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z3.b, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: bsl2n z1.d, z1.d, z2.d, z1.d
; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: eor z2.d, z2.d, z3.d
-; CHECK-NEXT: orr z1.d, z1.d, z2.d
; CHECK-NEXT: mov z0.b, p0/m, z1.b
; CHECK-NEXT: ret
entry:
57 changes: 57 additions & 0 deletions llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -312,3 +312,60 @@
%t3 = xor <vscale x 4 x i32> %t2, %b
ret <vscale x 4 x i32> %t3
}

; NOT (a) = NBSL (a, a, a).
; We don't have a pattern for this right now because the tied register
; constraint can lead to worse code gen.
define <vscale x 2 x i64> @not(<vscale x 2 x i64> %0) #0 {
; CHECK-LABEL: not:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, #-1 // =0xffffffffffffffff
; CHECK-NEXT: eor z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%2 = xor <vscale x 2 x i64> %0, splat (i64 -1)
ret <vscale x 2 x i64> %2
}

; NAND (a, b) = NBSL (a, b, b) = NBSL (b, a, a).
define <vscale x 2 x i64> @nand(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
; CHECK-LABEL: nand:
; CHECK: // %bb.0:
; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z1.d
; CHECK-NEXT: ret
%3 = and <vscale x 2 x i64> %1, %0
%4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
ret <vscale x 2 x i64> %4
}

; NOR (a, b) = NBSL (a, b, a) = NBSL (b, a, b).
define <vscale x 2 x i64> @nor(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
; CHECK-LABEL: nor:
; CHECK: // %bb.0:
; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z0.d
; CHECK-NEXT: ret
%3 = or <vscale x 2 x i64> %1, %0
%4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
ret <vscale x 2 x i64> %4
}

; EON (a, b) = BSL2N (a, a, b) = BSL2N (b, b, a).
define <vscale x 2 x i64> @eon(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
; CHECK-LABEL: eon:
; CHECK: // %bb.0:
; CHECK-NEXT: bsl2n z0.d, z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%3 = xor <vscale x 2 x i64> %0, %1
%4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
ret <vscale x 2 x i64> %4
}

; ORN (a, b) = BSL2N (a, b, a).
define <vscale x 2 x i64> @orn(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
; CHECK-LABEL: orn:
; CHECK: // %bb.0:
; CHECK-NEXT: bsl2n z0.d, z0.d, z1.d, z0.d
; CHECK-NEXT: ret
%3 = xor <vscale x 2 x i64> %1, splat (i64 -1)
%4 = or <vscale x 2 x i64> %0, %3
ret <vscale x 2 x i64> %4
}