
Commit dace686

Author: Sebastian Pop
[x86][aarch64] ask the backend whether it has a vector blend instruction
The code to match and produce more x86 vector blends was enabled for all
architectures, even though the transform may pessimize the code for other
architectures that do not provide a vector blend instruction.

Added an aarch64 testcase to check that a VZIP instruction is generated
instead of byte movs.

Differential Revision: https://reviews.llvm.org/D44118

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@327132 91177308-0d34-0410-b5e6-96231b3b80d8
Parent: 0d15ae3

5 files changed: +43 −26 lines

include/llvm/CodeGen/TargetLowering.h

Lines changed: 3 additions & 0 deletions
@@ -2117,6 +2117,9 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Return true if the target has a vector blend instruction.
+  virtual bool hasVectorBlend() const { return false; }
+
   /// \brief Get the maximum supported factor for interleaved memory accesses.
   /// Default to be the minimum interleave factor: 2.
   virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
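
The new hook follows the usual TargetLowering opt-in pattern: a conservative default in the base class, overridden by targets that actually have the instruction. Below is a minimal standalone C++ sketch of that pattern; the simplified classes are illustrative stand-ins, not LLVM's actual hierarchy.

#include <iostream>

// Sketch of the hook shape this commit introduces: the base class
// answers "no", so the shuffle transform stays off unless a target
// explicitly opts in.
struct TargetLoweringBase {
  virtual ~TargetLoweringBase() = default;
  virtual bool hasVectorBlend() const { return false; }
};

// A target with a native vector blend (e.g. x86's BLENDPS/VPBLENDD)
// overrides the hook to enable the canonicalization.
struct BlendCapableLowering : TargetLoweringBase {
  bool hasVectorBlend() const override { return true; }
};

int main() {
  BlendCapableLowering Blend;
  TargetLoweringBase Default;
  std::cout << Blend.hasVectorBlend() << ' '
            << Default.hasVectorBlend() << '\n'; // prints "1 0"
}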

lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 26 additions & 24 deletions
@@ -1557,33 +1557,35 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
   if (N1.isUndef())
     commuteShuffle(N1, N2, MaskVec);
 
-  // If shuffling a splat, try to blend the splat instead. We do this here so
-  // that even when this arises during lowering we don't have to re-handle it.
-  auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
-    BitVector UndefElements;
-    SDValue Splat = BV->getSplatValue(&UndefElements);
-    if (!Splat)
-      return;
+  if (TLI->hasVectorBlend()) {
+    // If shuffling a splat, try to blend the splat instead. We do this here so
+    // that even when this arises during lowering we don't have to re-handle it.
+    auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
+      BitVector UndefElements;
+      SDValue Splat = BV->getSplatValue(&UndefElements);
+      if (!Splat)
+        return;
 
-    for (int i = 0; i < NElts; ++i) {
-      if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
-        continue;
+      for (int i = 0; i < NElts; ++i) {
+        if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
+          continue;
 
-      // If this input comes from undef, mark it as such.
-      if (UndefElements[MaskVec[i] - Offset]) {
-        MaskVec[i] = -1;
-        continue;
-      }
+        // If this input comes from undef, mark it as such.
+        if (UndefElements[MaskVec[i] - Offset]) {
+          MaskVec[i] = -1;
+          continue;
+        }
 
-      // If we can blend a non-undef lane, use that instead.
-      if (!UndefElements[i])
-        MaskVec[i] = i + Offset;
-    }
-  };
-  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
-    BlendSplat(N1BV, 0);
-  if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
-    BlendSplat(N2BV, NElts);
+        // If we can blend a non-undef lane, use that instead.
+        if (!UndefElements[i])
+          MaskVec[i] = i + Offset;
+      }
+    };
+    if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+      BlendSplat(N1BV, 0);
+    if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
+      BlendSplat(N2BV, NElts);
+  }
 
   // Canonicalize all index into lhs, -> shuffle lhs, undef
   // Canonicalize all index into rhs, -> shuffle rhs, undef
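
To make the gated logic easier to follow outside of SelectionDAG, here is a self-contained C++ sketch of the BlendSplat mask rewrite using plain vectors; blendSplat and its parameters are simplifications for illustration, not LLVM API. Indices in [Offset, Offset + NElts) select lanes of a splat input, so any such index can be redirected to lane i in place (turning the shuffle into a blend), while lanes that are undef in the splat's build vector become undef (-1) in the mask.

#include <iostream>
#include <vector>

// MaskVec uses -1 for undef. UndefElements[j] is true when lane j of
// the splat input's build vector is undef. Offset is where that
// input's lanes start in the combined index space (0 or NElts).
static void blendSplat(std::vector<int> &MaskVec, int Offset,
                       const std::vector<bool> &UndefElements) {
  int NElts = static_cast<int>(MaskVec.size());
  for (int i = 0; i < NElts; ++i) {
    // Leave indices that select the other input untouched.
    if (MaskVec[i] < Offset || MaskVec[i] >= Offset + NElts)
      continue;
    // Reading an undef lane of the splat: mark the result lane undef.
    if (UndefElements[MaskVec[i] - Offset]) {
      MaskVec[i] = -1;
      continue;
    }
    // Every defined lane of a splat holds the same value, so read
    // lane i in place; the shuffle becomes a per-lane select (blend).
    if (!UndefElements[i])
      MaskVec[i] = i + Offset;
  }
}

int main() {
  // Second input (Offset = 4) is a splat whose lanes 2 and 3 are undef.
  std::vector<int> Mask = {4, 1, 6, 5};
  blendSplat(Mask, /*Offset=*/4, {false, false, true, true});
  for (int M : Mask)
    std::cout << M << ' '; // prints "4 1 -1 5"
  std::cout << '\n';
}

The rewrite only pays off when the resulting per-lane select is a single instruction, which is exactly the property the new hasVectorBlend() gate encodes.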

lib/Target/X86/X86ISelLowering.h

Lines changed: 2 additions & 0 deletions
@@ -1098,6 +1098,8 @@ namespace llvm {
 
     StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
 
+    bool hasVectorBlend() const override { return true; }
+
     unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
     /// \brief Lower interleaved load(s) into target specific

test/CodeGen/AArch64/aarch64-vuzp.ll

Lines changed: 10 additions & 0 deletions
@@ -58,3 +58,13 @@ entry:
   store <4 x i32> %y, <4 x i32>* %z, align 4
   ret void
 }
+
+; Check that this pattern is recognized as a VZIP and
+; that the vector blend transform does not scramble the pattern.
+; CHECK-LABEL: vzipNoBlend:
+; CHECK: zip1
+define <8 x i8> @vzipNoBlend(<8 x i8>* %A, <8 x i16>* %B) nounwind {
+  %t = load <8 x i8>, <8 x i8>* %A
+  %vzip = shufflevector <8 x i8> %t, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i8> %vzip
+}
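
For reference, the shuffle mask <0, 8, 1, 9, 2, 10, 3, 11> in the new test interleaves the low four lanes of its two 8-lane inputs, which is what AArch64's zip1 computes. Here is a standalone C++ sketch of those shufflevector semantics; the arrays and values are made up for illustration, not taken from the test.

#include <array>
#include <cstdint>
#include <iostream>

int main() {
  // Two 8-lane byte vectors; mask indices 0-7 select A, 8-15 select B.
  std::array<uint8_t, 8> A = {1, 2, 3, 4, 5, 6, 7, 8};
  std::array<uint8_t, 8> B = {0, 0, 0, 0, 9, 9, 9, 9};
  std::array<int, 8> Mask = {0, 8, 1, 9, 2, 10, 3, 11};

  // Apply the shuffle lane by lane: alternate low lanes of A and B.
  std::array<uint8_t, 8> Out{};
  for (int i = 0; i < 8; ++i)
    Out[i] = Mask[i] < 8 ? A[Mask[i]] : B[Mask[i] - 8];

  for (uint8_t V : Out)
    std::cout << int(V) << ' '; // prints "1 0 2 0 3 0 4 0"
  std::cout << '\n';
}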

test/CodeGen/AArch64/arm64-collect-loh.ll

Lines changed: 2 additions & 2 deletions
@@ -638,13 +638,13 @@ define void @setL(<1 x i8> %t) {
 ; CHECK: [[LOH_LABEL1:Lloh[0-9]+]]:
 ; CHECK: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF]
 ; The tuple comes from the next instruction.
-; CHECK-NEXT: tbl.16b v{{[0-9]+}}, { v{{[0-9]+}}, v{{[0-9]+}} }, v[[IDX]]
+; CHECK: ext.16b v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, #1
 ; CHECK: ret
 ; CHECK: .loh AdrpLdr [[LOH_LABEL0]], [[LOH_LABEL1]]
 define void @uninterestingSub(i8* nocapture %row) #0 {
   %tmp = bitcast i8* %row to <16 x i8>*
   %tmp1 = load <16 x i8>, <16 x i8>* %tmp, align 16
-  %vext43 = shufflevector <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %tmp1, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  %vext43 = shufflevector <16 x i8> <i8 undef, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2>, <16 x i8> %tmp1, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
   %add.i.414 = add <16 x i8> zeroinitializer, %vext43
   store <16 x i8> %add.i.414, <16 x i8>* %tmp, align 16
   %add.ptr51 = getelementptr inbounds i8, i8* %row, i64 16
