Skip to content

Commit 5b7b009

Browse files
tlively authored and alexcrichton committed
[WebAssembly] Add shuffles as an option for lowering BUILD_VECTOR
When lowering a BUILD_VECTOR SDNode, we choose among various possible vector creation instructions in an attempt to minimize the total number of instructions used. We previously considered using swizzles, consts, and splats, and this patch adds shuffles as well.

A common pattern that now lowers to shuffles is when two 64-bit vectors are concatenated. Previously, concatenations generally lowered to sequences of extract_lane and replace_lane instructions when they could have been a single shuffle.

Differential Revision: https://reviews.llvm.org/D100018
1 parent ea6bb26 commit 5b7b009

File tree

3 files changed

+184
-6
lines changed

3 files changed

+184
-6
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 89 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1600,8 +1600,8 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
16001600
// TODO: Tune this. For example, lanewise swizzling is very expensive, so
16011601
// swizzled lanes should be given greater weight.
16021602

1603-
// TODO: Investigate building vectors by shuffling together vectors built by
1604-
// separately specialized means.
1603+
// TODO: Investigate looping rather than always extracting/replacing specific
1604+
// lanes to fill gaps.
16051605

16061606
auto IsConstant = [](const SDValue &V) {
16071607
return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP;
@@ -1632,12 +1632,30 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
16321632
return std::make_pair(SwizzleSrc, SwizzleIndices);
16331633
};
16341634

1635+
// If the lane is extracted from another vector at a constant index, return
1636+
// that vector. The source vector must not have more lanes than the dest
1637+
// because the shufflevector indices are in terms of the destination lanes and
1638+
// would not be able to address the smaller individual source lanes.
1639+
auto GetShuffleSrc = [&](const SDValue &Lane) {
1640+
if (Lane->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
1641+
return SDValue();
1642+
if (!isa<ConstantSDNode>(Lane->getOperand(1).getNode()))
1643+
return SDValue();
1644+
if (Lane->getOperand(0).getValueType().getVectorNumElements() >
1645+
VecT.getVectorNumElements())
1646+
return SDValue();
1647+
return Lane->getOperand(0);
1648+
};
1649+
16351650
using ValueEntry = std::pair<SDValue, size_t>;
16361651
SmallVector<ValueEntry, 16> SplatValueCounts;
16371652

16381653
using SwizzleEntry = std::pair<std::pair<SDValue, SDValue>, size_t>;
16391654
SmallVector<SwizzleEntry, 16> SwizzleCounts;
16401655

1656+
using ShuffleEntry = std::pair<SDValue, size_t>;
1657+
SmallVector<ShuffleEntry, 16> ShuffleCounts;
1658+
16411659
auto AddCount = [](auto &Counts, const auto &Val) {
16421660
auto CountIt =
16431661
llvm::find_if(Counts, [&Val](auto E) { return E.first == Val; });
@@ -1666,9 +1684,11 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
16661684

16671685
AddCount(SplatValueCounts, Lane);
16681686

1669-
if (IsConstant(Lane)) {
1687+
if (IsConstant(Lane))
16701688
NumConstantLanes++;
1671-
} else if (CanSwizzle) {
1689+
if (auto ShuffleSrc = GetShuffleSrc(Lane))
1690+
AddCount(ShuffleCounts, ShuffleSrc);
1691+
if (CanSwizzle) {
16721692
auto SwizzleSrcs = GetSwizzleSrcs(I, Lane);
16731693
if (SwizzleSrcs.first)
16741694
AddCount(SwizzleCounts, SwizzleSrcs);
@@ -1686,18 +1706,81 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
16861706
std::forward_as_tuple(std::tie(SwizzleSrc, SwizzleIndices),
16871707
NumSwizzleLanes) = GetMostCommon(SwizzleCounts);
16881708

1709+
// Shuffles can draw from up to two vectors, so find the two most common
1710+
// sources.
1711+
SDValue ShuffleSrc1, ShuffleSrc2;
1712+
size_t NumShuffleLanes = 0;
1713+
if (ShuffleCounts.size()) {
1714+
std::tie(ShuffleSrc1, NumShuffleLanes) = GetMostCommon(ShuffleCounts);
1715+
ShuffleCounts.erase(std::remove_if(ShuffleCounts.begin(),
1716+
ShuffleCounts.end(),
1717+
[&](const auto &Pair) {
1718+
return Pair.first == ShuffleSrc1;
1719+
}),
1720+
ShuffleCounts.end());
1721+
}
1722+
if (ShuffleCounts.size()) {
1723+
size_t AdditionalShuffleLanes;
1724+
std::tie(ShuffleSrc2, AdditionalShuffleLanes) =
1725+
GetMostCommon(ShuffleCounts);
1726+
NumShuffleLanes += AdditionalShuffleLanes;
1727+
}
1728+
16891729
// Predicate returning true if the lane is properly initialized by the
16901730
// original instruction
16911731
std::function<bool(size_t, const SDValue &)> IsLaneConstructed;
16921732
SDValue Result;
1693-
// Prefer swizzles over vector consts over splats
1694-
if (NumSwizzleLanes >= NumSplatLanes && NumSwizzleLanes >= NumConstantLanes) {
1733+
// Prefer swizzles over shuffles over vector consts over splats
1734+
if (NumSwizzleLanes >= NumShuffleLanes &&
1735+
NumSwizzleLanes >= NumConstantLanes && NumSwizzleLanes >= NumSplatLanes) {
16951736
Result = DAG.getNode(WebAssemblyISD::SWIZZLE, DL, VecT, SwizzleSrc,
16961737
SwizzleIndices);
16971738
auto Swizzled = std::make_pair(SwizzleSrc, SwizzleIndices);
16981739
IsLaneConstructed = [&, Swizzled](size_t I, const SDValue &Lane) {
16991740
return Swizzled == GetSwizzleSrcs(I, Lane);
17001741
};
1742+
} else if (NumShuffleLanes >= NumConstantLanes &&
1743+
NumShuffleLanes >= NumSplatLanes) {
1744+
size_t DestLaneSize = VecT.getVectorElementType().getFixedSizeInBits() / 8;
1745+
size_t DestLaneCount = VecT.getVectorNumElements();
1746+
size_t Scale1 = 1;
1747+
size_t Scale2 = 1;
1748+
SDValue Src1 = ShuffleSrc1;
1749+
SDValue Src2 = ShuffleSrc2 ? ShuffleSrc2 : DAG.getUNDEF(VecT);
1750+
if (Src1.getValueType() != VecT) {
1751+
size_t LaneSize =
1752+
Src1.getValueType().getVectorElementType().getFixedSizeInBits() / 8;
1753+
assert(LaneSize > DestLaneSize);
1754+
Scale1 = LaneSize / DestLaneSize;
1755+
Src1 = DAG.getBitcast(VecT, Src1);
1756+
}
1757+
if (Src2.getValueType() != VecT) {
1758+
size_t LaneSize =
1759+
Src2.getValueType().getVectorElementType().getFixedSizeInBits() / 8;
1760+
assert(LaneSize > DestLaneSize);
1761+
Scale2 = LaneSize / DestLaneSize;
1762+
Src2 = DAG.getBitcast(VecT, Src2);
1763+
}
1764+
1765+
int Mask[16];
1766+
assert(DestLaneCount <= 16);
1767+
for (size_t I = 0; I < DestLaneCount; ++I) {
1768+
const SDValue &Lane = Op->getOperand(I);
1769+
SDValue Src = GetShuffleSrc(Lane);
1770+
if (Src == ShuffleSrc1) {
1771+
Mask[I] = Lane->getConstantOperandVal(1) * Scale1;
1772+
} else if (Src && Src == ShuffleSrc2) {
1773+
Mask[I] = DestLaneCount + Lane->getConstantOperandVal(1) * Scale2;
1774+
} else {
1775+
Mask[I] = -1;
1776+
}
1777+
}
1778+
ArrayRef<int> MaskRef(Mask, DestLaneCount);
1779+
Result = DAG.getVectorShuffle(VecT, DL, Src1, Src2, MaskRef);
1780+
IsLaneConstructed = [&](size_t, const SDValue &Lane) {
1781+
auto Src = GetShuffleSrc(Lane);
1782+
return Src == ShuffleSrc1 || (Src && Src == ShuffleSrc2);
1783+
};
17011784
} else if (NumConstantLanes >= NumSplatLanes) {
17021785
SmallVector<SDValue, 16> ConstLanes;
17031786
for (const SDValue &Lane : Op->op_values()) {

llvm/test/CodeGen/WebAssembly/simd-build-vector.ll

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -165,6 +165,22 @@ define <8 x i16> @swizzle_one_i16x8(<8 x i16> %src, <8 x i16> %mask) {
165165
ret <8 x i16> %v0
166166
}
167167

168+
; CHECK-LABEL: half_shuffle_i32x4:
169+
; CHECK-NEXT: .functype half_shuffle_i32x4 (v128) -> (v128)
170+
; CHECK: i8x16.shuffle $push[[L0:[0-9]+]]=, $0, $0, 0, 0, 0, 0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 0, 0, 0
171+
; CHECK: i32x4.replace_lane
172+
; CHECK: i32x4.replace_lane
173+
; CHECK: return
174+
define <4 x i32> @half_shuffle_i32x4(<4 x i32> %src) {
175+
%s0 = extractelement <4 x i32> %src, i32 0
176+
%s2 = extractelement <4 x i32> %src, i32 2
177+
%v0 = insertelement <4 x i32> undef, i32 0, i32 0
178+
%v1 = insertelement <4 x i32> %v0, i32 %s2, i32 1
179+
%v2 = insertelement <4 x i32> %v1, i32 %s0, i32 2
180+
%v3 = insertelement <4 x i32> %v2, i32 3, i32 3
181+
ret <4 x i32> %v3
182+
}
183+
168184
; CHECK-LABEL: mashup_swizzle_i8x16:
169185
; CHECK-NEXT: .functype mashup_swizzle_i8x16 (v128, v128, i32) -> (v128)
170186
; CHECK-NEXT: i8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1
Lines changed: 79 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,79 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
3+
4+
; Check that all varieties of vector concatenations get lowered to shuffles.
5+
6+
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
7+
target triple = "wasm32-unknown--wasm"
8+
9+
define <16 x i8> @concat_v8i8(<8 x i8> %a, <8 x i8> %b) {
10+
; CHECK-LABEL: concat_v8i8:
11+
; CHECK: .functype concat_v8i8 (v128, v128) -> (v128)
12+
; CHECK-NEXT: # %bb.0:
13+
; CHECK-NEXT: local.get 0
14+
; CHECK-NEXT: local.get 1
15+
; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
16+
; CHECK-NEXT: # fallthrough-return
17+
%v = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
18+
ret <16 x i8> %v
19+
}
20+
21+
define <8 x i8> @concat_v4i8(<4 x i8> %a, <4 x i8> %b) {
22+
; CHECK-LABEL: concat_v4i8:
23+
; CHECK: .functype concat_v4i8 (v128, v128) -> (v128)
24+
; CHECK-NEXT: # %bb.0:
25+
; CHECK-NEXT: local.get 0
26+
; CHECK-NEXT: local.get 1
27+
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
28+
; CHECK-NEXT: # fallthrough-return
29+
%v = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
30+
ret <8 x i8> %v
31+
}
32+
33+
define <8 x i16> @concat_v4i16(<4 x i16> %a, <4 x i16> %b) {
34+
; CHECK-LABEL: concat_v4i16:
35+
; CHECK: .functype concat_v4i16 (v128, v128) -> (v128)
36+
; CHECK-NEXT: # %bb.0:
37+
; CHECK-NEXT: local.get 0
38+
; CHECK-NEXT: local.get 1
39+
; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
40+
; CHECK-NEXT: # fallthrough-return
41+
%v = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
42+
ret <8 x i16> %v
43+
}
44+
45+
define <4 x i8> @concat_v2i8(<2 x i8> %a, <2 x i8> %b) {
46+
; CHECK-LABEL: concat_v2i8:
47+
; CHECK: .functype concat_v2i8 (v128, v128) -> (v128)
48+
; CHECK-NEXT: # %bb.0:
49+
; CHECK-NEXT: local.get 0
50+
; CHECK-NEXT: local.get 1
51+
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
52+
; CHECK-NEXT: # fallthrough-return
53+
%v = shufflevector <2 x i8> %a, <2 x i8> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
54+
ret <4 x i8> %v
55+
}
56+
57+
define <4 x i16> @concat_v2i16(<2 x i16> %a, <2 x i16> %b) {
58+
; CHECK-LABEL: concat_v2i16:
59+
; CHECK: .functype concat_v2i16 (v128, v128) -> (v128)
60+
; CHECK-NEXT: # %bb.0:
61+
; CHECK-NEXT: local.get 0
62+
; CHECK-NEXT: local.get 1
63+
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
64+
; CHECK-NEXT: # fallthrough-return
65+
%v = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
66+
ret <4 x i16> %v
67+
}
68+
69+
define <4 x i32> @concat_v2i32(<2 x i32> %a, <2 x i32> %b) {
70+
; CHECK-LABEL: concat_v2i32:
71+
; CHECK: .functype concat_v2i32 (v128, v128) -> (v128)
72+
; CHECK-NEXT: # %bb.0:
73+
; CHECK-NEXT: local.get 0
74+
; CHECK-NEXT: local.get 1
75+
; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
76+
; CHECK-NEXT: # fallthrough-return
77+
%v = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
78+
ret <4 x i32> %v
79+
}

0 commit comments

Comments
 (0)