Commit dd8e1ad

[X86] LowerShift - track the number and location of constant shift elements. (llvm#120270)
We have several vector shift lowering strategies that have to analyse the distribution of non-uniform constant vector shift amounts, but at the moment there is very little sharing of data between these analyses.

This patch creates a SmallDenseMap of the different LEGAL constant shift amounts used, with a mask of which elements they are used in. So far I've only updated the shuffle(immshift(x,c1),immshift(x,c2)) lowering pattern to use it for clarity; there are several more that can be done in followups. It's hoped that the proposed patch llvm#117980 can be simplified after this patch as well.

vec_shift6.ll - the existing shuffle(immshift(x,c1),immshift(x,c2)) lowering bails on out-of-range shift amounts, while this patch now skips them and treats them as UNDEF - this means we manage to fold more cases that would previously have had to lower to a SHL->MUL pattern, including some legalized cases.
1 parent 9826201 commit dd8e1ad
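
As a rough standalone illustration of the approach (a minimal sketch, assuming plain std::map and 64-bit lane bitmasks in place of LLVM's SmallDenseMap and APInt; the names and values here are illustrative, not the patch's code):

#include <cstdint>
#include <cstdio>
#include <map>

// Minimal sketch of the UniqueCstAmt idea: record each distinct in-range
// constant shift amount together with a bitmask of the lanes that use it.
// Undef lanes (modelled as -1 here) and out-of-range amounts are skipped,
// mirroring the patch's treatment of them as UNDEF.
int main() {
  const unsigned EltSizeInBits = 32;
  const int Amt[] = {1, 1, 2, 2}; // per-lane constant shift amounts
  const unsigned NumElts = 4;

  std::map<unsigned, uint64_t> UniqueCstAmt; // amount -> lane bitmask
  for (unsigned I = 0; I != NumElts; ++I) {
    if (Amt[I] < 0 || (unsigned)Amt[I] >= EltSizeInBits)
      continue; // undef or illegal lane
    UniqueCstAmt[(unsigned)Amt[I]] |= (uint64_t)1 << I;
  }

  // Exactly two distinct amounts: lowerable as two immediate shifts + blend.
  if (UniqueCstAmt.size() == 2)
    for (const auto &[CstAmt, Mask] : UniqueCstAmt)
      std::printf("shift by %u covers lane mask 0x%llx\n", CstAmt,
                  (unsigned long long)Mask);
  return 0;
}

With exactly two distinct amounts, the shift is lowered as two immediate shifts whose results are blended lane-by-lane, which is the shuffle(immshift(x,c1),immshift(x,c2)) pattern updated below.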

4 files changed (+85, -74 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 36 additions & 33 deletions
@@ -30057,6 +30057,23 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
   }
 
+  // Build a map of inrange constant amounts with element mask where they occur.
+  SmallDenseMap<unsigned, APInt, 16> UniqueCstAmt;
+  if (ConstantAmt) {
+    for (unsigned I = 0; I != NumElts; ++I) {
+      SDValue A = Amt.getOperand(I);
+      if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
+        continue;
+      unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
+      if (UniqueCstAmt.count(CstAmt)) {
+        UniqueCstAmt[CstAmt].setBit(I);
+        continue;
+      }
+      UniqueCstAmt[CstAmt] = APInt::getOneBitSet(NumElts, I);
+    }
+    assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
+  }
+
   // If possible, lower this shift as a sequence of two shifts by
   // constant plus a BLENDing shuffle instead of scalarizing it.
   // Example:
@@ -30067,45 +30084,31 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   //
   // The advantage is that the two shifts from the example would be
   // lowered as X86ISD::VSRLI nodes in parallel before blending.
-  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
-                      (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
-    SDValue Amt1, Amt2;
-    SmallVector<int, 8> ShuffleMask;
-    for (unsigned i = 0; i != NumElts; ++i) {
-      SDValue A = Amt->getOperand(i);
-      if (A.isUndef()) {
-        ShuffleMask.push_back(SM_SentinelUndef);
-        continue;
-      }
-      if (!Amt1 || Amt1 == A) {
-        ShuffleMask.push_back(i);
-        Amt1 = A;
-        continue;
-      }
-      if (!Amt2 || Amt2 == A) {
-        ShuffleMask.push_back(i + NumElts);
-        Amt2 = A;
-        continue;
-      }
-      break;
+  if (UniqueCstAmt.size() == 2 &&
+      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+       (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
+    unsigned AmtA = UniqueCstAmt.begin()->first;
+    unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
+    const APInt &MaskA = UniqueCstAmt.begin()->second;
+    const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
+    SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
+    for (unsigned I = 0; I != NumElts; ++I) {
+      if (MaskA[I])
+        ShuffleMask[I] = I;
+      if (MaskB[I])
+        ShuffleMask[I] = I + NumElts;
     }
 
     // Only perform this blend if we can perform it without loading a mask.
-    if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
-        (VT != MVT::v16i16 ||
+    if ((VT != MVT::v16i16 ||
          is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
         (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
          canWidenShuffleElements(ShuffleMask))) {
-      auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
-      auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
-      if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
-          Cst2->getAPIntValue().ult(EltSizeInBits)) {
-        SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
-                                                    Cst1->getZExtValue(), DAG);
-        SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
-                                                    Cst2->getZExtValue(), DAG);
-        return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
-      }
+      SDValue Shift1 =
+          getTargetVShiftByConstNode(X86OpcI, dl, VT, R, AmtA, DAG);
+      SDValue Shift2 =
+          getTargetVShiftByConstNode(X86OpcI, dl, VT, R, AmtB, DAG);
+      return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
     }
   }

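To make the new mask-to-shuffle step concrete, here is a hedged standalone sketch (std::vector and plain bitmasks instead of SmallVector and APInt; the MaskA/MaskB values correspond to the test2 case in vec_shift6.ll below):

#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone sketch (not the LLVM code): lanes selected by MaskA take
// element I from the first shifted vector, lanes selected by MaskB take
// element I + NumElts from the second; untouched lanes stay undef (-1).
int main() {
  const unsigned NumElts = 8;
  const int SentinelUndef = -1;      // stand-in for SM_SentinelUndef
  const uint64_t MaskA = 0b00001101; // lanes shifted by the first amount
  const uint64_t MaskB = 0b10010000; // lanes shifted by the second amount

  std::vector<int> ShuffleMask(NumElts, SentinelUndef);
  for (unsigned I = 0; I != NumElts; ++I) {
    if (MaskA & ((uint64_t)1 << I))
      ShuffleMask[I] = (int)I;             // pick from shift-by-AmtA result
    if (MaskB & ((uint64_t)1 << I))
      ShuffleMask[I] = (int)(I + NumElts); // pick from shift-by-AmtB result
  }

  for (int M : ShuffleMask)
    std::printf("%d ", M); // prints: 0 -1 2 3 12 -1 -1 15
  std::printf("\n");
  return 0;
}

Note that SmallDenseMap iteration order is unspecified, but that is harmless here: each amount travels with its own lane mask, so swapping AmtA and AmtB merely swaps the two shuffle operands.
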
llvm/test/CodeGen/X86/vec_shift6.ll

Lines changed: 25 additions & 12 deletions
@@ -22,15 +22,27 @@ define <8 x i16> @test1(<8 x i16> %a) {
   ret <8 x i16> %shl
 }
 
+; Only two legal shift amounts, so we can lower to shuffle(psllw(),psllw())
+
 define <8 x i16> @test2(<8 x i16> %a) {
-; SSE-LABEL: test2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,1,1,2,u,u,2]
-; SSE-NEXT:    retq
+; SSE2-LABEL: test2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test2:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllw $1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,1,1,2,u,u,2]
+; AVX-NEXT:    vpsllw $1, %xmm0, %xmm1
+; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX-NEXT:    retq
   %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
   ret <8 x i16> %shl
@@ -43,17 +55,18 @@ define <8 x i16> @test2(<8 x i16> %a) {
 define <4 x i32> @test3(<4 x i32> %a) {
 ; SSE2-LABEL: test3:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pslld $1, %xmm1
+; SSE2-NEXT:    pslld $2, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test3:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pslld $2, %xmm1
+; SSE41-NEXT:    pslld $1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test3:
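
Worked through as a sketch of the reasoning (plain C++, not the LLVM code): test2's amounts are <i16 0, undef, 0, 0, 1, undef, -1, 1>, and as an i16 lane value -1 wraps to 65535, which is >= the 16-bit element width, so the new code skips that lane as UNDEF instead of bailing into the SHL->MUL fallback, leaving just the two legal amounts 0 and 1.

#include <cstdint>
#include <cstdio>

// Sketch: classify test2's per-lane i16 shift amounts.
// INT32_MIN stands in for an undef lane.
int main() {
  const int32_t Amt[8] = {0, INT32_MIN, 0, 0, 1, INT32_MIN, -1, 1};
  const unsigned EltSizeInBits = 16;
  for (unsigned I = 0; I != 8; ++I) {
    if (Amt[I] == INT32_MIN) {
      std::printf("lane %u: undef, skipped\n", I);
      continue;
    }
    uint16_t A = (uint16_t)Amt[I]; // i16 lane value; -1 wraps to 65535
    if (A >= EltSizeInBits)
      std::printf("lane %u: amount %u out of range, treated as UNDEF\n", I,
                  (unsigned)A);
    else
      std::printf("lane %u: legal amount %u\n", I, (unsigned)A);
  }
  return 0;
}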

llvm/test/CodeGen/X86/vector-fshl-sub128.ll

Lines changed: 2 additions & 2 deletions
@@ -337,7 +337,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psrld $27, %xmm2
 ; SSE41-NEXT:    psrld $28, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    retq
@@ -346,7 +346,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpsrld $27, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq

llvm/test/CodeGen/X86/vector-fshr-sub128.ll

Lines changed: 22 additions & 27 deletions
@@ -379,16 +379,11 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE2-NEXT:    psrld $4, %xmm3
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
-; SSE2-NEXT:    movl $268435456, %eax # imm = 0x10000000
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    pslld $28, %xmm0
+; SSE2-NEXT:    pslld $27, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_funnnel_v2i32:
@@ -400,7 +395,10 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE41-NEXT:    psrld $4, %xmm3
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pslld $27, %xmm1
+; SSE41-NEXT:    pslld $28, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -411,7 +409,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpsrld $4, %xmm1, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpslld $27, %xmm0, %xmm2
+; AVX1-NEXT:    vpslld $28, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -482,22 +482,17 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ;
 ; X86-SSE2-LABEL: constant_funnnel_v2i32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT:    psrld $5, %xmm3
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT:    psrld $4, %xmm2
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
-; X86-SSE2-NEXT:    movl $268435456, %eax # imm = 0x10000000
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrld $5, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    psrld $4, %xmm3
+; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X86-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    pslld $28, %xmm0
+; X86-SSE2-NEXT:    pslld $27, %xmm1
+; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT:    por %xmm3, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>)
   ret <2 x i32> %res
