@@ -6290,6 +6290,122 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
6290
6290
return false ;
6291
6291
}
6292
6292
6293
+ // / Checks if the \p IE1 instructions is followed by \p IE2 instruction in the
6294
+ // / buildvector sequence.
6295
+ static bool isFirstInsertElement (const InsertElementInst *IE1,
6296
+ const InsertElementInst *IE2) {
6297
+ const auto *I1 = IE1;
6298
+ const auto *I2 = IE2;
6299
+ do {
6300
+ if (I2 == IE1)
6301
+ return true ;
6302
+ if (I1 == IE2)
6303
+ return false ;
6304
+ if (I1)
6305
+ I1 = dyn_cast<InsertElementInst>(I1->getOperand (0 ));
6306
+ if (I2)
6307
+ I2 = dyn_cast<InsertElementInst>(I2->getOperand (0 ));
6308
+ } while (I1 || I2);
6309
+ llvm_unreachable (" Two different buildvectors not expected." );
6310
+ }
6311
+
6312
+ // / Does the analysis of the provided shuffle masks and performs the requested
6313
+ // / actions on the vectors with the given shuffle masks. It tries to do it in
6314
+ // / several steps.
6315
+ // / 1. If the Base vector is not undef vector, resizing the very first mask to
6316
+ // / have common VF and perform action for 2 input vectors (including non-undef
6317
+ // / Base). Other shuffle masks are combined with the resulting after the 1 stage
6318
+ // / and processed as a shuffle of 2 elements.
6319
+ // / 2. If the Base is undef vector and have only 1 shuffle mask, perform the
6320
+ // / action only for 1 vector with the given mask, if it is not the identity
6321
+ // / mask.
6322
+ // / 3. If > 2 masks are used, perform the remaining shuffle actions for 2
6323
+ // / vectors, combing the masks properly between the steps.
6324
+ template <typename T>
6325
+ static T *performExtractsShuffleAction (
6326
+ MutableArrayRef<std::pair<T *, SmallVector<int >>> ShuffleMask, Value *Base,
6327
+ function_ref<unsigned (T *)> GetVF,
6328
+ function_ref<std::pair<T *, bool>(T *, ArrayRef<int >)> ResizeAction,
6329
+ function_ref<T *(ArrayRef<int >, ArrayRef<T *>)> Action) {
6330
+ assert (!ShuffleMask.empty () && " Empty list of shuffles for inserts." );
6331
+ SmallVector<int > Mask (ShuffleMask.begin ()->second );
6332
+ auto VMIt = std::next (ShuffleMask.begin ());
6333
+ T *Prev = nullptr ;
6334
+ bool IsBaseNotUndef = !isUndefVector (Base);
6335
+ if (IsBaseNotUndef) {
6336
+ // Base is not undef, need to combine it with the next subvectors.
6337
+ std::pair<T *, bool > Res = ResizeAction (ShuffleMask.begin ()->first , Mask);
6338
+ for (unsigned Idx = 0 , VF = Mask.size (); Idx < VF; ++Idx) {
6339
+ if (Mask[Idx] == UndefMaskElem)
6340
+ Mask[Idx] = Idx;
6341
+ else
6342
+ Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
6343
+ }
6344
+ Prev = Action (Mask, {nullptr , Res.first });
6345
+ } else if (ShuffleMask.size () == 1 ) {
6346
+ // Base is undef and only 1 vector is shuffled - perform the action only for
6347
+ // single vector, if the mask is not the identity mask.
6348
+ std::pair<T *, bool > Res = ResizeAction (ShuffleMask.begin ()->first , Mask);
6349
+ if (Res.second )
6350
+ // Identity mask is found.
6351
+ Prev = Res.first ;
6352
+ else
6353
+ Prev = Action (Mask, {ShuffleMask.begin ()->first });
6354
+ } else {
6355
+ // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
6356
+ // shuffles step by step, combining shuffle between the steps.
6357
+ unsigned Vec1VF = GetVF (ShuffleMask.begin ()->first );
6358
+ unsigned Vec2VF = GetVF (VMIt->first );
6359
+ if (Vec1VF == Vec2VF) {
6360
+ // No need to resize the input vectors since they are of the same size, we
6361
+ // can shuffle them directly.
6362
+ ArrayRef<int > SecMask = VMIt->second ;
6363
+ for (unsigned I = 0 , VF = Mask.size (); I < VF; ++I) {
6364
+ if (SecMask[I] != UndefMaskElem) {
6365
+ assert (Mask[I] == UndefMaskElem && " Multiple uses of scalars." );
6366
+ Mask[I] = SecMask[I] + Vec1VF;
6367
+ }
6368
+ }
6369
+ Prev = Action (Mask, {ShuffleMask.begin ()->first , VMIt->first });
6370
+ } else {
6371
+ // Vectors of different sizes - resize and reshuffle.
6372
+ std::pair<T *, bool > Res1 =
6373
+ ResizeAction (ShuffleMask.begin ()->first , Mask);
6374
+ std::pair<T *, bool > Res2 = ResizeAction (VMIt->first , VMIt->second );
6375
+ ArrayRef<int > SecMask = VMIt->second ;
6376
+ for (unsigned I = 0 , VF = Mask.size (); I < VF; ++I) {
6377
+ if (Mask[I] != UndefMaskElem) {
6378
+ assert (SecMask[I] == UndefMaskElem && " Multiple uses of scalars." );
6379
+ if (Res1.second )
6380
+ Mask[I] = I;
6381
+ } else if (SecMask[I] != UndefMaskElem) {
6382
+ assert (Mask[I] == UndefMaskElem && " Multiple uses of scalars." );
6383
+ Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
6384
+ }
6385
+ }
6386
+ Prev = Action (Mask, {Res1.first , Res2.first });
6387
+ }
6388
+ VMIt = std::next (VMIt);
6389
+ }
6390
+ // Perform requested actions for the remaining masks/vectors.
6391
+ for (auto E = ShuffleMask.end (); VMIt != E; ++VMIt) {
6392
+ // Shuffle other input vectors, if any.
6393
+ std::pair<T *, bool > Res = ResizeAction (VMIt->first , VMIt->second );
6394
+ ArrayRef<int > SecMask = VMIt->second ;
6395
+ for (unsigned I = 0 , VF = Mask.size (); I < VF; ++I) {
6396
+ if (SecMask[I] != UndefMaskElem) {
6397
+ assert ((Mask[I] == UndefMaskElem || IsBaseNotUndef) &&
6398
+ " Multiple uses of scalars." );
6399
+ Mask[I] = (Res.second ? I : SecMask[I]) + VF;
6400
+ } else if (Mask[I] != UndefMaskElem) {
6401
+ Mask[I] = I;
6402
+ }
6403
+ }
6404
+ Prev = Action (Mask, {Prev, Res.first });
6405
+ }
6406
+ return Prev;
6407
+ }
6408
+
6293
6409
InstructionCost BoUpSLP::getTreeCost (ArrayRef<Value *> VectorizedVals) {
6294
6410
InstructionCost Cost = 0 ;
6295
6411
LLVM_DEBUG (dbgs () << " SLP: Calculating cost for tree of size "
@@ -6310,9 +6426,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
6310
6426
6311
6427
SmallPtrSet<Value *, 16 > ExtractCostCalculated;
6312
6428
InstructionCost ExtractCost = 0 ;
6313
- SmallVector<unsigned > VF;
6314
- SmallVector<SmallVector<int >> ShuffleMask;
6315
- SmallVector<Value *> FirstUsers;
6429
+ SmallVector<MapVector<const TreeEntry *, SmallVector<int >>> ShuffleMasks;
6430
+ SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
6316
6431
SmallVector<APInt> DemandedElts;
6317
6432
for (ExternalUser &EU : ExternalUses) {
6318
6433
// We only add extract cost once for the same scalar.
@@ -6341,14 +6456,16 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
6341
6456
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType ())) {
6342
6457
Optional<unsigned > InsertIdx = getInsertIndex (VU);
6343
6458
if (InsertIdx) {
6344
- auto *It = find_if (FirstUsers, [VU](Value *V) {
6345
- return areTwoInsertFromSameBuildVector (VU,
6346
- cast<InsertElementInst>(V));
6347
- });
6459
+ const TreeEntry *ScalarTE = getTreeEntry (EU.Scalar );
6460
+ auto *It =
6461
+ find_if (FirstUsers,
6462
+ [VU](const std::pair<Value *, const TreeEntry *> &Pair) {
6463
+ return areTwoInsertFromSameBuildVector (
6464
+ VU, cast<InsertElementInst>(Pair.first ));
6465
+ });
6348
6466
int VecId = -1 ;
6349
6467
if (It == FirstUsers.end ()) {
6350
- VF.push_back (FTy->getNumElements ());
6351
- ShuffleMask.emplace_back (VF.back (), UndefMaskElem);
6468
+ (void )ShuffleMasks.emplace_back ();
6352
6469
// Find the insertvector, vectorized in tree, if any.
6353
6470
Value *Base = VU;
6354
6471
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
@@ -6357,21 +6474,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
6357
6474
VU = IEBase;
6358
6475
do {
6359
6476
int Idx = E->findLaneForValue (Base);
6360
- ShuffleMask.back ()[Idx] = Idx;
6477
+ SmallVectorImpl<int > &Mask = ShuffleMasks.back ()[ScalarTE];
6478
+ if (Mask.empty ())
6479
+ Mask.assign (FTy->getNumElements (), UndefMaskElem);
6480
+ Mask[Idx] = Idx;
6361
6481
Base = cast<InsertElementInst>(Base)->getOperand (0 );
6362
6482
} while (E == getTreeEntry (Base));
6363
6483
break ;
6364
6484
}
6365
6485
Base = cast<InsertElementInst>(Base)->getOperand (0 );
6366
6486
}
6367
- FirstUsers.push_back (VU);
6368
- DemandedElts.push_back (APInt::getZero (VF. back ()));
6487
+ FirstUsers.emplace_back (VU, ScalarTE );
6488
+ DemandedElts.push_back (APInt::getZero (FTy-> getNumElements ()));
6369
6489
VecId = FirstUsers.size () - 1 ;
6370
6490
} else {
6491
+ if (isFirstInsertElement (VU, cast<InsertElementInst>(It->first )))
6492
+ It->first = VU;
6371
6493
VecId = std::distance (FirstUsers.begin (), It);
6372
6494
}
6373
6495
int InIdx = *InsertIdx;
6374
- ShuffleMask[VecId][InIdx] = EU.Lane ;
6496
+ SmallVectorImpl<int > &Mask = ShuffleMasks[VecId][ScalarTE];
6497
+ if (Mask.empty ())
6498
+ Mask.assign (FTy->getNumElements (), UndefMaskElem);
6499
+ assert (Mask[InIdx] == UndefMaskElem &&
6500
+ " InsertElementInstruction used already." );
6501
+ Mask[InIdx] = EU.Lane ;
6375
6502
DemandedElts[VecId].setBit (InIdx);
6376
6503
continue ;
6377
6504
}
@@ -6398,89 +6525,75 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
6398
6525
6399
6526
InstructionCost SpillCost = getSpillCost ();
6400
6527
Cost += SpillCost + ExtractCost;
6401
- if (FirstUsers.size () == 1 ) {
6402
- int Limit = ShuffleMask.front ().size () * 2 ;
6403
- if (!all_of (ShuffleMask.front (),
6404
- [Limit](int Idx) { return Idx < Limit; }) ||
6405
- !ShuffleVectorInst::isIdentityMask (ShuffleMask.front ())) {
6406
- InstructionCost C = TTI->getShuffleCost (
6528
+ auto &&ResizeToVF = [this , &Cost](const TreeEntry *TE, ArrayRef<int > Mask) {
6529
+ InstructionCost C = 0 ;
6530
+ unsigned VF = Mask.size ();
6531
+ unsigned VecVF = TE->getVectorFactor ();
6532
+ if (VF != VecVF &&
6533
+ (any_of (Mask, [VF](int Idx) { return Idx >= static_cast <int >(VF); }) ||
6534
+ (all_of (Mask,
6535
+ [VF](int Idx) { return Idx < 2 * static_cast <int >(VF); }) &&
6536
+ !ShuffleVectorInst::isIdentityMask (Mask)))) {
6537
+ SmallVector<int > OrigMask (VecVF, UndefMaskElem);
6538
+ std::copy (Mask.begin (), std::next (Mask.begin (), std::min (VF, VecVF)),
6539
+ OrigMask.begin ());
6540
+ C = TTI->getShuffleCost (
6407
6541
TTI::SK_PermuteSingleSrc,
6408
- cast<FixedVectorType>(FirstUsers.front ()->getType ()),
6409
- ShuffleMask.front ());
6410
- LLVM_DEBUG (dbgs () << " SLP: Adding cost " << C
6411
- << " for final shuffle of insertelement external users "
6412
- << *VectorizableTree.front ()->Scalars .front () << " .\n "
6413
- << " SLP: Current total cost = " << Cost << " \n " );
6542
+ FixedVectorType::get (TE->getMainOp ()->getType (), VecVF), OrigMask);
6543
+ LLVM_DEBUG (
6544
+ dbgs () << " SLP: Adding cost " << C
6545
+ << " for final shuffle of insertelement external users.\n " ;
6546
+ TE->dump (); dbgs () << " SLP: Current total cost = " << Cost << " \n " );
6414
6547
Cost += C;
6548
+ return std::make_pair (TE, true );
6415
6549
}
6550
+ return std::make_pair (TE, false );
6551
+ };
6552
+ // Calculate the cost of the reshuffled vectors, if any.
6553
+ for (int I = 0 , E = FirstUsers.size (); I < E; ++I) {
6554
+ Value *Base = cast<Instruction>(FirstUsers[I].first )->getOperand (0 );
6555
+ unsigned VF = ShuffleMasks[I].begin ()->second .size ();
6556
+ auto *FTy = FixedVectorType::get (
6557
+ cast<VectorType>(FirstUsers[I].first ->getType ())->getElementType (), VF);
6558
+ auto Vector = ShuffleMasks[I].takeVector ();
6559
+ auto &&EstimateShufflesCost = [this , FTy,
6560
+ &Cost](ArrayRef<int > Mask,
6561
+ ArrayRef<const TreeEntry *> TEs) {
6562
+ assert ((TEs.size () == 1 || TEs.size () == 2 ) &&
6563
+ " Expected exactly 1 or 2 tree entries." );
6564
+ if (TEs.size () == 1 ) {
6565
+ int Limit = 2 * Mask.size ();
6566
+ if (!all_of (Mask, [Limit](int Idx) { return Idx < Limit; }) ||
6567
+ !ShuffleVectorInst::isIdentityMask (Mask)) {
6568
+ InstructionCost C =
6569
+ TTI->getShuffleCost (TTI::SK_PermuteSingleSrc, FTy, Mask);
6570
+ LLVM_DEBUG (dbgs () << " SLP: Adding cost " << C
6571
+ << " for final shuffle of insertelement "
6572
+ " external users.\n " ;
6573
+ TEs.front ()->dump ();
6574
+ dbgs () << " SLP: Current total cost = " << Cost << " \n " );
6575
+ Cost += C;
6576
+ }
6577
+ } else {
6578
+ InstructionCost C =
6579
+ TTI->getShuffleCost (TTI::SK_PermuteTwoSrc, FTy, Mask);
6580
+ LLVM_DEBUG (dbgs () << " SLP: Adding cost " << C
6581
+ << " for final shuffle of vector node and external "
6582
+ " insertelement users.\n " ;
6583
+ if (TEs.front ()) { TEs.front ()->dump (); } TEs.back ()->dump ();
6584
+ dbgs () << " SLP: Current total cost = " << Cost << " \n " );
6585
+ Cost += C;
6586
+ }
6587
+ return TEs.back ();
6588
+ };
6589
+ (void )performExtractsShuffleAction<const TreeEntry>(
6590
+ makeMutableArrayRef (Vector.data (), Vector.size ()), Base,
6591
+ [](const TreeEntry *E) { return E->getVectorFactor (); }, ResizeToVF,
6592
+ EstimateShufflesCost);
6416
6593
InstructionCost InsertCost = TTI->getScalarizationOverhead (
6417
- cast<FixedVectorType>(FirstUsers.front ()->getType ()),
6418
- DemandedElts.front (), /* Insert*/ true , /* Extract*/ false );
6419
- LLVM_DEBUG (dbgs () << " SLP: subtracting the cost " << InsertCost
6420
- << " for insertelements gather.\n "
6421
- << " SLP: Current total cost = " << Cost << " \n " );
6422
- Cost -= InsertCost;
6423
- } else if (FirstUsers.size () >= 2 ) {
6424
- unsigned MaxVF = *std::max_element (VF.begin (), VF.end ());
6425
- // Combined masks of the first 2 vectors.
6426
- SmallVector<int > CombinedMask (MaxVF, UndefMaskElem);
6427
- copy (ShuffleMask.front (), CombinedMask.begin ());
6428
- APInt CombinedDemandedElts = DemandedElts.front ().zextOrSelf (MaxVF);
6429
- auto *VecTy = FixedVectorType::get (
6430
- cast<VectorType>(FirstUsers.front ()->getType ())->getElementType (),
6431
- MaxVF);
6432
- for (int I = 0 , E = ShuffleMask[1 ].size (); I < E; ++I) {
6433
- if (ShuffleMask[1 ][I] != UndefMaskElem) {
6434
- CombinedMask[I] = ShuffleMask[1 ][I] + MaxVF;
6435
- CombinedDemandedElts.setBit (I);
6436
- }
6437
- }
6438
- InstructionCost C =
6439
- TTI->getShuffleCost (TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
6440
- LLVM_DEBUG (dbgs () << " SLP: Adding cost " << C
6441
- << " for final shuffle of vector node and external "
6442
- " insertelement users "
6443
- << *VectorizableTree.front ()->Scalars .front () << " .\n "
6444
- << " SLP: Current total cost = " << Cost << " \n " );
6445
- Cost += C;
6446
- InstructionCost InsertCost = TTI->getScalarizationOverhead (
6447
- VecTy, CombinedDemandedElts, /* Insert*/ true , /* Extract*/ false );
6448
- LLVM_DEBUG (dbgs () << " SLP: subtracting the cost " << InsertCost
6449
- << " for insertelements gather.\n "
6450
- << " SLP: Current total cost = " << Cost << " \n " );
6594
+ cast<FixedVectorType>(FirstUsers[I].first ->getType ()), DemandedElts[I],
6595
+ /* Insert*/ true , /* Extract*/ false );
6451
6596
Cost -= InsertCost;
6452
- for (int I = 2 , E = FirstUsers.size (); I < E; ++I) {
6453
- if (ShuffleMask[I].empty ())
6454
- continue ;
6455
- // Other elements - permutation of 2 vectors (the initial one and the
6456
- // next Ith incoming vector).
6457
- unsigned VF = ShuffleMask[I].size ();
6458
- for (unsigned Idx = 0 ; Idx < VF; ++Idx) {
6459
- int Mask = ShuffleMask[I][Idx];
6460
- if (Mask != UndefMaskElem)
6461
- CombinedMask[Idx] = MaxVF + Mask;
6462
- else if (CombinedMask[Idx] != UndefMaskElem)
6463
- CombinedMask[Idx] = Idx;
6464
- }
6465
- for (unsigned Idx = VF; Idx < MaxVF; ++Idx)
6466
- if (CombinedMask[Idx] != UndefMaskElem)
6467
- CombinedMask[Idx] = Idx;
6468
- InstructionCost C =
6469
- TTI->getShuffleCost (TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
6470
- LLVM_DEBUG (dbgs () << " SLP: Adding cost " << C
6471
- << " for final shuffle of vector node and external "
6472
- " insertelement users "
6473
- << *VectorizableTree.front ()->Scalars .front () << " .\n "
6474
- << " SLP: Current total cost = " << Cost << " \n " );
6475
- Cost += C;
6476
- InstructionCost InsertCost = TTI->getScalarizationOverhead (
6477
- cast<FixedVectorType>(FirstUsers[I]->getType ()), DemandedElts[I],
6478
- /* Insert*/ true , /* Extract*/ false );
6479
- LLVM_DEBUG (dbgs () << " SLP: subtracting the cost " << InsertCost
6480
- << " for insertelements gather.\n "
6481
- << " SLP: Current total cost = " << Cost << " \n " );
6482
- Cost -= InsertCost;
6483
- }
6484
6597
}
6485
6598
6486
6599
#ifndef NDEBUG
0 commit comments