@@ -45,12 +45,6 @@ static cl::opt<bool> WidenLoads(
  cl::ReallyHidden,
  cl::init(false));

- static cl::opt<bool> Widen16BitOps(
-     "amdgpu-codegenprepare-widen-16-bit-ops",
-     cl::desc(
-         "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
-     cl::ReallyHidden, cl::init(false));
-
static cl::opt<bool>
    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),
@@ -150,18 +144,6 @@ class AMDGPUCodeGenPrepareImpl

  bool canBreakPHINode(const PHINode &I);

-   /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
-   /// binary operation \p V.
-   ///
-   /// \returns Binary operation \p V.
-   /// \returns \p T's base element bit width.
-   unsigned getBaseElementBitWidth(const Type *T) const;
-
-   /// \returns Equivalent 32 bit integer type for given type \p T. For example,
-   /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
-   /// is returned.
-   Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
-
  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;
@@ -170,10 +152,6 @@ class AMDGPUCodeGenPrepareImpl
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

-   /// \returns True if type \p T needs to be promoted to 32 bit integer type,
-   /// false otherwise.
-   bool needsPromotionToI32(const Type *T) const;
-
  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

@@ -188,52 +166,6 @@ class AMDGPUCodeGenPrepareImpl
           computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

-   /// Promotes uniform binary operation \p I to equivalent 32 bit binary
-   /// operation.
-   ///
-   /// \details \p I's base element bit width must be greater than 1 and less
-   /// than or equal 16. Promotion is done by sign or zero extending operands to
-   /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
-   /// truncating the result of 32 bit binary operation back to \p I's original
-   /// type. Division operation is not promoted.
-   ///
-   /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
-   /// false otherwise.
-   bool promoteUniformOpToI32(BinaryOperator &I) const;
-
-   /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
-   ///
-   /// \details \p I's base element bit width must be greater than 1 and less
-   /// than or equal 16. Promotion is done by sign or zero extending operands to
-   /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
-   ///
-   /// \returns True.
-   bool promoteUniformOpToI32(ICmpInst &I) const;
-
-   /// Promotes uniform 'select' operation \p I to 32 bit 'select'
-   /// operation.
-   ///
-   /// \details \p I's base element bit width must be greater than 1 and less
-   /// than or equal 16. Promotion is done by sign or zero extending operands to
-   /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
-   /// result of 32 bit 'select' operation back to \p I's original type.
-   ///
-   /// \returns True.
-   bool promoteUniformOpToI32(SelectInst &I) const;
-
-   /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
-   /// intrinsic.
-   ///
-   /// \details \p I's base element bit width must be greater than 1 and less
-   /// than or equal 16. Promotion is done by zero extending the operand to 32
-   /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
-   /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
-   /// shift amount is 32 minus \p I's base element bit width), and truncating
-   /// the result of the shift operation back to \p I's original type.
-   ///
-   /// \returns True.
-   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
-
  /// \returns The minimum number of bits needed to store the value of \Op as an
  /// unsigned integer. Truncating to this size and then zero-extending to
  /// the original will not change the value.
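The declarations removed in this hunk described the old widening scheme: extend the operands of a uniform sub-32-bit operation to i32, redo the operation at 32 bits, and truncate back, with divisions and remainders left alone. A minimal LLVM IR sketch of that rewrite for a uniform i16 add (value names are hypothetical, not taken from this patch):

  %a32 = zext i16 %a to i32          ; sign- vs. zero-extension chosen by isSigned()
  %b32 = zext i16 %b to i32
  %r32 = add nuw nsw i32 %a32, %b32  ; wrap flags per promotedOpIsNSW/promotedOpIsNUW
  %r   = trunc i32 %r32 to i16       ; replaces the original 'add i16 %a, %b'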
@@ -320,13 +252,11 @@ class AMDGPUCodeGenPrepareImpl
  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
-   bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);
  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
-   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
  bool visitFMinLike(IntrinsicInst &I);
  bool visitSqrt(IntrinsicInst &I);
  bool run();
@@ -380,22 +310,6 @@ bool AMDGPUCodeGenPrepareImpl::run() {
  return MadeChange;
}

- unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
-   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
-
-   if (T->isIntegerTy())
-     return T->getIntegerBitWidth();
-   return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
- }
-
- Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
-   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
-
-   if (T->isIntegerTy())
-     return B.getInt32Ty();
-   return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
- }
-
bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
@@ -406,59 +320,11 @@ bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
         cast<ICmpInst>(I.getOperand(0))->isSigned();
}

- bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
-   if (!Widen16BitOps)
-     return false;
-
-   const IntegerType *IntTy = dyn_cast<IntegerType>(T);
-   if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
-     return true;
-
-   if (const VectorType *VT = dyn_cast<VectorType>(T)) {
-     // TODO: The set of packed operations is more limited, so may want to
-     // promote some anyway.
-     if (ST.hasVOP3PInsts())
-       return false;
-
-     return needsPromotionToI32(VT->getElementType());
-   }
-
-   return false;
- }
-
bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
  return Ty->isFloatTy() || Ty->isDoubleTy() ||
         (Ty->isHalfTy() && ST.has16BitInsts());
}

- // Return true if the op promoted to i32 should have nsw set.
- static bool promotedOpIsNSW(const Instruction &I) {
-   switch (I.getOpcode()) {
-   case Instruction::Shl:
-   case Instruction::Add:
-   case Instruction::Sub:
-     return true;
-   case Instruction::Mul:
-     return I.hasNoUnsignedWrap();
-   default:
-     return false;
-   }
- }
-
- // Return true if the op promoted to i32 should have nuw set.
- static bool promotedOpIsNUW(const Instruction &I) {
-   switch (I.getOpcode()) {
-   case Instruction::Shl:
-   case Instruction::Add:
-   case Instruction::Mul:
-     return true;
-   case Instruction::Sub:
-     return I.hasNoUnsignedWrap();
-   default:
-     return false;
-   }
- }
-
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  int TySize = DL.getTypeSizeInBits(Ty);
@@ -467,134 +333,6 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
}

- bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
-   assert(needsPromotionToI32(I.getType()) &&
-          "I does not need promotion to i32");
-
-   if (I.getOpcode() == Instruction::SDiv ||
-       I.getOpcode() == Instruction::UDiv ||
-       I.getOpcode() == Instruction::SRem ||
-       I.getOpcode() == Instruction::URem)
-     return false;
-
-   IRBuilder<> Builder(&I);
-   Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-   Type *I32Ty = getI32Ty(Builder, I.getType());
-   Value *ExtOp0 = nullptr;
-   Value *ExtOp1 = nullptr;
-   Value *ExtRes = nullptr;
-   Value *TruncRes = nullptr;
-
-   if (isSigned(I)) {
-     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
-     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
-   } else {
-     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
-     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
-   }
-
-   ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
-   if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
-     if (promotedOpIsNSW(cast<Instruction>(I)))
-       Inst->setHasNoSignedWrap();
-
-     if (promotedOpIsNUW(cast<Instruction>(I)))
-       Inst->setHasNoUnsignedWrap();
-
-     if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
-       Inst->setIsExact(ExactOp->isExact());
-   }
-
-   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
-
-   I.replaceAllUsesWith(TruncRes);
-   I.eraseFromParent();
-
-   return true;
- }
-
- bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
-   assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
-          "I does not need promotion to i32");
-
-   IRBuilder<> Builder(&I);
-   Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-   Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
-   Value *ExtOp0 = nullptr;
-   Value *ExtOp1 = nullptr;
-   Value *NewICmp = nullptr;
-
-   if (I.isSigned()) {
-     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
-     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
-   } else {
-     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
-     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
-   }
-   NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
-
-   I.replaceAllUsesWith(NewICmp);
-   I.eraseFromParent();
-
-   return true;
- }
-
- bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
-   assert(needsPromotionToI32(I.getType()) &&
-          "I does not need promotion to i32");
-
-   IRBuilder<> Builder(&I);
-   Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-   Type *I32Ty = getI32Ty(Builder, I.getType());
-   Value *ExtOp1 = nullptr;
-   Value *ExtOp2 = nullptr;
-   Value *ExtRes = nullptr;
-   Value *TruncRes = nullptr;
-
-   if (isSigned(I)) {
-     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
-     ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
-   } else {
-     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
-     ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
-   }
-   ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
-   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
-
-   I.replaceAllUsesWith(TruncRes);
-   I.eraseFromParent();
-
-   return true;
- }
-
- bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
-     IntrinsicInst &I) const {
-   assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
-          "I must be bitreverse intrinsic");
-   assert(needsPromotionToI32(I.getType()) &&
-          "I does not need promotion to i32");
-
-   IRBuilder<> Builder(&I);
-   Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-   Type *I32Ty = getI32Ty(Builder, I.getType());
-   Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
-   Value *ExtRes =
-       Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
-   Value *LShrOp =
-       Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
-   Value *TruncRes =
-       Builder.CreateTrunc(LShrOp, I.getType());
-
-   I.replaceAllUsesWith(TruncRes);
-   I.eraseFromParent();
-
-   return true;
- }
-
unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
  return computeKnownBits(Op, DL, AC).countMaxActiveBits();
}
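The bitreverse variant removed above needed one extra step beyond extend/operate/truncate: after reversing the zero-extended value at 32 bits, the interesting bits end up in the high part, so the result was shifted right by 32 minus the original bit width before truncating. A rough IR sketch for a uniform i16 input (hypothetical value names, not from this patch):

  %x32 = zext i16 %x to i32
  %rev = call i32 @llvm.bitreverse.i32(i32 %x32)
  %shr = lshr i32 %rev, 16            ; 32 - 16: moves the reversed bits back down
  %r   = trunc i32 %shr to i16        ; replaces 'call i16 @llvm.bitreverse.i16(i16 %x)'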
@@ -1635,10 +1373,6 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

-   if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
-       UA.isUniform(&I) && promoteUniformOpToI32(I))
-     return true;
-
  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;
  if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
@@ -1770,29 +1504,13 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  return false;
}

- bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
-   bool Changed = false;
-
-   if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
-       UA.isUniform(&I))
-     Changed |= promoteUniformOpToI32(I);
-
-   return Changed;
- }
-
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  Value *Cond = I.getCondition();
  Value *TrueVal = I.getTrueValue();
  Value *FalseVal = I.getFalseValue();
  Value *CmpVal;
  CmpPredicate Pred;

-   if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) {
-     if (UA.isUniform(&I))
-       return promoteUniformOpToI32(I);
-     return false;
-   }
-
  // Match fract pattern with nan check.
  if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
    return false;
@@ -2196,8 +1914,6 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {

bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
-   case Intrinsic::bitreverse:
-     return visitBitreverseIntrinsicInst(I);
  case Intrinsic::minnum:
  case Intrinsic::minimumnum:
  case Intrinsic::minimum:
@@ -2209,16 +1925,6 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  }
}

- bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
-   bool Changed = false;
-
-   if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
-       UA.isUniform(&I))
-     Changed |= promoteUniformBitreverseToI32(I);
-
-   return Changed;
- }
-
/// Match non-nan fract pattern.
///   minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
///   minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))