
Commit 29e14c3

[AMDGPU] Remove widen-16-bit-ops from CGP (#145483)
This was already off by default so there is no codegen change.
1 parent 3e4153c commit 29e14c3
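
For context: the removed flag gated a CodeGenPrepare rewrite of uniform 16-bit integer operations into 32-bit ones (extend the operands, perform the operation in 32 bits, truncate the result). A minimal standalone C++ sketch of why that rewrite preserves the 16-bit result, shown here for an unsigned add; this is an illustration only, not code from the patch:

#include <cassert>
#include <cstdint>

int main() {
  // Emulate the deleted promotion on an add that wraps in 16 bits.
  uint16_t A = 0xFFF0, B = 0x0020;

  uint16_t Narrow16 = static_cast<uint16_t>(A + B);  // plain 16-bit add, wraps to 0x0010
  uint32_t Wide32 = uint32_t(A) + uint32_t(B);       // "promoted" 32-bit add, 0x00010010
  uint16_t Trunc16 = static_cast<uint16_t>(Wide32);  // truncate back to 16 bits

  assert(Narrow16 == 0x0010 && Trunc16 == Narrow16); // same result either way
  return 0;
}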

File tree

4 files changed: +8, -3155 lines


llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 0 additions & 294 deletions
@@ -45,12 +45,6 @@ static cl::opt<bool> WidenLoads(
     cl::ReallyHidden,
     cl::init(false));
 
-static cl::opt<bool> Widen16BitOps(
-    "amdgpu-codegenprepare-widen-16-bit-ops",
-    cl::desc(
-        "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
-    cl::ReallyHidden, cl::init(false));
-
 static cl::opt<bool>
     BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                    cl::desc("Break large PHI nodes for DAGISel"),
@@ -150,18 +144,6 @@ class AMDGPUCodeGenPrepareImpl
 
   bool canBreakPHINode(const PHINode &I);
 
-  /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
-  /// binary operation \p V.
-  ///
-  /// \returns Binary operation \p V.
-  /// \returns \p T's base element bit width.
-  unsigned getBaseElementBitWidth(const Type *T) const;
-
-  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
-  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
-  /// is returned.
-  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
-
   /// \returns True if binary operation \p I is a signed binary operation, false
   /// otherwise.
   bool isSigned(const BinaryOperator &I) const;
@@ -170,10 +152,6 @@ class AMDGPUCodeGenPrepareImpl
   /// signed 'icmp' operation, false otherwise.
   bool isSigned(const SelectInst &I) const;
 
-  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
-  /// false otherwise.
-  bool needsPromotionToI32(const Type *T) const;
-
   /// Return true if \p T is a legal scalar floating point type.
   bool isLegalFloatingTy(const Type *T) const;
 
@@ -188,52 +166,6 @@ class AMDGPUCodeGenPrepareImpl
     computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
   }
 
-  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
-  /// operation.
-  ///
-  /// \details \p I's base element bit width must be greater than 1 and less
-  /// than or equal 16. Promotion is done by sign or zero extending operands to
-  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
-  /// truncating the result of 32 bit binary operation back to \p I's original
-  /// type. Division operation is not promoted.
-  ///
-  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
-  /// false otherwise.
-  bool promoteUniformOpToI32(BinaryOperator &I) const;
-
-  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
-  ///
-  /// \details \p I's base element bit width must be greater than 1 and less
-  /// than or equal 16. Promotion is done by sign or zero extending operands to
-  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
-  ///
-  /// \returns True.
-  bool promoteUniformOpToI32(ICmpInst &I) const;
-
-  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
-  /// operation.
-  ///
-  /// \details \p I's base element bit width must be greater than 1 and less
-  /// than or equal 16. Promotion is done by sign or zero extending operands to
-  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
-  /// result of 32 bit 'select' operation back to \p I's original type.
-  ///
-  /// \returns True.
-  bool promoteUniformOpToI32(SelectInst &I) const;
-
-  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
-  /// intrinsic.
-  ///
-  /// \details \p I's base element bit width must be greater than 1 and less
-  /// than or equal 16. Promotion is done by zero extending the operand to 32
-  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
-  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
-  /// shift amount is 32 minus \p I's base element bit width), and truncating
-  /// the result of the shift operation back to \p I's original type.
-  ///
-  /// \returns True.
-  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
-
   /// \returns The minimum number of bits needed to store the value of \Op as an
   /// unsigned integer. Truncating to this size and then zero-extending to
   /// the original will not change the value.
@@ -320,13 +252,11 @@ class AMDGPUCodeGenPrepareImpl
   bool visitInstruction(Instruction &I) { return false; }
   bool visitBinaryOperator(BinaryOperator &I);
   bool visitLoadInst(LoadInst &I);
-  bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
   bool visitPHINode(PHINode &I);
   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
-  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
   bool visitSqrt(IntrinsicInst &I);
   bool run();
@@ -380,22 +310,6 @@ bool AMDGPUCodeGenPrepareImpl::run() {
   return MadeChange;
 }
 
-unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
-  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
-
-  if (T->isIntegerTy())
-    return T->getIntegerBitWidth();
-  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
-}
-
-Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
-  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
-
-  if (T->isIntegerTy())
-    return B.getInt32Ty();
-  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
-}
-
 bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
   return I.getOpcode() == Instruction::AShr ||
          I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
@@ -406,59 +320,11 @@ bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
          cast<ICmpInst>(I.getOperand(0))->isSigned();
 }
 
-bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
-  if (!Widen16BitOps)
-    return false;
-
-  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
-  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
-    return true;
-
-  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
-    // TODO: The set of packed operations is more limited, so may want to
-    // promote some anyway.
-    if (ST.hasVOP3PInsts())
-      return false;
-
-    return needsPromotionToI32(VT->getElementType());
-  }
-
-  return false;
-}
-
 bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
   return Ty->isFloatTy() || Ty->isDoubleTy() ||
          (Ty->isHalfTy() && ST.has16BitInsts());
 }
 
-// Return true if the op promoted to i32 should have nsw set.
-static bool promotedOpIsNSW(const Instruction &I) {
-  switch (I.getOpcode()) {
-  case Instruction::Shl:
-  case Instruction::Add:
-  case Instruction::Sub:
-    return true;
-  case Instruction::Mul:
-    return I.hasNoUnsignedWrap();
-  default:
-    return false;
-  }
-}
-
-// Return true if the op promoted to i32 should have nuw set.
-static bool promotedOpIsNUW(const Instruction &I) {
-  switch (I.getOpcode()) {
-  case Instruction::Shl:
-  case Instruction::Add:
-  case Instruction::Mul:
-    return true;
-  case Instruction::Sub:
-    return I.hasNoUnsignedWrap();
-  default:
-    return false;
-  }
-}
-
 bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
   Type *Ty = I.getType();
   int TySize = DL.getTypeSizeInBits(Ty);
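
An aside on the deleted promotedOpIsNSW/promotedOpIsNUW helpers: for an add whose operands were zero-extended from at most 16 bits, the 32-bit sum is bounded by 2 * (2^16 - 1), so it can wrap neither as a signed nor as an unsigned 32-bit value, which is (on my reading) why the promoted add could carry both nsw and nuw. A tiny standalone check of that bound, purely illustrative and not code from the patch:

#include <cassert>
#include <cstdint>

int main() {
  // Worst case for an add of two operands zero-extended from i16.
  uint64_t MaxSum = uint64_t(0xFFFF) + uint64_t(0xFFFF); // 0x1FFFE
  assert(MaxSum <= 0x7FFFFFFF); // fits in a signed 32-bit value, so nsw holds
  assert(MaxSum <= 0xFFFFFFFF); // fits in an unsigned 32-bit value, so nuw holds
  return 0;
}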
@@ -467,134 +333,6 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
   return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
 }
 
-bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
-  assert(needsPromotionToI32(I.getType()) &&
-         "I does not need promotion to i32");
-
-  if (I.getOpcode() == Instruction::SDiv ||
-      I.getOpcode() == Instruction::UDiv ||
-      I.getOpcode() == Instruction::SRem ||
-      I.getOpcode() == Instruction::URem)
-    return false;
-
-  IRBuilder<> Builder(&I);
-  Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-  Type *I32Ty = getI32Ty(Builder, I.getType());
-  Value *ExtOp0 = nullptr;
-  Value *ExtOp1 = nullptr;
-  Value *ExtRes = nullptr;
-  Value *TruncRes = nullptr;
-
-  if (isSigned(I)) {
-    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
-    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
-  } else {
-    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
-    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
-  }
-
-  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
-  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
-    if (promotedOpIsNSW(cast<Instruction>(I)))
-      Inst->setHasNoSignedWrap();
-
-    if (promotedOpIsNUW(cast<Instruction>(I)))
-      Inst->setHasNoUnsignedWrap();
-
-    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
-      Inst->setIsExact(ExactOp->isExact());
-  }
-
-  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
-
-  I.replaceAllUsesWith(TruncRes);
-  I.eraseFromParent();
-
-  return true;
-}
-
-bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
-  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
-         "I does not need promotion to i32");
-
-  IRBuilder<> Builder(&I);
-  Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
-  Value *ExtOp0 = nullptr;
-  Value *ExtOp1 = nullptr;
-  Value *NewICmp = nullptr;
-
-  if (I.isSigned()) {
-    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
-    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
-  } else {
-    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
-    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
-  }
-  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
-
-  I.replaceAllUsesWith(NewICmp);
-  I.eraseFromParent();
-
-  return true;
-}
-
-bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
-  assert(needsPromotionToI32(I.getType()) &&
-         "I does not need promotion to i32");
-
-  IRBuilder<> Builder(&I);
-  Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-  Type *I32Ty = getI32Ty(Builder, I.getType());
-  Value *ExtOp1 = nullptr;
-  Value *ExtOp2 = nullptr;
-  Value *ExtRes = nullptr;
-  Value *TruncRes = nullptr;
-
-  if (isSigned(I)) {
-    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
-    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
-  } else {
-    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
-    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
-  }
-  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
-  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
-
-  I.replaceAllUsesWith(TruncRes);
-  I.eraseFromParent();
-
-  return true;
-}
-
-bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
-    IntrinsicInst &I) const {
-  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
-         "I must be bitreverse intrinsic");
-  assert(needsPromotionToI32(I.getType()) &&
-         "I does not need promotion to i32");
-
-  IRBuilder<> Builder(&I);
-  Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
-  Type *I32Ty = getI32Ty(Builder, I.getType());
-  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
-  Value *ExtRes =
-      Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
-  Value *LShrOp =
-      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
-  Value *TruncRes =
-      Builder.CreateTrunc(LShrOp, I.getType());
-
-  I.replaceAllUsesWith(TruncRes);
-  I.eraseFromParent();
-
-  return true;
-}
-
 unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
   return computeKnownBits(Op, DL, AC).countMaxActiveBits();
 }
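
For readers unfamiliar with the deleted bitreverse promotion: reversing the zero-extended 32-bit value places the original low bits in the high bits, so shifting right by 32 minus the original width recovers the narrow bitreverse. A minimal standalone C++ check of that arithmetic (illustration only, not code from the patch):

#include <cassert>
#include <cstdint>

// Portable 32-bit bit reversal used only for this check.
static uint32_t bitreverse32(uint32_t X) {
  uint32_t R = 0;
  for (int I = 0; I < 32; ++I)
    R |= ((X >> I) & 1u) << (31 - I);
  return R;
}

int main() {
  // i16 bitreverse emulated the way the deleted code did it:
  // zero-extend, reverse the 32-bit value, shift right by 32 - 16, truncate.
  uint16_t A = 0x0001;
  uint32_t Wide = bitreverse32(static_cast<uint32_t>(A)) >> (32 - 16);
  assert(static_cast<uint16_t>(Wide) == 0x8000); // bit 0 moved to bit 15
  return 0;
}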
@@ -1635,10 +1373,6 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
   if (foldBinOpIntoSelect(I))
     return true;
 
-  if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
-      UA.isUniform(&I) && promoteUniformOpToI32(I))
-    return true;
-
   if (UseMul24Intrin && replaceMulWithMul24(I))
     return true;
   if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
@@ -1770,29 +1504,13 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
   return false;
 }
 
-bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
-  bool Changed = false;
-
-  if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
-      UA.isUniform(&I))
-    Changed |= promoteUniformOpToI32(I);
-
-  return Changed;
-}
-
 bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
   Value *Cond = I.getCondition();
   Value *TrueVal = I.getTrueValue();
   Value *FalseVal = I.getFalseValue();
   Value *CmpVal;
   CmpPredicate Pred;
 
-  if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) {
-    if (UA.isUniform(&I))
-      return promoteUniformOpToI32(I);
-    return false;
-  }
-
   // Match fract pattern with nan check.
   if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
     return false;
@@ -2196,8 +1914,6 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
 
 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
   switch (I.getIntrinsicID()) {
-  case Intrinsic::bitreverse:
-    return visitBitreverseIntrinsicInst(I);
   case Intrinsic::minnum:
   case Intrinsic::minimumnum:
   case Intrinsic::minimum:
@@ -2209,16 +1925,6 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
   }
 }
 
-bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
-  bool Changed = false;
-
-  if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
-      UA.isUniform(&I))
-    Changed |= promoteUniformBitreverseToI32(I);
-
-  return Changed;
-}
-
 /// Match non-nan fract pattern.
 /// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
 /// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
