
Commit 3df36a2

doru1004 and erichkeane authored
[AMDGPU] Enable vectorization of i8 values. (#134934)
This patch adjusts the cost model to account for the ability of the AMDGPU optimizer to group together i8 values into i32 values. Co-authored-by: Erich Keane <ekeane@nvidia.com>
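
To make the idea concrete, here is a small conceptual sketch (not part of the patch, purely illustrative): four i8 values fit in one 32-bit register, which is the packing the adjusted cost model assumes the backend can exploit when it vectorizes i8 code.

#include <cstdint>
#include <cstdio>

// Hypothetical helper, for illustration only: pack four bytes into the kind
// of 32-bit value the backend can operate on as a single register.
static uint32_t packFourBytes(uint8_t A, uint8_t B, uint8_t C, uint8_t D) {
  return static_cast<uint32_t>(A) | (static_cast<uint32_t>(B) << 8) |
         (static_cast<uint32_t>(C) << 16) | (static_cast<uint32_t>(D) << 24);
}

int main() {
  printf("0x%08x\n", (unsigned)packFourBytes(1, 2, 3, 4)); // prints 0x04030201
  return 0;
}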
1 parent 9e4981c commit 3df36a2

6 files changed: +447 / -674 lines

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 43 additions & 12 deletions
@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts())    ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-                                                       : 1;
+  // For a given width return the max number of elements that can be combined
+  // into a wider bit value:
+  return (ElemWidth == 8 && ST->has16BitInsts())       ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1195,14 +1198,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
 
-  // Larger vector widths may require additional instructions, but are
-  // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(SrcTy->getElementType()) == 16) {
-    bool HasVOP3P = ST->hasVOP3PInsts();
+      (ScalarSize == 16 || ScalarSize == 8)) {
+    // Larger vector widths may require additional instructions, but are
+    // typically cheaper than scalarized versions.
+    unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned EltsPerReg = 32 / ScalarSize;
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
@@ -1211,9 +1215,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteSingleSrc: {
       // With op_sel VOP3P instructions freely can access the low half or high
       // half of a register, so any swizzle of two elements is free.
-      if (HasVOP3P && NumVectorElts == 2)
+      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Broadcast just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
@@ -1225,12 +1229,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return 0;
       // Insert/extract subvectors only require shifts / extract code to get the
       // relevant bits
-      return alignTo(RequestedElts, 2) / 2;
+      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
     }
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Select just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
@@ -1505,3 +1509,30 @@ GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
   return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
                                                : KnownIEEEMode::On;
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) const {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
+    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+        VecTy->getElementType()->isIntegerTy(8)) {
+      return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
+                        getLoadStoreVecRegBitWidth(AddressSpace));
+    }
+  }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
+    if (VecTy->getElementType()->isIntegerTy(8)) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return divideCeil(ElementCount - 1, 4);
+    }
+  }
+  return BaseT::getNumberOfParts(Tp);
+}
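
As a rough sketch of how the shuffle-cost arithmetic above plays out for i8 elements (standalone C++ with a stand-in helper; the real code uses llvm::alignTo and the subtarget feature checks shown in the diff): with ScalarSize == 8, EltsPerReg is 4, so a full <8 x i8> single-source permute is now costed as 2 perms plus 2 masks instead of the 4 + 4 the old divide-by-two logic would have produced.

#include <cstdio>

// Stand-in for llvm::alignTo, illustration only.
static unsigned alignToExample(unsigned Value, unsigned Align) {
  return ((Value + Align - 1) / Align) * Align;
}

int main() {
  const unsigned ScalarSize = 8;                // i8 elements
  const unsigned EltsPerReg = 32 / ScalarSize;  // 4 i8 lanes per 32-bit register
  const unsigned RequestedElts = 8;             // e.g. every lane of an <8 x i8> mask is used
  // SK_PermuteSingleSrc: one perm per packed register plus one mask per perm.
  unsigned NumPerms = alignToExample(RequestedElts, EltsPerReg) / EltsPerReg; // 2
  unsigned NumPermMasks = NumPerms;                                           // 2
  printf("estimated i8 permute cost: %u\n", NumPerms + NumPermMasks);         // 4
  return 0;
}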

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 14 additions & 0 deletions
@@ -288,6 +288,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   /// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
   /// "amdgpu-ieee"="false".
   KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;
+
+  /// Account for loads of i8 vector types to have reduced cost. For
+  /// example, the cost of loading 4 i8 values is the cost of loading a
+  /// single i32 value.
+  InstructionCost getMemoryOpCost(
+      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+      TTI::TargetCostKind CostKind,
+      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+      const Instruction *I = nullptr) const override;
+
+  /// When counting parts on AMD GPUs, account for i8s being grouped
+  /// together under a single i32 value. Otherwise fall back to the base
+  /// implementation.
+  unsigned getNumberOfParts(Type *Tp) const override;
 };
 
 } // end namespace llvm
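
A worked example of the arithmetic behind the two new hooks declared above (a sketch only: it assumes a 128-bit load/store vector register width, whereas the real value comes from getLoadStoreVecRegBitWidth(AddressSpace)):

#include <cstdio>

// Stand-in for llvm::divideCeil, illustration only.
static unsigned divideCeilExample(unsigned Numerator, unsigned Denominator) {
  return (Numerator + Denominator - 1) / Denominator;
}

int main() {
  // getMemoryOpCost for a load of <16 x i8>: 128 bits of data against an
  // assumed 128-bit load/store register width -> one wide load.
  unsigned MemCost = divideCeilExample(128 - 1, 128); // 1
  // getNumberOfParts for <16 x i8>: four i8s grouped into each i32 part.
  unsigned Parts = divideCeilExample(16 - 1, 4);      // 4
  printf("memory-op cost = %u, parts = %u\n", MemCost, Parts);
  return 0;
}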
