Skip to content

Commit a13de25

Browse files
arsenmmemfrob
authored andcommitted
clang: Use byref for aggregate kernel arguments
Add address space to indirect abi info and use it for kernels. Previously, indirect arguments assumed assumed a stack passed object in the alloca address space using byval. A stack pointer is unsuitable for kernel arguments, which are passed in a separate, constant buffer with a different address space. Start using the new byref for aggregate kernel arguments. Previously these were emitted as raw struct arguments, and turned into loads in the backend. These will lower identically, although with byref you now have the option of applying an explicit alignment. In the future, a reasonable implementation would use byref for all kernel arguments (this would be a practical problem at the moment due to losing things like noalias on pointer arguments). This is mostly to avoid fighting the optimizer's treatment of aggregate load/store. SROA and instcombine both turn aggregate loads and stores into a long sequence of element loads and stores, rather than the optimizable memcpy I would expect in this situation. Now an explicit memcpy will be introduced up-front which is better understood and helps eliminate the alloca in more situations. This skips using byref in the case where HIP kernel pointer arguments in structs are promoted to global pointers. At minimum an additional patch is needed to allow coercion with indirect arguments. This also skips using it for OpenCL due to the current workaround used to support kernels calling kernels. Distinct function bodies would need to be generated up front instead of emitting an illegal call.
1 parent bdd0159 commit a13de25

File tree

5 files changed

+113
-29
lines changed

5 files changed

+113
-29
lines changed

clang/include/clang/CodeGen/CGFunctionInfo.h

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,23 @@ class ABIArgInfo {
4444
/// but also emit a zero/sign extension attribute.
4545
Extend,
4646

47-
/// Indirect - Pass the argument indirectly via a hidden pointer
48-
/// with the specified alignment (0 indicates default alignment).
47+
/// Indirect - Pass the argument indirectly via a hidden pointer with the
48+
/// specified alignment (0 indicates default alignment) and address space.
4949
Indirect,
5050

51+
/// IndirectAliased - Similar to Indirect, but the pointer may be to an
52+
/// object that is otherwise referenced. The object is known to not be
53+
/// modified through any other references for the duration of the call, and
54+
/// the callee must not itself modify the object. Because C allows
55+
/// parameter variables to be modified and guarantees that they have unique
56+
/// addresses, the callee must defensively copy the object into a local
57+
/// variable if it might be modified or its address might be compared.
58+
/// Since those are uncommon, in principle this convention allows programs
59+
/// to avoid copies in more situations. However, it may introduce *extra*
60+
/// copies if the callee fails to prove that a copy is unnecessary and the
61+
/// caller naturally produces an unaliased object for the argument.
62+
IndirectAliased,
63+
5164
/// Ignore - Ignore the argument (treat as void). Useful for void and
5265
/// empty structs.
5366
Ignore,
@@ -86,6 +99,7 @@ class ABIArgInfo {
8699
unsigned AllocaFieldIndex; // isInAlloca()
87100
};
88101
Kind TheKind;
102+
unsigned IndirectAddrSpace : 24; // isIndirect()
89103
bool PaddingInReg : 1;
90104
bool InAllocaSRet : 1; // isInAlloca()
91105
bool InAllocaIndirect : 1;// isInAlloca()
@@ -97,7 +111,8 @@ class ABIArgInfo {
97111
bool SignExt : 1; // isExtend()
98112

99113
bool canHavePaddingType() const {
100-
return isDirect() || isExtend() || isIndirect() || isExpand();
114+
return isDirect() || isExtend() || isIndirect() || isIndirectAliased() ||
115+
isExpand();
101116
}
102117
void setPaddingType(llvm::Type *T) {
103118
assert(canHavePaddingType());
@@ -112,9 +127,10 @@ class ABIArgInfo {
112127
public:
113128
ABIArgInfo(Kind K = Direct)
114129
: TypeData(nullptr), PaddingType(nullptr), DirectOffset(0), TheKind(K),
115-
PaddingInReg(false), InAllocaSRet(false), InAllocaIndirect(false),
116-
IndirectByVal(false), IndirectRealign(false), SRetAfterThis(false),
117-
InReg(false), CanBeFlattened(false), SignExt(false) {}
130+
IndirectAddrSpace(0), PaddingInReg(false), InAllocaSRet(false),
131+
InAllocaIndirect(false), IndirectByVal(false), IndirectRealign(false),
132+
SRetAfterThis(false), InReg(false), CanBeFlattened(false),
133+
SignExt(false) {}
118134

119135
static ABIArgInfo getDirect(llvm::Type *T = nullptr, unsigned Offset = 0,
120136
llvm::Type *Padding = nullptr,
@@ -180,6 +196,19 @@ class ABIArgInfo {
180196
AI.setPaddingType(Padding);
181197
return AI;
182198
}
199+
200+
/// Pass this in memory using the IR byref attribute.
201+
static ABIArgInfo getIndirectAliased(CharUnits Alignment, unsigned AddrSpace,
202+
bool Realign = false,
203+
llvm::Type *Padding = nullptr) {
204+
auto AI = ABIArgInfo(IndirectAliased);
205+
AI.setIndirectAlign(Alignment);
206+
AI.setIndirectRealign(Realign);
207+
AI.setPaddingType(Padding);
208+
AI.setIndirectAddrSpace(AddrSpace);
209+
return AI;
210+
}
211+
183212
static ABIArgInfo getIndirectInReg(CharUnits Alignment, bool ByVal = true,
184213
bool Realign = false) {
185214
auto AI = getIndirect(Alignment, ByVal, Realign);
@@ -259,6 +288,7 @@ class ABIArgInfo {
259288
bool isExtend() const { return TheKind == Extend; }
260289
bool isIgnore() const { return TheKind == Ignore; }
261290
bool isIndirect() const { return TheKind == Indirect; }
291+
bool isIndirectAliased() const { return TheKind == IndirectAliased; }
262292
bool isExpand() const { return TheKind == Expand; }
263293
bool isCoerceAndExpand() const { return TheKind == CoerceAndExpand; }
264294

@@ -338,11 +368,11 @@ class ABIArgInfo {
338368

339369
// Indirect accessors
340370
CharUnits getIndirectAlign() const {
341-
assert(isIndirect() && "Invalid kind!");
371+
assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
342372
return CharUnits::fromQuantity(IndirectAlign);
343373
}
344374
void setIndirectAlign(CharUnits IA) {
345-
assert(isIndirect() && "Invalid kind!");
375+
assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
346376
IndirectAlign = IA.getQuantity();
347377
}
348378

@@ -355,12 +385,22 @@ class ABIArgInfo {
355385
IndirectByVal = IBV;
356386
}
357387

388+
unsigned getIndirectAddrSpace() const {
389+
assert(isIndirectAliased() && "Invalid kind!");
390+
return IndirectAddrSpace;
391+
}
392+
393+
void setIndirectAddrSpace(unsigned AddrSpace) {
394+
assert(isIndirectAliased() && "Invalid kind!");
395+
IndirectAddrSpace = AddrSpace;
396+
}
397+
358398
bool getIndirectRealign() const {
359-
assert(isIndirect() && "Invalid kind!");
399+
assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
360400
return IndirectRealign;
361401
}
362402
void setIndirectRealign(bool IR) {
363-
assert(isIndirect() && "Invalid kind!");
403+
assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
364404
IndirectRealign = IR;
365405
}
366406

clang/lib/CodeGen/CGCall.cpp

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1470,6 +1470,7 @@ void ClangToLLVMArgMapping::construct(const ASTContext &Context,
14701470
break;
14711471
}
14721472
case ABIArgInfo::Indirect:
1473+
case ABIArgInfo::IndirectAliased:
14731474
IRArgs.NumberOfArgs = 1;
14741475
break;
14751476
case ABIArgInfo::Ignore:
@@ -1560,6 +1561,7 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) {
15601561
const ABIArgInfo &retAI = FI.getReturnInfo();
15611562
switch (retAI.getKind()) {
15621563
case ABIArgInfo::Expand:
1564+
case ABIArgInfo::IndirectAliased:
15631565
llvm_unreachable("Invalid ABI kind for return argument");
15641566

15651567
case ABIArgInfo::Extend:
@@ -1637,7 +1639,12 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) {
16371639
CGM.getDataLayout().getAllocaAddrSpace());
16381640
break;
16391641
}
1640-
1642+
case ABIArgInfo::IndirectAliased: {
1643+
assert(NumIRArgs == 1);
1644+
llvm::Type *LTy = ConvertTypeForMem(it->type);
1645+
ArgTypes[FirstIRArg] = LTy->getPointerTo(ArgInfo.getIndirectAddrSpace());
1646+
break;
1647+
}
16411648
case ABIArgInfo::Extend:
16421649
case ABIArgInfo::Direct: {
16431650
// Fast-isel and the optimizer generally like scalar values better than
@@ -2101,6 +2108,7 @@ void CodeGenModule::ConstructAttributeList(
21012108
break;
21022109

21032110
case ABIArgInfo::Expand:
2111+
case ABIArgInfo::IndirectAliased:
21042112
llvm_unreachable("Invalid ABI kind for return argument");
21052113
}
21062114

@@ -2184,6 +2192,9 @@ void CodeGenModule::ConstructAttributeList(
21842192
if (AI.getIndirectByVal())
21852193
Attrs.addByValAttr(getTypes().ConvertTypeForMem(ParamType));
21862194

2195+
// TODO: We could add the byref attribute if not byval, but it would
2196+
// require updating many testcases.
2197+
21872198
CharUnits Align = AI.getIndirectAlign();
21882199

21892200
// In a byval argument, it is important that the required
@@ -2206,6 +2217,13 @@ void CodeGenModule::ConstructAttributeList(
22062217
// byval disables readnone and readonly.
22072218
FuncAttrs.removeAttribute(llvm::Attribute::ReadOnly)
22082219
.removeAttribute(llvm::Attribute::ReadNone);
2220+
2221+
break;
2222+
}
2223+
case ABIArgInfo::IndirectAliased: {
2224+
CharUnits Align = AI.getIndirectAlign();
2225+
Attrs.addByRefAttr(getTypes().ConvertTypeForMem(ParamType));
2226+
Attrs.addAlignmentAttr(Align.getQuantity());
22092227
break;
22102228
}
22112229
case ABIArgInfo::Ignore:
@@ -2434,16 +2452,19 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
24342452
break;
24352453
}
24362454

2437-
case ABIArgInfo::Indirect: {
2455+
case ABIArgInfo::Indirect:
2456+
case ABIArgInfo::IndirectAliased: {
24382457
assert(NumIRArgs == 1);
24392458
Address ParamAddr =
24402459
Address(Fn->getArg(FirstIRArg), ArgI.getIndirectAlign());
24412460

24422461
if (!hasScalarEvaluationKind(Ty)) {
2443-
// Aggregates and complex variables are accessed by reference. All we
2444-
// need to do is realign the value, if requested.
2462+
// Aggregates and complex variables are accessed by reference. All we
2463+
// need to do is realign the value, if requested. Also, if the address
2464+
// may be aliased, copy it to ensure that the parameter variable is
2465+
// mutable and has a unique adress, as C requires.
24452466
Address V = ParamAddr;
2446-
if (ArgI.getIndirectRealign()) {
2467+
if (ArgI.getIndirectRealign() || ArgI.isIndirectAliased()) {
24472468
Address AlignedTemp = CreateMemTemp(Ty, "coerce");
24482469

24492470
// Copy from the incoming argument pointer to the temporary with the
@@ -3285,8 +3306,8 @@ void CodeGenFunction::EmitFunctionEpilog(const CGFunctionInfo &FI,
32853306
}
32863307
break;
32873308
}
3288-
32893309
case ABIArgInfo::Expand:
3310+
case ABIArgInfo::IndirectAliased:
32903311
llvm_unreachable("Invalid ABI kind for return argument");
32913312
}
32923313

@@ -4413,7 +4434,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
44134434
break;
44144435
}
44154436

4416-
case ABIArgInfo::Indirect: {
4437+
case ABIArgInfo::Indirect:
4438+
case ABIArgInfo::IndirectAliased: {
44174439
assert(NumIRArgs == 1);
44184440
if (!I->isAggregate()) {
44194441
// Make a temporary alloca to pass the argument.
@@ -4668,12 +4690,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
46684690
break;
46694691
}
46704692

4671-
case ABIArgInfo::Expand:
4693+
case ABIArgInfo::Expand: {
46724694
unsigned IRArgPos = FirstIRArg;
46734695
ExpandTypeToArgs(I->Ty, *I, IRFuncTy, IRCallArgs, IRArgPos);
46744696
assert(IRArgPos == FirstIRArg + NumIRArgs);
46754697
break;
46764698
}
4699+
}
46774700
}
46784701

46794702
const CGCallee &ConcreteCallee = Callee.prepareConcreteCallee(*this);
@@ -5084,6 +5107,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
50845107
}
50855108

50865109
case ABIArgInfo::Expand:
5110+
case ABIArgInfo::IndirectAliased:
50875111
llvm_unreachable("Invalid ABI kind for return argument");
50885112
}
50895113

clang/lib/CodeGen/TargetInfo.cpp

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,11 @@ LLVM_DUMP_METHOD void ABIArgInfo::dump() const {
257257
<< " ByVal=" << getIndirectByVal()
258258
<< " Realign=" << getIndirectRealign();
259259
break;
260+
case IndirectAliased:
261+
OS << "Indirect Align=" << getIndirectAlign().getQuantity()
262+
<< " AadrSpace=" << getIndirectAddrSpace()
263+
<< " Realign=" << getIndirectRealign();
264+
break;
260265
case Expand:
261266
OS << "Expand";
262267
break;
@@ -1989,6 +1994,7 @@ static bool isArgInAlloca(const ABIArgInfo &Info) {
19891994
case ABIArgInfo::InAlloca:
19901995
return true;
19911996
case ABIArgInfo::Ignore:
1997+
case ABIArgInfo::IndirectAliased:
19921998
return false;
19931999
case ABIArgInfo::Indirect:
19942000
case ABIArgInfo::Direct:
@@ -8790,18 +8796,31 @@ ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
87908796

87918797
// TODO: Can we omit empty structs?
87928798

8793-
llvm::Type *LTy = nullptr;
87948799
if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
8795-
LTy = CGT.ConvertType(QualType(SeltTy, 0));
8800+
Ty = QualType(SeltTy, 0);
87968801

8802+
llvm::Type *OrigLTy = CGT.ConvertType(Ty);
8803+
llvm::Type *LTy = OrigLTy;
87978804
if (getContext().getLangOpts().HIP) {
8798-
if (!LTy)
8799-
LTy = CGT.ConvertType(Ty);
88008805
LTy = coerceKernelArgumentType(
8801-
LTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
8806+
OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
88028807
/*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
88038808
}
88048809

8810+
// FIXME: Should also use this for OpenCL, but it requires addressing the
8811+
// problem of kernels being called.
8812+
//
8813+
// FIXME: This doesn't apply the optimization of coercing pointers in structs
8814+
// to global address space when using byref. This would require implementing a
8815+
// new kind of coercion of the in-memory type when for indirect arguments.
8816+
if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
8817+
isAggregateTypeForABI(Ty)) {
8818+
return ABIArgInfo::getIndirectAliased(
8819+
getContext().getTypeAlignInChars(Ty),
8820+
getContext().getTargetAddressSpace(LangAS::opencl_constant),
8821+
false /*Realign*/, nullptr /*Padding*/);
8822+
}
8823+
88058824
// If we set CanBeFlattened to true, CodeGen will expand the struct to its
88068825
// individual elements, which confuses the Clover OpenCL backend; therefore we
88078826
// have to set it to false here. Other args of getDirect() are just defaults.
@@ -9377,6 +9396,7 @@ Address SparcV9ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
93779396
}
93789397

93799398
case ABIArgInfo::Indirect:
9399+
case ABIArgInfo::IndirectAliased:
93809400
Stride = SlotSize;
93819401
ArgAddr = Builder.CreateElementBitCast(Addr, ArgPtrTy, "indirect");
93829402
ArgAddr = Address(Builder.CreateLoad(ArgAddr, "indirect.arg"),
@@ -9742,6 +9762,7 @@ Address XCoreABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
97429762
ArgSize = ArgSize.alignTo(SlotSize);
97439763
break;
97449764
case ABIArgInfo::Indirect:
9765+
case ABIArgInfo::IndirectAliased:
97459766
Val = Builder.CreateElementBitCast(AP, ArgPtrTy);
97469767
Val = Address(Builder.CreateLoad(Val), TypeAlign);
97479768
ArgSize = SlotSize;

clang/test/CodeGenCUDA/kernel-args.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@ struct A {
88
int a[32];
99
};
1010

11-
// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A %x.coerce)
11+
// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}})
1212
// NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) align 4 %x)
1313
__global__ void kernel(A x) {
1414
}
1515

1616
class Kernel {
1717
public:
18-
// AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A %x.coerce)
18+
// AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}})
1919
// NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 4 %x)
2020
static __global__ void memberKernel(A x){}
2121
template<typename T> static __global__ void templateMemberKernel(T x) {}
@@ -29,11 +29,11 @@ void launch(void*);
2929

3030
void test() {
3131
Kernel K;
32-
// AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A %x.coerce)
32+
// AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}
3333
// NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 4 %x)
3434
launch((void*)templateKernel<A>);
3535

36-
// AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A %x.coerce)
36+
// AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}
3737
// NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 4 %x)
3838
launch((void*)Kernel::templateMemberKernel<A>);
3939
}

clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ typedef struct struct_of_structs_arg
6767
int i2;
6868
} struct_of_structs_arg_t;
6969

70-
// CHECK: %union.transparent_u = type { i32 }
7170
typedef union
7271
{
7372
int b1;
@@ -237,7 +236,7 @@ __kernel void kernel_test_struct_of_arrays_arg(struct_of_arrays_arg_t arg1) { }
237236
// CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg %arg1.coerce)
238237
__kernel void kernel_struct_of_structs_arg(struct_of_structs_arg_t arg1) { }
239238

240-
// CHECK: void @test_kernel_transparent_union_arg(%union.transparent_u %u.coerce)
239+
// CHECK: void @test_kernel_transparent_union_arg(i32 %u.coerce)
241240
__kernel void test_kernel_transparent_union_arg(transparent_u u) { }
242241

243242
// CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg %arg1.coerce)

0 commit comments

Comments
 (0)