Skip to content

Commit 5b74fb7

Browse files
authored
Reland '[flang] Allow to pass an async id to allocate the descriptor (llvm#118713)' and llvm#118733 (llvm#120997)
Device runtime build have been fixed. Attempt to re-land these patches that have been approved before. llvm#118713 llvm#118733
1 parent 4ad3de3 commit 5b74fb7

22 files changed

+105
-63
lines changed

flang/include/flang/Runtime/CUDA/allocator.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,16 @@ extern "C" {
2020
void RTDECL(CUFRegisterAllocator)();
2121
}
2222

23-
void *CUFAllocPinned(std::size_t);
23+
void *CUFAllocPinned(std::size_t, std::int64_t = kCudaNoStream);
2424
void CUFFreePinned(void *);
2525

26-
void *CUFAllocDevice(std::size_t);
26+
void *CUFAllocDevice(std::size_t, std::int64_t);
2727
void CUFFreeDevice(void *);
2828

29-
void *CUFAllocManaged(std::size_t);
29+
void *CUFAllocManaged(std::size_t, std::int64_t = kCudaNoStream);
3030
void CUFFreeManaged(void *);
3131

32-
void *CUFAllocUnified(std::size_t);
32+
void *CUFAllocUnified(std::size_t, std::int64_t = kCudaNoStream);
3333
void CUFFreeUnified(void *);
3434

3535
} // namespace Fortran::runtime::cuda

flang/include/flang/Runtime/CUDA/common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ static constexpr unsigned kHostToDevice = 0;
2323
static constexpr unsigned kDeviceToHost = 1;
2424
static constexpr unsigned kDeviceToDevice = 2;
2525

26+
/// Value used for asyncId when no specific stream is specified.
27+
static constexpr std::int64_t kCudaNoStream = -1;
28+
2629
#define CUDA_REPORT_IF_ERROR(expr) \
2730
[](cudaError_t err) { \
2831
if (err == cudaSuccess) \

flang/include/flang/Runtime/allocatable.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,9 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
9494
// Successfully allocated memory is initialized if the allocatable has a
9595
// derived type, and is always initialized by AllocatableAllocateSource().
9696
// Performs all necessary coarray synchronization and validation actions.
97-
int RTDECL(AllocatableAllocate)(Descriptor &, bool hasStat = false,
98-
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
99-
int sourceLine = 0);
97+
int RTDECL(AllocatableAllocate)(Descriptor &, std::int64_t asyncId = -1,
98+
bool hasStat = false, const Descriptor *errMsg = nullptr,
99+
const char *sourceFile = nullptr, int sourceLine = 0);
100100
int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
101101
bool hasStat = false, const Descriptor *errMsg = nullptr,
102102
const char *sourceFile = nullptr, int sourceLine = 0);

flang/include/flang/Runtime/allocator-registry.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,25 +11,27 @@
1111

1212
#include "flang/Common/api-attrs.h"
1313
#include "flang/Runtime/allocator-registry-consts.h"
14+
#include <cstdint>
1415
#include <cstdlib>
1516
#include <vector>
1617

1718
#define MAX_ALLOCATOR 7 // 3 bits are reserved in the descriptor.
1819

1920
namespace Fortran::runtime {
2021

21-
using AllocFct = void *(*)(std::size_t);
22+
using AllocFct = void *(*)(std::size_t, std::int64_t);
2223
using FreeFct = void (*)(void *);
2324

2425
typedef struct Allocator_t {
2526
AllocFct alloc{nullptr};
2627
FreeFct free{nullptr};
2728
} Allocator_t;
2829

29-
#ifdef RT_DEVICE_COMPILATION
30-
static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
30+
static RT_API_ATTRS void *MallocWrapper(
31+
std::size_t size, [[maybe_unused]] std::int64_t) {
3132
return std::malloc(size);
3233
}
34+
#ifdef RT_DEVICE_COMPILATION
3335
static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
3436
#endif
3537

@@ -39,7 +41,7 @@ struct AllocatorRegistry {
3941
: allocators{{&MallocWrapper, &FreeWrapper}} {}
4042
#else
4143
constexpr AllocatorRegistry() {
42-
allocators[kDefaultAllocator] = {&std::malloc, &std::free};
44+
allocators[kDefaultAllocator] = {&MallocWrapper, &std::free};
4345
};
4446
#endif
4547
RT_API_ATTRS void Register(int, Allocator_t);

flang/include/flang/Runtime/descriptor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ class Descriptor {
369369
// before calling. It (re)computes the byte strides after
370370
// allocation. Does not allocate automatic components or
371371
// perform default component initialization.
372-
RT_API_ATTRS int Allocate();
372+
RT_API_ATTRS int Allocate(std::int64_t asyncId = -1);
373373
RT_API_ATTRS void SetByteStrides();
374374

375375
// Deallocates storage; does not call FINAL subroutines or

flang/lib/Lower/Allocatable.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,14 @@ static mlir::Value genRuntimeAllocate(fir::FirOpBuilder &builder,
184184
? fir::runtime::getRuntimeFunc<mkRTKey(PointerAllocate)>(loc, builder)
185185
: fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
186186
builder);
187-
llvm::SmallVector<mlir::Value> args{
188-
box.getAddr(), errorManager.hasStat, errorManager.errMsgAddr,
189-
errorManager.sourceFile, errorManager.sourceLine};
187+
llvm::SmallVector<mlir::Value> args{box.getAddr()};
188+
if (!box.isPointer())
189+
args.push_back(
190+
builder.createIntegerConstant(loc, builder.getI64Type(), -1));
191+
args.push_back(errorManager.hasStat);
192+
args.push_back(errorManager.errMsgAddr);
193+
args.push_back(errorManager.sourceFile);
194+
args.push_back(errorManager.sourceLine);
190195
llvm::SmallVector<mlir::Value> operands;
191196
for (auto [fst, snd] : llvm::zip(args, callee.getFunctionType().getInputs()))
192197
operands.emplace_back(builder.createConvert(loc, snd, fst));

flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,19 @@ void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder,
7676
mlir::func::FuncOp func{
7777
fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc, builder)};
7878
mlir::FunctionType fTy{func.getFunctionType()};
79+
mlir::Value asyncId =
80+
builder.createIntegerConstant(loc, builder.getI64Type(), -1);
7981
mlir::Value sourceFile{fir::factory::locationToFilename(builder, loc)};
8082
mlir::Value sourceLine{
81-
fir::factory::locationToLineNo(builder, loc, fTy.getInput(4))};
83+
fir::factory::locationToLineNo(builder, loc, fTy.getInput(5))};
8284
if (!hasStat)
8385
hasStat = builder.createBool(loc, false);
8486
if (!errMsg) {
8587
mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType());
8688
errMsg = builder.create<fir::AbsentOp>(loc, boxNoneTy).getResult();
8789
}
88-
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
89-
builder, loc, fTy, desc, hasStat, errMsg, sourceFile, sourceLine)};
90+
llvm::SmallVector<mlir::Value> args{
91+
fir::runtime::createArguments(builder, loc, fTy, desc, asyncId, hasStat,
92+
errMsg, sourceFile, sourceLine)};
9093
builder.create<fir::CallOp>(loc, func, args);
9194
}

flang/runtime/CUDA/allocatable.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
5252
}
5353
// Perform the standard allocation.
5454
int stat{RTNAME(AllocatableAllocate)(
55-
desc, hasStat, errMsg, sourceFile, sourceLine)};
55+
desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
5656
return stat;
5757
}
5858

flang/runtime/CUDA/allocator.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,23 +33,28 @@ void RTDEF(CUFRegisterAllocator)() {
3333
}
3434
}
3535

36-
void *CUFAllocPinned(std::size_t sizeInBytes) {
36+
void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) {
3737
void *p;
3838
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
3939
return p;
4040
}
4141

4242
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
4343

44-
void *CUFAllocDevice(std::size_t sizeInBytes) {
44+
void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t stream) {
4545
void *p;
46-
CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
46+
if (stream >= 0) {
47+
CUDA_REPORT_IF_ERROR(
48+
cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)stream));
49+
} else {
50+
CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
51+
}
4752
return p;
4853
}
4954

5055
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
5156

52-
void *CUFAllocManaged(std::size_t sizeInBytes) {
57+
void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) {
5358
void *p;
5459
CUDA_REPORT_IF_ERROR(
5560
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -58,7 +63,7 @@ void *CUFAllocManaged(std::size_t sizeInBytes) {
5863

5964
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
6065

61-
void *CUFAllocUnified(std::size_t sizeInBytes) {
66+
void *CUFAllocUnified(std::size_t sizeInBytes, std::int64_t) {
6267
// Call alloc managed for the time being.
6368
return CUFAllocManaged(sizeInBytes);
6469
}

flang/runtime/CUDA/descriptor.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ RT_EXT_API_GROUP_BEGIN
2020

2121
Descriptor *RTDEF(CUFAllocDescriptor)(
2222
std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
23-
return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
23+
return reinterpret_cast<Descriptor *>(
24+
CUFAllocManaged(sizeInBytes, kCudaNoStream));
2425
}
2526

2627
void RTDEF(CUFFreeDescriptor)(

0 commit comments

Comments
 (0)