Skip to content

Commit ae1dbe7

Browse files
authored
[offload][amdgpu] Do small host mem copies synchronously (llvm#859)
2 parents d775205 + 9471086 commit ae1dbe7

File tree

1 file changed

+27
-13
lines changed
  • offload/plugins-nextgen/amdgpu/src

1 file changed

+27
-13
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 27 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -711,7 +711,6 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
711711
private:
712712
/// The executable loaded on the agent.
713713
hsa_executable_t Executable;
714-
hsa_code_object_t CodeObject;
715714
#if SANITIZER_AMDGPU
716715
hsa_code_object_reader_t CodeObjectReader;
717716
#endif
@@ -1715,8 +1714,8 @@ struct AMDGPUStreamTy {
17151714

17161715
/// Create an empty slot.
17171716
StreamSlotTy()
1718-
: Signal(nullptr), Callbacks({}), ActionArgs({}),
1719-
OmptActionFunction(nullptr) {}
1717+
: Signal(nullptr), Callbacks({}), OmptActionFunction(nullptr),
1718+
ActionArgs({}) {}
17201719

17211720
/// Schedule a host memory copy action on the slot.
17221721
Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
@@ -1864,6 +1863,10 @@ struct AMDGPUStreamTy {
18641863
/// Use synchronous copy back.
18651864
bool UseSyncCopyBack;
18661865

1866+
/// When copying data from one host buffer to another, only do it
1867+
/// asynchronously if the copy size is at least `OMPX_MinHostToHostAsyncCopySize`.
1868+
UInt32Envar OMPX_MinHostToHostAsyncCopySize;
1869+
18671870
/// Arguments for the callback function.
18681871
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
18691872

@@ -2031,8 +2034,8 @@ struct AMDGPUStreamTy {
20312034
assert(Args->Signal &&
20322035
"Invalid AMDGPUSignal Pointer in post kernel run processing");
20332036
hsa_amd_profiling_dispatch_time_t TimeRec;
2034-
hsa_status_t Status = hsa_amd_profiling_get_dispatch_time(
2035-
Args->Agent, Args->Signal->get(), &TimeRec);
2037+
hsa_amd_profiling_get_dispatch_time(Args->Agent, Args->Signal->get(),
2038+
&TimeRec);
20362039

20372040
uint64_t StartTime = TimeRec.start * Args->TicksToTime;
20382041
uint64_t EndTime = TimeRec.end * Args->TicksToTime;
@@ -2288,6 +2291,14 @@ struct AMDGPUStreamTy {
22882291
return Err;
22892292
}
22902293

2294+
if (CopySize < OMPX_MinHostToHostAsyncCopySize) {
2295+
if (auto Err =
2296+
OutputSignals[0]->wait(StreamBusyWaitMicroseconds, &Device))
2297+
return Err;
2298+
std::memcpy(Dst, Inter, CopySize);
2299+
return Error::success();
2300+
}
2301+
22912302
// Consume another stream slot and compute dependencies.
22922303
std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
22932304
assert(InputSignal && "Invalid input signal");
@@ -2919,7 +2930,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
29192930
"OMPX_ENABLE_GFX90A_COARSE_GRAIN_SHARED_ALLOC", false),
29202931
OMPX_StrictSanityChecks("OMPX_STRICT_SANITY_CHECKS", false),
29212932
OMPX_SyncCopyBack("LIBOMPTARGET_SYNC_COPY_BACK", true),
2922-
OMPX_APUPrefaultMemcopy("LIBOMPTARGET_APU_PREFAULT_MEMCOPY", "true"),
2933+
OMPX_APUPrefaultMemcopy("LIBOMPTARGET_APU_PREFAULT_MEMCOPY", true),
29232934
OMPX_APUPrefaultMemcopySize("LIBOMPTARGET_APU_PREFAULT_MEMCOPY_SIZE",
29242935
1 * 1024 * 1024), // 1MB
29252936
OMPX_DGPUMaps("OMPX_DGPU_MAPS", false),
@@ -3892,6 +3903,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
38923903
case HSA_DEVICE_TYPE_DSP:
38933904
TmpCharPtr = "DSP";
38943905
break;
3906+
case HSA_DEVICE_TYPE_AIE:
3907+
TmpCharPtr = "AIE";
3908+
break;
38953909
}
38963910
Info.add("Device Type", TmpCharPtr);
38973911
}
@@ -4683,7 +4697,9 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
46834697
Slots(32), NextSlot(0), SyncCycle(0),
46844698
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
46854699
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
4686-
UseSyncCopyBack(Device.syncCopyBack()) {}
4700+
UseSyncCopyBack(Device.syncCopyBack()),
4701+
OMPX_MinHostToHostAsyncCopySize(
4702+
"LIBOMPTARGET_AMDGPU_MIN_HOST_TO_HOST_ASYNC_COPY_SIZE", 2048) {}
46874703

46884704
/// Class implementing the AMDGPU-specific functionalities of the global
46894705
/// handler.
@@ -5066,7 +5082,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
50665082
if (LaunchParams.Size)
50675083
std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
50685084

5069-
uint64_t Buffer = 0;
50705085
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
50715086
AMDGPUStreamTy *Stream = nullptr;
50725087
if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
@@ -5117,7 +5132,7 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
51175132
// This line should print exactly as the one in the old plugin.
51185133
fprintf(
51195134
stderr,
5120-
"DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4luX%4d) "
5135+
"DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
51215136
"reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u agpr_count:%u "
51225137
"sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
51235138
"md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
@@ -5310,8 +5325,8 @@ static std::pair<uint64_t, uint64_t>
53105325
getKernelStartAndEndTime(const OmptKernelTimingArgsAsyncTy *Args) {
53115326
assert(Args->Signal && "Invalid AMDGPUSignal Pointer in OMPT profiling");
53125327
hsa_amd_profiling_dispatch_time_t TimeRec;
5313-
hsa_status_t Status = hsa_amd_profiling_get_dispatch_time(
5314-
Args->Agent, Args->Signal->get(), &TimeRec);
5328+
hsa_amd_profiling_get_dispatch_time(Args->Agent, Args->Signal->get(),
5329+
&TimeRec);
53155330

53165331
uint64_t StartTime = TimeRec.start * Args->TicksToTime;
53175332
uint64_t EndTime = TimeRec.end * Args->TicksToTime;
@@ -5323,8 +5338,7 @@ static std::pair<uint64_t, uint64_t>
53235338
getCopyStartAndEndTime(const OmptKernelTimingArgsAsyncTy *Args) {
53245339
assert(Args->Signal && "Invalid AMDGPUSignal Pointer in OMPT profiling");
53255340
hsa_amd_profiling_async_copy_time_t TimeRec;
5326-
hsa_status_t Status =
5327-
hsa_amd_profiling_get_async_copy_time(Args->Signal->get(), &TimeRec);
5341+
hsa_amd_profiling_get_async_copy_time(Args->Signal->get(), &TimeRec);
53285342
uint64_t StartTime = TimeRec.start * Args->TicksToTime;
53295343
uint64_t EndTime = TimeRec.end * Args->TicksToTime;
53305344

0 commit comments

Comments
 (0)