Skip to content

Commit 9471086

Browse files
committed
[offload][amdgpu] Do small host mem copies synchronously
1 parent ae61c96 commit 9471086

File tree

1 file changed

+15
-1
lines changed
  • offload/plugins-nextgen/amdgpu/src

1 file changed

+15
-1
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1857,6 +1857,10 @@ struct AMDGPUStreamTy {
18571857
/// Use synchronous copy back.
18581858
bool UseSyncCopyBack;
18591859

1860+
/// When copying data from one host buffer to another, only do it
1861+
/// asynchronously if the copy size is >= `OMPX_MinHostToHostAsyncCopySize`.
1862+
UInt32Envar OMPX_MinHostToHostAsyncCopySize;
1863+
18601864
/// Arguments for the callback function.
18611865
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
18621866

@@ -2281,6 +2285,14 @@ struct AMDGPUStreamTy {
22812285
return Err;
22822286
}
22832287

2288+
if (CopySize < OMPX_MinHostToHostAsyncCopySize) {
2289+
if (auto Err =
2290+
OutputSignals[0]->wait(StreamBusyWaitMicroseconds, &Device))
2291+
return Err;
2292+
std::memcpy(Dst, Inter, CopySize);
2293+
return Error::success();
2294+
}
2295+
22842296
// Consume another stream slot and compute dependencies.
22852297
std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
22862298
assert(InputSignal && "Invalid input signal");
@@ -4679,7 +4691,9 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
46794691
Slots(32), NextSlot(0), SyncCycle(0),
46804692
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
46814693
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
4682-
UseSyncCopyBack(Device.syncCopyBack()) {}
4694+
UseSyncCopyBack(Device.syncCopyBack()),
4695+
OMPX_MinHostToHostAsyncCopySize(
4696+
"LIBOMPTARGET_AMDGPU_MIN_HOST_TO_HOST_ASYNC_COPY_SIZE", 2048) {}
46834697

46844698
/// Class implementing the AMDGPU-specific functionalities of the global
46854699
/// handler.

0 commit comments

Comments
 (0)