@@ -1857,6 +1857,10 @@ struct AMDGPUStreamTy {
1857
1857
/// Use synchronous copy back.
1858
1858
bool UseSyncCopyBack;
1859
1859
1860
+ /// When copying data from one host buffer to another, only do it
1861
+ /// asynchronously if the copy size is at least `OMPX_MinHostToHostAsyncCopySize`.
1862
+ UInt32Envar OMPX_MinHostToHostAsyncCopySize;
1863
+
1860
1864
/// Arguments for the callback function.
1861
1865
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
1862
1866
@@ -2281,6 +2285,14 @@ struct AMDGPUStreamTy {
2281
2285
return Err;
2282
2286
}
2283
2287
2288
+ if (CopySize < OMPX_MinHostToHostAsyncCopySize) {
2289
+ if (auto Err =
2290
+ OutputSignals[0 ]->wait (StreamBusyWaitMicroseconds, &Device))
2291
+ return Err;
2292
+ std::memcpy (Dst, Inter, CopySize);
2293
+ return Error::success ();
2294
+ }
2295
+
2284
2296
// Consume another stream slot and compute dependencies.
2285
2297
std::tie (Curr, InputSignal) = consume (OutputSignals[1 ]);
2286
2298
assert (InputSignal && " Invalid input signal" );
@@ -4679,7 +4691,9 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
4679
4691
Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
4680
4692
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
4681
4693
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
4682
- UseSyncCopyBack(Device.syncCopyBack()) {}
4694
+ UseSyncCopyBack(Device.syncCopyBack()),
4695
+ OMPX_MinHostToHostAsyncCopySize(
4696
+ " LIBOMPTARGET_AMDGPU_MIN_HOST_TO_HOST_ASYNC_COPY_SIZE" , 2048 ) {}
4683
4697
4684
4698
/// Class implementing the AMDGPU-specific functionalities of the global
4685
4699
/// handler.
0 commit comments