@@ -711,7 +711,6 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
711
711
private:
712
712
/// The executable loaded on the agent.
713
713
hsa_executable_t Executable;
714
- hsa_code_object_t CodeObject;
715
714
#if SANITIZER_AMDGPU
716
715
hsa_code_object_reader_t CodeObjectReader;
717
716
#endif
@@ -1715,8 +1714,8 @@ struct AMDGPUStreamTy {
1715
1714
1716
1715
/// Create an empty slot.
1717
1716
StreamSlotTy ()
1718
- : Signal(nullptr ), Callbacks({}), ActionArgs({} ),
1719
- OmptActionFunction ( nullptr ) {}
1717
+ : Signal(nullptr ), Callbacks({}), OmptActionFunction( nullptr ),
1718
+ ActionArgs ({} ) {}
1720
1719
1721
1720
/// Schedule a host memory copy action on the slot.
1722
1721
Error schedHostMemoryCopy (void *Dst, const void *Src, size_t Size) {
@@ -1864,6 +1863,10 @@ struct AMDGPUStreamTy {
1864
1863
/// Use synchronous copy back.
1865
1864
bool UseSyncCopyBack;
1866
1865
1866
+ /// When copying data from one host buffer to another, only do it
1867
+ /// asynchronously if `MinHostToHostAsyncCopySize <= size`.
1868
+ UInt32Envar OMPX_MinHostToHostAsyncCopySize;
1869
+
1867
1870
/// Arguments for the callback function.
1868
1871
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
1869
1872
@@ -2031,8 +2034,8 @@ struct AMDGPUStreamTy {
2031
2034
assert (Args->Signal &&
2032
2035
" Invalid AMDGPUSignal Pointer in post kernel run processing" );
2033
2036
hsa_amd_profiling_dispatch_time_t TimeRec;
2034
- hsa_status_t Status = hsa_amd_profiling_get_dispatch_time (
2035
- Args-> Agent , Args-> Signal -> get (), &TimeRec);
2037
+ hsa_amd_profiling_get_dispatch_time (Args-> Agent , Args-> Signal -> get (),
2038
+ &TimeRec);
2036
2039
2037
2040
uint64_t StartTime = TimeRec.start * Args->TicksToTime ;
2038
2041
uint64_t EndTime = TimeRec.end * Args->TicksToTime ;
@@ -2288,6 +2291,14 @@ struct AMDGPUStreamTy {
2288
2291
return Err;
2289
2292
}
2290
2293
2294
+ if (CopySize < OMPX_MinHostToHostAsyncCopySize) {
2295
+ if (auto Err =
2296
+ OutputSignals[0 ]->wait (StreamBusyWaitMicroseconds, &Device))
2297
+ return Err;
2298
+ std::memcpy (Dst, Inter, CopySize);
2299
+ return Error::success ();
2300
+ }
2301
+
2291
2302
// Consume another stream slot and compute dependencies.
2292
2303
std::tie (Curr, InputSignal) = consume (OutputSignals[1 ]);
2293
2304
assert (InputSignal && " Invalid input signal" );
@@ -2919,7 +2930,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2919
2930
" OMPX_ENABLE_GFX90A_COARSE_GRAIN_SHARED_ALLOC" , false ),
2920
2931
OMPX_StrictSanityChecks (" OMPX_STRICT_SANITY_CHECKS" , false ),
2921
2932
OMPX_SyncCopyBack (" LIBOMPTARGET_SYNC_COPY_BACK" , true ),
2922
- OMPX_APUPrefaultMemcopy (" LIBOMPTARGET_APU_PREFAULT_MEMCOPY" , " true" ),
2933
+ OMPX_APUPrefaultMemcopy (" LIBOMPTARGET_APU_PREFAULT_MEMCOPY" , true ),
2923
2934
OMPX_APUPrefaultMemcopySize (" LIBOMPTARGET_APU_PREFAULT_MEMCOPY_SIZE" ,
2924
2935
1 * 1024 * 1024 ), // 1MB
2925
2936
OMPX_DGPUMaps (" OMPX_DGPU_MAPS" , false ),
@@ -3892,6 +3903,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3892
3903
case HSA_DEVICE_TYPE_DSP:
3893
3904
TmpCharPtr = " DSP" ;
3894
3905
break ;
3906
+ case HSA_DEVICE_TYPE_AIE:
3907
+ TmpCharPtr = " AIE" ;
3908
+ break ;
3895
3909
}
3896
3910
Info.add (" Device Type" , TmpCharPtr);
3897
3911
}
@@ -4683,7 +4697,9 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
4683
4697
Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
4684
4698
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
4685
4699
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
4686
- UseSyncCopyBack(Device.syncCopyBack()) {}
4700
+ UseSyncCopyBack(Device.syncCopyBack()),
4701
+ OMPX_MinHostToHostAsyncCopySize(
4702
+ " LIBOMPTARGET_AMDGPU_MIN_HOST_TO_HOST_ASYNC_COPY_SIZE" , 2048 ) {}
4687
4703
4688
4704
/// Class implementing the AMDGPU-specific functionalities of the global
4689
4705
/// handler.
@@ -5066,7 +5082,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
5066
5082
if (LaunchParams.Size )
5067
5083
std::memcpy (AllArgs, LaunchParams.Data , LaunchParams.Size );
5068
5084
5069
- uint64_t Buffer = 0 ;
5070
5085
AMDGPUDeviceTy &AMDGPUDevice = static_cast <AMDGPUDeviceTy &>(GenericDevice);
5071
5086
AMDGPUStreamTy *Stream = nullptr ;
5072
5087
if (auto Err = AMDGPUDevice.getStream (AsyncInfoWrapper, Stream))
@@ -5117,7 +5132,7 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
5117
5132
// This line should print exactly as the one in the old plugin.
5118
5133
fprintf (
5119
5134
stderr,
5120
- " DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4luX %4d) "
5135
+ " DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX %4d) "
5121
5136
" reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u agpr_count:%u "
5122
5137
" sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5123
5138
" md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
@@ -5310,8 +5325,8 @@ static std::pair<uint64_t, uint64_t>
5310
5325
getKernelStartAndEndTime (const OmptKernelTimingArgsAsyncTy *Args) {
5311
5326
assert (Args->Signal && " Invalid AMDGPUSignal Pointer in OMPT profiling" );
5312
5327
hsa_amd_profiling_dispatch_time_t TimeRec;
5313
- hsa_status_t Status = hsa_amd_profiling_get_dispatch_time (
5314
- Args-> Agent , Args-> Signal -> get (), &TimeRec);
5328
+ hsa_amd_profiling_get_dispatch_time (Args-> Agent , Args-> Signal -> get (),
5329
+ &TimeRec);
5315
5330
5316
5331
uint64_t StartTime = TimeRec.start * Args->TicksToTime ;
5317
5332
uint64_t EndTime = TimeRec.end * Args->TicksToTime ;
@@ -5323,8 +5338,7 @@ static std::pair<uint64_t, uint64_t>
5323
5338
getCopyStartAndEndTime (const OmptKernelTimingArgsAsyncTy *Args) {
5324
5339
assert (Args->Signal && " Invalid AMDGPUSignal Pointer in OMPT profiling" );
5325
5340
hsa_amd_profiling_async_copy_time_t TimeRec;
5326
- hsa_status_t Status =
5327
- hsa_amd_profiling_get_async_copy_time (Args->Signal ->get (), &TimeRec);
5341
+ hsa_amd_profiling_get_async_copy_time (Args->Signal ->get (), &TimeRec);
5328
5342
uint64_t StartTime = TimeRec.start * Args->TicksToTime ;
5329
5343
uint64_t EndTime = TimeRec.end * Args->TicksToTime ;
5330
5344
0 commit comments