45
45
│ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
46
46
└──────────┴────────────────────────────────────────────────┴─────────┘
47
47
48
- ┌───────────────────┬──────────────────────────────┐
49
- Prefix │Reset signal event │ Barrier waiting on wait event│
50
- └───────────────────┴──────────────────────────────┘
48
+ ┌───────────────────┬──────────────┐──────────────────────────────┐
49
+ Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
50
+ └───────────────────┴──────────────┘──────────────────────────────┘
51
51
52
52
┌─────────────────────────────────────────────┐──────────────┐
53
- Suffix │Barrier waiting on sync-point event, │ Reset events │
54
- │signalling the UR command-buffer signal event│ │
53
+ Suffix │Barrier waiting on sync-point event, │ Query CMD │
54
+ │signalling the UR command-buffer signal event│ Timestamps │
55
55
└─────────────────────────────────────────────┘──────────────┘
56
56
57
57
For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
@@ -431,6 +431,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
431
431
432
432
ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
433
433
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
434
+ // Dependencies between commands are explicitly enforced by sync points when
435
+ // enqueuing. Consequently, relaxing the command ordering in the command list
436
+ // can enable the backend to further optimize the workload
437
+ ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
434
438
435
439
ze_command_list_handle_t ZeCommandList;
436
440
// TODO We could optimize this by pooling both Level Zero command-lists and UR
@@ -491,18 +495,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
491
495
(CommandBuffer->ZeCommandList , CommandBuffer->SignalEvent ->ZeEvent ,
492
496
NumEvents, WaitEventList.data ()));
493
497
494
- // Reset the wait-event for the UR command-buffer that is signalled when its
495
- // submission dependencies have been satisfied.
496
- ZE2UR_CALL (zeCommandListAppendEventReset,
497
- (CommandBuffer->ZeCommandList , CommandBuffer->WaitEvent ->ZeEvent ));
498
-
499
- // Reset the L0 events we use for command-buffer internal sync-points to the
500
- // non-signalled state
501
- for (auto Event : WaitEventList) {
502
- ZE2UR_CALL (zeCommandListAppendEventReset,
503
- (CommandBuffer->ZeCommandList , Event));
504
- }
505
-
506
498
// Close the command list and have it ready for dispatch.
507
499
ZE2UR_CALL (zeCommandListClose, (CommandBuffer->ZeCommandList ));
508
500
return UR_RESULT_SUCCESS;
@@ -876,6 +868,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
876
868
877
869
// Create a command-list that executes before `CommandListPtr` and signals
878
870
// when `EventWaitList` dependencies are complete.
871
+ ur_command_list_ptr_t WaitCommandList{};
872
+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList, false ,
873
+ false ));
874
+
875
+ // Create a list of all the events that compose the command buffer
876
+ // workload.
877
+ // This loop also resets the L0 events we use for command-buffer internal
878
+ // sync-points to the non-signalled state.
879
+ // This is required for multiple submissions.
880
+ const size_t NumEvents = CommandBuffer->SyncPoints .size ();
881
+ std::vector<ze_event_handle_t > WaitEventList{NumEvents};
882
+ for (size_t i = 0 ; i < NumEvents; i++) {
883
+ auto ZeEvent = CommandBuffer->SyncPoints [i]->ZeEvent ;
884
+ WaitEventList[i] = ZeEvent;
885
+ ZE2UR_CALL (zeCommandListAppendEventReset,
886
+ (WaitCommandList->first , ZeEvent));
887
+ }
888
+
879
889
bool MustSignalWaitEvent = true ;
880
890
if (NumEventsInWaitList) {
881
891
_ur_ze_event_list_t TmpWaitList;
@@ -890,10 +900,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
890
900
CommandBuffer->WaitEvent ->WaitList .insert (TmpWaitList);
891
901
892
902
if (!CommandBuffer->WaitEvent ->WaitList .isEmpty ()) {
893
- ur_command_list_ptr_t WaitCommandList{};
894
- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
895
- false , false ))
896
-
897
903
ZE2UR_CALL (zeCommandListAppendBarrier,
898
904
(WaitCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ,
899
905
CommandBuffer->WaitEvent ->WaitList .Length ,
@@ -916,22 +922,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
916
922
917
923
// Execution event for this enqueue of the UR command-buffer
918
924
ur_event_handle_t RetEvent{};
919
- if (Event) {
920
- // Create a command-list to signal RetEvent on completion
921
- ur_command_list_ptr_t SignalCommandList{};
922
- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, SignalCommandList,
923
- false , false ));
924
925
926
+ // Create a command-list to signal RetEvent on completion
927
+ ur_command_list_ptr_t SignalCommandList{};
928
+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, SignalCommandList,
929
+ false , false ));
930
+ // Reset the wait-event for the UR command-buffer that is signalled when its
931
+ // submission dependencies have been satisfied.
932
+ ZE2UR_CALL (zeCommandListAppendEventReset,
933
+ (SignalCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
934
+
935
+ if (Event) {
925
936
UR_CALL (createEventAndAssociateQueue (Queue, &RetEvent,
926
937
UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
927
938
SignalCommandList, false , true ));
928
939
929
- ZE2UR_CALL (zeCommandListAppendBarrier,
930
- (SignalCommandList->first , RetEvent->ZeEvent , 1 ,
931
- &(CommandBuffer->SignalEvent ->ZeEvent )));
932
- Queue->executeCommandList (SignalCommandList, false , false );
940
+ if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
941
+ // Multiple submissions of a command buffer imply that we need to save
942
+ // the event timestamps before resubmitting the command buffer. We
943
+ // therefore copy these timestamps in a dedicated USM memory section
944
+ // before completing the command buffer execution, and then attach this
945
+ // memory to the event returned to users to allow the profiling
946
+ // engine to recover these timestamps.
947
+ command_buffer_profiling_t *Profiling = new command_buffer_profiling_t ();
948
+
949
+ Profiling->NumEvents = WaitEventList.size ();
950
+ Profiling->Timestamps =
951
+ new ze_kernel_timestamp_result_t [Profiling->NumEvents ];
952
+
953
+ ZE2UR_CALL (zeCommandListAppendQueryKernelTimestamps,
954
+ (SignalCommandList->first , WaitEventList.size (),
955
+ WaitEventList.data (), (void *)Profiling->Timestamps , 0 ,
956
+ RetEvent->ZeEvent , 1 ,
957
+ &(CommandBuffer->SignalEvent ->ZeEvent )));
958
+
959
+ RetEvent->CommandData = static_cast <void *>(Profiling);
960
+ } else {
961
+ ZE2UR_CALL (zeCommandListAppendBarrier,
962
+ (SignalCommandList->first , RetEvent->ZeEvent , 1 ,
963
+ &(CommandBuffer->SignalEvent ->ZeEvent )));
964
+ }
933
965
}
934
966
967
+ Queue->executeCommandList (SignalCommandList, false , false );
968
+
935
969
if (Event) {
936
970
*Event = RetEvent;
937
971
}
0 commit comments