10
10
#include " command_buffer.hpp"
11
11
#include " ur_level_zero.hpp"
12
12
13
- /* Command-buffer Extension
14
-
15
- The UR interface for submitting a UR command-buffer takes a list
16
- of events to wait on, and returns an event representing the completion of
17
- that particular submission of the command-buffer.
18
-
19
- However, in `zeCommandQueueExecuteCommandLists` there are no parameters to
20
- take a waitlist and also the only sync primitive returned is to block on
21
- host.
22
-
23
- In order to get the UR command-buffer enqueue semantics we want with L0
24
- this adapter adds extra commands to the L0 command-list representing a
25
- UR command-buffer.
26
-
27
- Prefix - Commands added to the start of the L0 command-list by L0 adapter.
28
- Suffix - Commands added to the end of the L0 command-list by L0 adapter.
29
-
30
- These extra commands operate on L0 event synchronisation primitives used by
31
- the command-list to interact with the external UR wait-list and UR return
32
- event required for the enqueue interface.
33
-
34
- The `ur_exp_command_buffer_handle_t` class for this adapter contains a
35
- SignalEvent which signals the completion of the command-list in the suffix,
36
- and is reset in the prefix. This signal is detected by a new UR return event
37
- created on UR command-buffer enqueue.
38
-
39
- There is also a WaitEvent used by the `ur_exp_command_buffer_handle_t` class
40
- in the prefix to wait on any dependencies passed in the enqueue wait-list.
41
- This WaitEvent is reset at the end of the suffix, along with reset commands
42
- to reset the L0 events used to implement the UR sync-points.
43
-
44
- ┌──────────┬────────────────────────────────────────────────┬─────────┐
45
- │ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
46
- └──────────┴────────────────────────────────────────────────┴─────────┘
47
-
48
- ┌───────────────────┬──────────────┐──────────────────────────────┐
49
- Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
50
- └───────────────────┴──────────────┘──────────────────────────────┘
51
-
52
- ┌─────────────────────────────────────────────┐──────────────┐
53
- Suffix │Barrier waiting on sync-point event, │ Query CMD │
54
- │signaling the UR command-buffer signal event │ Timestamps │
55
- └─────────────────────────────────────────────┘──────────────┘
56
-
57
- For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
58
- command-buffer `CB`, and return event `RE` our implementation has to create
59
- and submit two new command-lists for the above approach to work. One before
60
- the command-list with extra commands associated with `CB`, and the other
61
- after `CB`.
62
-
63
- Command-list created on `urCommandBufferEnqueueExp` to execute before `CB`:
64
- ┌───────────────────────────────────────────────────────────┐
65
- │Barrier on `EL` that signals `CB` WaitEvent when completed │
66
- └───────────────────────────────────────────────────────────┘
67
-
68
- Command-list created on `urCommandBufferEnqueueExp` to execute after `CB`:
69
- ┌─────────────────────────────────────────────────────────────┐
70
- │Barrier on `CB` SignalEvent that signals `RE` when completed │
71
- └─────────────────────────────────────────────────────────────┘
72
-
73
- Drawbacks
74
- ---------
75
-
76
- There are two drawbacks to this approach:
77
-
78
- 1. We use 3x the command-list resources, if there are many UR command-buffers
79
- in flight, this may exhaust L0 driver resources.
80
-
81
- 2. Each command list is submitted individually with a
82
- `ur_queue_handle_t_::executeCommandList` call which introduces serialization in
83
- the submission pipeline that is heavier than having a barrier or a
84
- waitForEvents on the same list. Resulting in additional latency when executing
85
- graphs.
86
-
13
+ /* L0 Command-buffer Extension Doc see:
14
+ https://github.com/intel/llvm/blob/sycl/sycl/doc/design/CommandGraph.md#level-zero
87
15
*/
88
16
89
17
ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_ (
90
18
ur_context_handle_t Context, ur_device_handle_t Device,
91
19
ze_command_list_handle_t CommandList,
20
+ ze_command_list_handle_t CommandListResetEvents,
92
21
ZeStruct<ze_command_list_desc_t > ZeDesc,
93
22
const ur_exp_command_buffer_desc_t *Desc)
94
23
: Context(Context), Device(Device), ZeCommandList(CommandList),
24
+ ZeCommandListResetEvents(CommandListResetEvents),
95
25
ZeCommandListDesc(ZeDesc), ZeFencesList(), QueueProperties(),
96
26
SyncPoints(), NextSyncPoint(0 ) {
97
27
(void )Desc;
@@ -114,6 +44,12 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
114
44
ZE_CALL_NOCHECK (zeCommandListDestroy, (ZeCommandList));
115
45
}
116
46
47
+ // Release the memory allocated to the CommandListResetEvents stored in the
48
+ // command_buffer
49
+ if (ZeCommandListResetEvents) {
50
+ ZE_CALL_NOCHECK (zeCommandListDestroy, (ZeCommandListResetEvents));
51
+ }
52
+
117
53
// Release additional signal and wait events used by command_buffer
118
54
if (SignalEvent) {
119
55
CleanupCompletedEvent (SignalEvent, false );
@@ -123,6 +59,10 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
123
59
CleanupCompletedEvent (WaitEvent, false );
124
60
urEventReleaseInternal (WaitEvent);
125
61
}
62
+ if (AllResetEvent) {
63
+ CleanupCompletedEvent (AllResetEvent, false );
64
+ urEventReleaseInternal (AllResetEvent);
65
+ }
126
66
127
67
// Release events added to the command_buffer
128
68
for (auto &Sync : SyncPoints) {
@@ -434,6 +374,13 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
434
374
435
375
ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
436
376
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
377
+
378
+ ze_command_list_handle_t ZeCommandListResetEvents;
379
+ // Create a command-list for resetting the events associated to enqueued cmd.
380
+ ZE2UR_CALL (zeCommandListCreate,
381
+ (Context->ZeContext , Device->ZeDevice , &ZeCommandListDesc,
382
+ &ZeCommandListResetEvents));
383
+
437
384
// Dependencies between commands are explicitly enforced by sync points when
438
385
// enqueuing. Consequently, relax the command ordering in the command list
439
386
// can enable the backend to further optimize the workload
@@ -446,7 +393,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
446
393
&ZeCommandListDesc, &ZeCommandList));
447
394
try {
448
395
*CommandBuffer = new ur_exp_command_buffer_handle_t_ (
449
- Context, Device, ZeCommandList, ZeCommandListDesc, CommandBufferDesc);
396
+ Context, Device, ZeCommandList, ZeCommandListResetEvents,
397
+ ZeCommandListDesc, CommandBufferDesc);
450
398
} catch (const std::bad_alloc &) {
451
399
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
452
400
} catch (...) {
@@ -460,13 +408,19 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
460
408
&RetCommandBuffer->SignalEvent ));
461
409
UR_CALL (EventCreate (Context, nullptr , false , false ,
462
410
&RetCommandBuffer->WaitEvent ));
411
+ UR_CALL (EventCreate (Context, nullptr , false , false ,
412
+ &RetCommandBuffer->AllResetEvent ));
463
413
464
414
// Add prefix commands
465
- ZE2UR_CALL (zeCommandListAppendEventReset,
466
- (ZeCommandList, RetCommandBuffer->SignalEvent ->ZeEvent ));
415
+ ZE2UR_CALL (
416
+ zeCommandListAppendEventReset,
417
+ (ZeCommandListResetEvents, RetCommandBuffer->SignalEvent ->ZeEvent ));
418
+ std::vector<ze_event_handle_t > PrecondEvents = {
419
+ RetCommandBuffer->WaitEvent ->ZeEvent ,
420
+ RetCommandBuffer->AllResetEvent ->ZeEvent };
467
421
ZE2UR_CALL (
468
422
zeCommandListAppendBarrier,
469
- (ZeCommandList, nullptr , 1 , &RetCommandBuffer-> WaitEvent -> ZeEvent ));
423
+ (ZeCommandList, nullptr , PrecondEvents. size (), PrecondEvents. data () ));
470
424
return UR_RESULT_SUCCESS;
471
425
}
472
426
@@ -488,20 +442,29 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) {
488
442
UR_APIEXPORT ur_result_t UR_APICALL
489
443
urCommandBufferFinalizeExp (ur_exp_command_buffer_handle_t CommandBuffer) {
490
444
// Create a list of events for our signal event to wait on
445
+ // This loop also resets the L0 events we use for command-buffer internal
446
+ // sync-points to the non-signaled state.
447
+ // This is required for multiple submissions.
491
448
const size_t NumEvents = CommandBuffer->SyncPoints .size ();
492
- std::vector<ze_event_handle_t > WaitEventList{NumEvents};
493
449
for (size_t i = 0 ; i < NumEvents; i++) {
494
- WaitEventList[i] = CommandBuffer->SyncPoints [i]->ZeEvent ;
450
+ auto ZeEvent = CommandBuffer->SyncPoints [i]->ZeEvent ;
451
+ CommandBuffer->ZeEventsList .push_back (ZeEvent);
452
+ ZE2UR_CALL (zeCommandListAppendEventReset,
453
+ (CommandBuffer->ZeCommandListResetEvents , ZeEvent));
495
454
}
455
+ ZE2UR_CALL (zeCommandListAppendSignalEvent,
456
+ (CommandBuffer->ZeCommandListResetEvents ,
457
+ CommandBuffer->AllResetEvent ->ZeEvent ));
496
458
497
459
// Wait for all the user added commands to complete, and signal the
498
460
// command-buffer signal-event when they are done.
499
461
ZE2UR_CALL (zeCommandListAppendBarrier,
500
462
(CommandBuffer->ZeCommandList , CommandBuffer->SignalEvent ->ZeEvent ,
501
- NumEvents, WaitEventList .data ()));
463
+ NumEvents, CommandBuffer-> ZeEventsList .data ()));
502
464
503
- // Close the command list and have it ready for dispatch.
465
+ // Close the command lists and have them ready for dispatch.
504
466
ZE2UR_CALL (zeCommandListClose, (CommandBuffer->ZeCommandList ));
467
+ ZE2UR_CALL (zeCommandListClose, (CommandBuffer->ZeCommandListResetEvents ));
505
468
return UR_RESULT_SUCCESS;
506
469
}
507
470
@@ -875,26 +838,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
875
838
ZE2UR_CALL (zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
876
839
CommandBuffer->ZeFencesList .push_back (ZeFence);
877
840
878
- // Create command-list to execute before `CommandListPtr` and will signal
879
- // when `EventWaitList` dependencies are complete.
880
- ur_command_list_ptr_t WaitCommandList{};
881
- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList, false ,
882
- false ));
883
-
884
- // Create a list of events of all the events that compose the command buffer
885
- // workload.
886
- // This loop also resets the L0 events we use for command-buffer internal
887
- // sync-points to the non-signaled state.
888
- // This is required for multiple submissions.
889
- const size_t NumEvents = CommandBuffer->SyncPoints .size ();
890
- std::vector<ze_event_handle_t > WaitEventList{NumEvents};
891
- for (size_t i = 0 ; i < NumEvents; i++) {
892
- auto ZeEvent = CommandBuffer->SyncPoints [i]->ZeEvent ;
893
- WaitEventList[i] = ZeEvent;
894
- ZE2UR_CALL (zeCommandListAppendEventReset,
895
- (WaitCommandList->first , ZeEvent));
896
- }
897
-
898
841
bool MustSignalWaitEvent = true ;
899
842
if (NumEventsInWaitList) {
900
843
_ur_ze_event_list_t TmpWaitList;
@@ -909,18 +852,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
909
852
CommandBuffer->WaitEvent ->WaitList .insert (TmpWaitList);
910
853
911
854
if (!CommandBuffer->WaitEvent ->WaitList .isEmpty ()) {
855
+ // Create command-list to execute before `CommandListPtr` and will signal
856
+ // when `EventWaitList` dependencies are complete.
857
+ ur_command_list_ptr_t WaitCommandList{};
858
+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
859
+ false , false ));
860
+
912
861
ZE2UR_CALL (zeCommandListAppendBarrier,
913
862
(WaitCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ,
914
863
CommandBuffer->WaitEvent ->WaitList .Length ,
915
864
CommandBuffer->WaitEvent ->WaitList .ZeEventList ));
865
+ Queue->executeCommandList (WaitCommandList, false , false );
916
866
MustSignalWaitEvent = false ;
917
867
}
918
868
}
919
869
if (MustSignalWaitEvent) {
920
- ZE2UR_CALL (zeCommandListAppendSignalEvent,
921
- (WaitCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
870
+ ZE2UR_CALL (zeEventHostSignal, (CommandBuffer->WaitEvent ->ZeEvent ));
922
871
}
923
- Queue->executeCommandList (WaitCommandList, false , false );
872
+
873
+ // Submit reset events command-list. This command-list is of a batch
874
+ // command-list type, regardless of the UR Queue type. We therefore need to
875
+ // submit the list directly using the Level-Zero API to avoid type mismatches
876
+ // if using UR functions.
877
+ ZE2UR_CALL (
878
+ zeCommandQueueExecuteCommandLists,
879
+ (ZeCommandQueue, 1 , &CommandBuffer->ZeCommandListResetEvents , nullptr ));
924
880
925
881
// Submit main command-list. This command-list is of a batch command-list
926
882
// type, regardless of the UR Queue type. We therefore need to submit the list
@@ -940,6 +896,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
940
896
// submission dependencies have been satisfied.
941
897
ZE2UR_CALL (zeCommandListAppendEventReset,
942
898
(SignalCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
899
+ // Reset the all-reset-event for the UR command-buffer that is signaled when
900
+ // all events of the main command-list have been reset.
901
+ ZE2UR_CALL (zeCommandListAppendEventReset,
902
+ (SignalCommandList->first , CommandBuffer->AllResetEvent ->ZeEvent ));
943
903
944
904
if (Event) {
945
905
UR_CALL (createEventAndAssociateQueue (
@@ -955,14 +915,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
955
915
// engine to recover these timestamps.
956
916
command_buffer_profiling_t *Profiling = new command_buffer_profiling_t ();
957
917
958
- Profiling->NumEvents = WaitEventList .size ();
918
+ Profiling->NumEvents = CommandBuffer-> ZeEventsList .size ();
959
919
Profiling->Timestamps =
960
920
new ze_kernel_timestamp_result_t [Profiling->NumEvents ];
961
921
962
922
ZE2UR_CALL (zeCommandListAppendQueryKernelTimestamps,
963
- (SignalCommandList->first , WaitEventList .size (),
964
- WaitEventList .data (), ( void *)Profiling-> Timestamps , 0 ,
965
- RetEvent->ZeEvent , 1 ,
923
+ (SignalCommandList->first , CommandBuffer-> ZeEventsList .size (),
924
+ CommandBuffer-> ZeEventsList .data (),
925
+ ( void *)Profiling-> Timestamps , 0 , RetEvent->ZeEvent , 1 ,
966
926
&(CommandBuffer->SignalEvent ->ZeEvent )));
967
927
968
928
RetEvent->CommandData = static_cast <void *>(Profiling);
0 commit comments