Skip to content

Commit 78a483c

Browse files
[SYCL][L0] allow use of sub-devices with their root-device context (#6538)
1 parent 2e86cd4 commit 78a483c

File tree

2 files changed

+84
-83
lines changed

2 files changed

+84
-83
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 73 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -732,14 +732,13 @@ pi_result _pi_device::initialize(int SubSubDeviceOrdinal,
732732
}
733733
}
734734

735-
// Reinitialize a sub-sub-device with its own ordinal, index and numQueues
735+
// Reinitialize a sub-sub-device with its own ordinal, index.
736736
// Our sub-sub-device representation is currently [Level-Zero sub-device
737-
// handle + Level-Zero compute group/engine index]. As we have a single queue
738-
// per device, we need to reinitialize numQueues in ZeProperties to be 1.
737+
// handle + Level-Zero compute group/engine index]. Only the queue at the
738+
// specified index will be used to submit work to the sub-sub-device.
739739
if (SubSubDeviceOrdinal >= 0) {
740740
QueueGroup[queue_group_info_t::Compute].ZeOrdinal = SubSubDeviceOrdinal;
741741
QueueGroup[queue_group_info_t::Compute].ZeIndex = SubSubDeviceIndex;
742-
QueueGroup[queue_group_info_t::Compute].ZeProperties.numQueues = 1;
743742
} else { // Proceed with initialization for root and sub-device
744743
// How is it possible that there are no "compute" capabilities?
745744
if (QueueGroup[queue_group_info_t::Compute].ZeOrdinal < 0) {
@@ -862,6 +861,50 @@ pi_device _pi_context::getRootDevice() const {
862861
}
863862

864863
pi_result _pi_context::initialize() {
864+
865+
// Helper lambda to create various USM allocators for a device.
866+
auto createUSMAllocators = [this](pi_device Device) {
867+
SharedMemAllocContexts.emplace(
868+
std::piecewise_construct, std::make_tuple(Device),
869+
std::make_tuple(std::unique_ptr<SystemMemory>(
870+
new USMSharedMemoryAlloc(this, Device))));
871+
SharedReadOnlyMemAllocContexts.emplace(
872+
std::piecewise_construct, std::make_tuple(Device),
873+
std::make_tuple(std::unique_ptr<SystemMemory>(
874+
new USMSharedReadOnlyMemoryAlloc(this, Device))));
875+
DeviceMemAllocContexts.emplace(
876+
std::piecewise_construct, std::make_tuple(Device),
877+
std::make_tuple(std::unique_ptr<SystemMemory>(
878+
new USMDeviceMemoryAlloc(this, Device))));
879+
};
880+
881+
// Recursive helper to call createUSMAllocators for all sub-devices
882+
std::function<void(pi_device)> createUSMAllocatorsRecursive;
883+
createUSMAllocatorsRecursive =
884+
[this, createUSMAllocators,
885+
&createUSMAllocatorsRecursive](pi_device Device) -> void {
886+
createUSMAllocators(Device);
887+
for (auto &SubDevice : Device->SubDevices)
888+
createUSMAllocatorsRecursive(SubDevice);
889+
};
890+
891+
// Create USM allocator context for each pair (device, context).
892+
// Sub-devices are visited recursively so each of them gets allocators too.
893+
for (auto &Device : Devices) {
894+
createUSMAllocatorsRecursive(Device);
895+
}
896+
// Create USM allocator context for host. Device and Shared USM allocations
897+
// are device-specific. Host allocations are not device-dependent therefore
898+
// we don't need a map with device as key.
899+
HostMemAllocContext = std::make_unique<USMAllocContext>(
900+
std::unique_ptr<SystemMemory>(new USMHostMemoryAlloc(this)));
901+
902+
// We may allocate memory to this root device so create allocators.
903+
if (SingleRootDevice && DeviceMemAllocContexts.find(SingleRootDevice) ==
904+
DeviceMemAllocContexts.end()) {
905+
createUSMAllocators(SingleRootDevice);
906+
}
907+
865908
// Create the immediate command list to be used for initializations
866909
// Created as synchronous so level-zero performs implicit synchronization and
867910
// there is no need to query for completion in the plugin
@@ -1112,32 +1155,30 @@ _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
11121155
// First, see if the queue's device allows for round-robin or it is
11131156
// fixed to one particular compute CCS (it is so for sub-sub-devices).
11141157
auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute];
1158+
ComputeQueueGroup.ZeQueues = ComputeQueues;
11151159
if (ComputeQueueGroupInfo.ZeIndex >= 0) {
11161160
ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex;
11171161
ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex;
11181162
ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex;
11191163
} else {
1120-
ComputeQueueGroup.LowerIndex = 0;
1121-
ComputeQueueGroup.UpperIndex = INT_MAX;
1122-
ComputeQueueGroup.NextIndex = 0;
1123-
}
1124-
1125-
uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines.first;
1126-
uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines.second;
1127-
FilterUpperIndex =
1128-
std::min((size_t)FilterUpperIndex, ComputeQueues.size() - 1);
1129-
if (FilterLowerIndex <= FilterUpperIndex) {
1130-
ComputeQueueGroup.ZeQueues = ComputeQueues;
1131-
ComputeQueueGroup.LowerIndex = FilterLowerIndex;
1132-
ComputeQueueGroup.UpperIndex = FilterUpperIndex;
1133-
ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex;
1134-
// Create space to hold immediate commandlists corresponding to the ZeQueues
1135-
if (UseImmediateCommandLists) {
1136-
ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
1137-
ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
1164+
// Set-up to round-robin across allowed range of engines.
1165+
uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines.first;
1166+
uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines.second;
1167+
FilterUpperIndex = std::min((size_t)FilterUpperIndex,
1168+
FilterLowerIndex + ComputeQueues.size() - 1);
1169+
if (FilterLowerIndex <= FilterUpperIndex) {
1170+
ComputeQueueGroup.LowerIndex = FilterLowerIndex;
1171+
ComputeQueueGroup.UpperIndex = FilterUpperIndex;
1172+
ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex;
1173+
// Create space to hold immediate commandlists corresponding to the
1174+
// ZeQueues
1175+
if (UseImmediateCommandLists) {
1176+
ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
1177+
ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
1178+
}
1179+
} else {
1180+
die("No compute queue available/allowed.");
11381181
}
1139-
} else {
1140-
die("No compute queue available.");
11411182
}
11421183

11431184
// Copy group initialization.
@@ -1148,8 +1189,8 @@ _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
11481189
} else {
11491190
uint32_t FilterLowerIndex = getRangeOfAllowedCopyEngines.first;
11501191
uint32_t FilterUpperIndex = getRangeOfAllowedCopyEngines.second;
1151-
FilterUpperIndex =
1152-
std::min((size_t)FilterUpperIndex, CopyQueues.size() - 1);
1192+
FilterUpperIndex = std::min((size_t)FilterUpperIndex,
1193+
FilterLowerIndex + CopyQueues.size() - 1);
11531194
if (FilterLowerIndex <= FilterUpperIndex) {
11541195
CopyQueueGroup.ZeQueues = CopyQueues;
11551196
CopyQueueGroup.LowerIndex = FilterLowerIndex;
@@ -3410,11 +3451,7 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
34103451
PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
34113452
PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
34123453
PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
3413-
3414-
if (std::find(Context->Devices.begin(), Context->Devices.end(), Device) ==
3415-
Context->Devices.end()) {
3416-
return PI_ERROR_INVALID_DEVICE;
3417-
}
3454+
PI_ASSERT(Context->isValidDevice(Device), PI_ERROR_INVALID_DEVICE);
34183455

34193456
// Create placeholder queues in the compute queue group.
34203457
// Actual L0 queues will be created at first use.
@@ -4196,11 +4233,7 @@ pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
41964233
pi_device Device = nullptr;
41974234
if (ZeDevice) {
41984235
Device = Context->getPlatform()->getDeviceFromNativeHandle(ZeDevice);
4199-
// Check that the device is present in this context.
4200-
if (std::find(Context->Devices.begin(), Context->Devices.end(), Device) ==
4201-
Context->Devices.end()) {
4202-
return PI_ERROR_INVALID_CONTEXT;
4203-
}
4236+
PI_ASSERT(Context->isValidDevice(Device), PI_ERROR_INVALID_CONTEXT);
42044237
}
42054238

42064239
try {
@@ -4469,12 +4502,7 @@ pi_result piProgramLink(pi_context Context, pi_uint32 NumDevices,
44694502

44704503
// Validate input parameters.
44714504
PI_ASSERT(DeviceList, PI_ERROR_INVALID_DEVICE);
4472-
{
4473-
auto DeviceEntry =
4474-
find(Context->Devices.begin(), Context->Devices.end(), DeviceList[0]);
4475-
if (DeviceEntry == Context->Devices.end())
4476-
return PI_ERROR_INVALID_DEVICE;
4477-
}
4505+
PI_ASSERT(Context->isValidDevice(DeviceList[0]), PI_ERROR_INVALID_DEVICE);
44784506
PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE);
44794507
if (NumInputPrograms == 0 || InputPrograms == nullptr)
44804508
return PI_ERROR_INVALID_VALUE;
@@ -4679,12 +4707,9 @@ pi_result piProgramBuild(pi_program Program, pi_uint32 NumDevices,
46794707
std::scoped_lock Guard(Program->Mutex);
46804708
// Check if device belongs to associated context.
46814709
PI_ASSERT(Program->Context, PI_ERROR_INVALID_PROGRAM);
4682-
{
4683-
auto DeviceEntry = find(Program->Context->Devices.begin(),
4684-
Program->Context->Devices.end(), DeviceList[0]);
4685-
if (DeviceEntry == Program->Context->Devices.end())
4686-
return PI_ERROR_INVALID_VALUE;
4687-
}
4710+
PI_ASSERT(Program->Context->isValidDevice(DeviceList[0]),
4711+
PI_ERROR_INVALID_VALUE);
4712+
46884713
// It is legal to build a program created from either IL or from native
46894714
// device code.
46904715
if (Program->State != _pi_program::IL &&

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 11 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -596,41 +596,6 @@ struct _pi_context : _pi_object {
596596
SingleRootDevice(getRootDevice()), ZeCommandListInit{nullptr} {
597597
// NOTE: one must additionally call initialize() to complete
598598
// PI context creation.
599-
600-
// Create USM allocator context for each pair (device, context).
601-
for (uint32_t I = 0; I < NumDevices; I++) {
602-
pi_device Device = Devs[I];
603-
SharedMemAllocContexts.emplace(
604-
std::piecewise_construct, std::make_tuple(Device),
605-
std::make_tuple(std::unique_ptr<SystemMemory>(
606-
new USMSharedMemoryAlloc(this, Device))));
607-
SharedReadOnlyMemAllocContexts.emplace(
608-
std::piecewise_construct, std::make_tuple(Device),
609-
std::make_tuple(std::unique_ptr<SystemMemory>(
610-
new USMSharedReadOnlyMemoryAlloc(this, Device))));
611-
DeviceMemAllocContexts.emplace(
612-
std::piecewise_construct, std::make_tuple(Device),
613-
std::make_tuple(std::unique_ptr<SystemMemory>(
614-
new USMDeviceMemoryAlloc(this, Device))));
615-
}
616-
// Create USM allocator context for host. Device and Shared USM allocations
617-
// are device-specific. Host allocations are not device-dependent therefore
618-
// we don't need a map with device as key.
619-
HostMemAllocContext = std::make_unique<USMAllocContext>(
620-
std::unique_ptr<SystemMemory>(new USMHostMemoryAlloc(this)));
621-
622-
// We may allocate memory to this root device so create allocators.
623-
if (SingleRootDevice && DeviceMemAllocContexts.find(SingleRootDevice) ==
624-
DeviceMemAllocContexts.end()) {
625-
SharedMemAllocContexts.emplace(
626-
std::piecewise_construct, std::make_tuple(SingleRootDevice),
627-
std::make_tuple(std::unique_ptr<SystemMemory>(
628-
new USMSharedMemoryAlloc(this, SingleRootDevice))));
629-
DeviceMemAllocContexts.emplace(
630-
std::piecewise_construct, std::make_tuple(SingleRootDevice),
631-
std::make_tuple(std::unique_ptr<SystemMemory>(
632-
new USMDeviceMemoryAlloc(this, SingleRootDevice))));
633-
}
634599
}
635600

636601
// Initialize the PI context.
@@ -657,6 +622,17 @@ struct _pi_context : _pi_object {
657622
// Therefore it can be accessed without holding a lock on this _pi_context.
658623
const std::vector<pi_device> Devices;
659624

625+
// Checks whether Device may be used with this context.
626+
// A device qualifies if it is listed in Devices, or if a device reached by
// following its RootDevice links is listed there. A null Device yields false.
627+
bool isValidDevice(pi_device Device) const {
628+
// Walk from Device upward through RootDevice until the chain ends (null).
while (Device) {
629+
if (std::find(Devices.begin(), Devices.end(), Device) != Devices.end())
630+
return true;
631+
// Not listed itself; continue the search with its RootDevice.
Device = Device->RootDevice;
632+
}
633+
// Neither Device nor anything up its RootDevice chain is in this context.
return false;
634+
}
635+
660636
// If context contains one device or sub-devices of the same device, we want
661637
// to save this device.
662638
// This field is only set at _pi_context creation time, and cannot change.

0 commit comments

Comments
 (0)