Skip to content

Commit e452bf0

Browse files
[SYCL] Support device_global in SYCLBIN kernel_bundle (#19164)
This commit adds the functionality for using device_global in SYCLBIN-based kernel_bundle, with some temporary limitations such as the names have to be without namespace qualification. --------- Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
1 parent f7f049a commit e452bf0

16 files changed

+404
-71
lines changed

sycl/source/detail/context_impl.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -340,9 +340,9 @@ void context_impl::addDeviceGlobalInitializer(
340340
}
341341
}
342342

343-
std::vector<ur_event_handle_t>
344-
context_impl::initializeDeviceGlobals(ur_program_handle_t NativePrg,
345-
queue_impl &QueueImpl) {
343+
std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
344+
ur_program_handle_t NativePrg, queue_impl &QueueImpl,
345+
detail::kernel_bundle_impl *KernelBundleImplPtr) {
346346
if (!MDeviceGlobalNotInitializedCnt.load(std::memory_order_acquire))
347347
return {};
348348

@@ -396,6 +396,12 @@ context_impl::initializeDeviceGlobals(ur_program_handle_t NativePrg,
396396
detail::ProgramManager::getInstance().getDeviceGlobalEntries(
397397
DeviceGlobalIds,
398398
/*ExcludeDeviceImageScopeDecorated=*/true);
399+
// Kernel bundles may have isolated device globals. They need to be
400+
// initialized too.
401+
if (KernelBundleImplPtr && KernelBundleImplPtr->getDeviceGlobalMap().size())
402+
KernelBundleImplPtr->getDeviceGlobalMap().getEntries(
403+
DeviceGlobalIds, /*ExcludeDeviceImageScopeDecorated=*/true,
404+
DeviceGlobalEntries);
399405

400406
// If there were no device globals without device_image_scope the device
401407
// globals are trivially fully initialized and we can end early.

sycl/source/detail/context_impl.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,8 @@ class context_impl : public std::enable_shared_from_this<context_impl> {
223223

224224
/// Initializes device globals for a program on the associated queue.
225225
std::vector<ur_event_handle_t>
226-
initializeDeviceGlobals(ur_program_handle_t NativePrg, queue_impl &QueueImpl);
226+
initializeDeviceGlobals(ur_program_handle_t NativePrg, queue_impl &QueueImpl,
227+
detail::kernel_bundle_impl *KernelBundleImplPtr);
227228

228229
void memcpyToHostOnlyDeviceGlobal(device_impl &DeviceImpl,
229230
const void *DeviceGlobalPtr,

sycl/source/detail/device_global_map.hpp

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,30 @@
1616
#include <detail/device_global_map_entry.hpp>
1717
#include <sycl/detail/defines_elementary.hpp>
1818
#include <sycl/detail/kernel_name_str_t.hpp>
19+
#include <sycl/kernel_bundle.hpp>
1920

2021
namespace sycl {
2122
inline namespace _V1 {
2223
namespace detail {
2324

2425
class DeviceGlobalMap {
2526
public:
27+
DeviceGlobalMap(bool OwnerControlledCleanup)
28+
: MOwnerControlledCleanup{OwnerControlledCleanup} {}
29+
30+
~DeviceGlobalMap() {
31+
if (!MOwnerControlledCleanup)
32+
for (auto &DeviceGlobalIt : MDeviceGlobals)
33+
DeviceGlobalIt.second->cleanup();
34+
}
35+
2636
void initializeEntries(const RTDeviceBinaryImage *Img) {
27-
const auto &DeviceGlobals = Img->getDeviceGlobals();
2837
std::lock_guard<std::mutex> DeviceGlobalsGuard(MDeviceGlobalsMutex);
38+
initializeEntriesLockless(Img);
39+
}
40+
41+
void initializeEntriesLockless(const RTDeviceBinaryImage *Img) {
42+
const auto &DeviceGlobals = Img->getDeviceGlobals();
2943
for (const sycl_device_binary_property &DeviceGlobal : DeviceGlobals) {
3044
ByteArray DeviceGlobalInfo =
3145
DeviceBinaryProperty(DeviceGlobal).asByteArray();
@@ -102,9 +116,16 @@ class DeviceGlobalMap {
102116
return Entry->second;
103117
}
104118

105-
DeviceGlobalMapEntry *tryGetEntry(const std::string &UniqueId,
106-
bool ExcludeDeviceImageScopeDecorated) {
119+
DeviceGlobalMapEntry *
120+
tryGetEntry(const std::string &UniqueId,
121+
bool ExcludeDeviceImageScopeDecorated = false) {
107122
std::lock_guard<std::mutex> DeviceGlobalsGuard(MDeviceGlobalsMutex);
123+
return tryGetEntryLockless(UniqueId, ExcludeDeviceImageScopeDecorated);
124+
}
125+
126+
DeviceGlobalMapEntry *
127+
tryGetEntryLockless(const std::string &UniqueId,
128+
bool ExcludeDeviceImageScopeDecorated = false) const {
108129
auto DeviceGlobalEntry = MDeviceGlobals.find(UniqueId);
109130
if (DeviceGlobalEntry != MDeviceGlobals.end() &&
110131
(!ExcludeDeviceImageScopeDecorated ||
@@ -113,22 +134,17 @@ class DeviceGlobalMap {
113134
return nullptr;
114135
}
115136

116-
std::vector<DeviceGlobalMapEntry *>
117-
getEntries(const std::vector<std::string> &UniqueIds,
118-
bool ExcludeDeviceImageScopeDecorated) {
119-
std::vector<DeviceGlobalMapEntry *> FoundEntries;
120-
FoundEntries.reserve(UniqueIds.size());
121-
137+
void getEntries(const std::vector<std::string> &UniqueIds,
138+
bool ExcludeDeviceImageScopeDecorated,
139+
std::vector<DeviceGlobalMapEntry *> &OutVec) {
122140
std::lock_guard<std::mutex> DeviceGlobalsGuard(MDeviceGlobalsMutex);
123141
for (const std::string &UniqueId : UniqueIds) {
124142
auto DeviceGlobalEntry = MDeviceGlobals.find(UniqueId);
125-
assert(DeviceGlobalEntry != MDeviceGlobals.end() &&
126-
"Device global not found in map.");
127-
if (!ExcludeDeviceImageScopeDecorated ||
128-
!DeviceGlobalEntry->second->MIsDeviceImageScopeDecorated)
129-
FoundEntries.push_back(DeviceGlobalEntry->second.get());
143+
if (DeviceGlobalEntry != MDeviceGlobals.end() &&
144+
(!ExcludeDeviceImageScopeDecorated ||
145+
!DeviceGlobalEntry->second->MIsDeviceImageScopeDecorated))
146+
OutVec.push_back(DeviceGlobalEntry->second.get());
130147
}
131-
return FoundEntries;
132148
}
133149

134150
const std::unordered_map<const void *, DeviceGlobalMapEntry *>
@@ -143,6 +159,12 @@ class DeviceGlobalMap {
143159
}
144160

145161
private:
162+
// Indicates whether the owner will explicitly cleanup the entries. If false
163+
// the dtor of DeviceGlobalMap will cleanup the entries.
164+
// Note: This lets the global device global map avoid overhead at shutdown and
165+
// instead let the contexts own the associated entries.
166+
bool MOwnerControlledCleanup = true;
167+
146168
// Maps between device_global identifiers and associated information.
147169
std::unordered_map<KernelNameStrT, std::unique_ptr<DeviceGlobalMapEntry>>
148170
MDeviceGlobals;

sycl/source/detail/device_global_map_entry.cpp

Lines changed: 74 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -68,22 +68,33 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) {
6868
{
6969
std::lock_guard<std::mutex> Lock(NewAlloc.MInitEventMutex);
7070
ur_event_handle_t InitEvent;
71-
// C++ guarantees members appear in memory in the order they are declared,
72-
// so since the member variable that contains the initial contents of the
73-
// device_global is right after the usm_ptr member variable we can do
74-
// some pointer arithmetic to memcopy over this value to the usm_ptr. This
75-
// value inside of the device_global will be zero-initialized if it was not
76-
// given a value on construction.
77-
78-
MemoryManager::copy_usm(reinterpret_cast<const void *>(
79-
reinterpret_cast<uintptr_t>(MDeviceGlobalPtr) +
80-
sizeof(MDeviceGlobalPtr)),
81-
QueueImpl, MDeviceGlobalTSize, NewAlloc.MPtr,
82-
std::vector<ur_event_handle_t>{}, &InitEvent);
71+
if (MDeviceGlobalPtr) {
72+
// C++ guarantees members appear in memory in the order they are declared,
73+
// so since the member variable that contains the initial contents of the
74+
// device_global is right after the usm_ptr member variable we can do
75+
// some pointer arithmetic to memcopy over this value to the usm_ptr. This
76+
// value inside of the device_global will be zero-initialized if it was
77+
// not given a value on construction.
78+
MemoryManager::copy_usm(
79+
reinterpret_cast<const void *>(
80+
reinterpret_cast<uintptr_t>(MDeviceGlobalPtr) +
81+
sizeof(MDeviceGlobalPtr)),
82+
QueueImpl, MDeviceGlobalTSize, NewAlloc.MPtr,
83+
std::vector<ur_event_handle_t>{}, &InitEvent);
84+
} else {
85+
// For SYCLBIN device globals we do not have a host pointer to copy from,
86+
// so instead we fill the USM memory with 0's.
87+
MemoryManager::fill_usm(NewAlloc.MPtr, QueueImpl, MDeviceGlobalTSize,
88+
{static_cast<unsigned char>(0)}, {}, &InitEvent);
89+
}
8390
NewAlloc.MInitEvent = InitEvent;
8491
}
8592

86-
CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr);
93+
// Only device globals with host variables need to be registered with the
94+
// context. The rest will be managed by their kernel bundles and cleaned up
95+
// accordingly.
96+
if (MDeviceGlobalPtr)
97+
CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr);
8798
return NewAlloc;
8899
}
89100

@@ -111,19 +122,32 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) {
111122
"USM allocation for device and context already happened.");
112123
DeviceGlobalUSMMem &NewAlloc = NewAllocIt.first->second;
113124

114-
// C++ guarantees members appear in memory in the order they are declared,
115-
// so since the member variable that contains the initial contents of the
116-
// device_global is right after the usm_ptr member variable we can do
117-
// some pointer arithmetic to memcopy over this value to the usm_ptr. This
118-
// value inside of the device_global will be zero-initialized if it was not
119-
// given a value on construction.
120-
MemoryManager::context_copy_usm(
121-
reinterpret_cast<const void *>(
122-
reinterpret_cast<uintptr_t>(MDeviceGlobalPtr) +
123-
sizeof(MDeviceGlobalPtr)),
124-
&CtxImpl, MDeviceGlobalTSize, NewAlloc.MPtr);
125-
126-
CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr);
125+
if (MDeviceGlobalPtr) {
126+
// C++ guarantees members appear in memory in the order they are declared,
127+
// so since the member variable that contains the initial contents of the
128+
// device_global is right after the usm_ptr member variable we can do
129+
// some pointer arithmetic to memcopy over this value to the usm_ptr. This
130+
// value inside of the device_global will be zero-initialized if it was not
131+
// given a value on construction.
132+
MemoryManager::context_copy_usm(
133+
reinterpret_cast<const void *>(
134+
reinterpret_cast<uintptr_t>(MDeviceGlobalPtr) +
135+
sizeof(MDeviceGlobalPtr)),
136+
&CtxImpl, MDeviceGlobalTSize, NewAlloc.MPtr);
137+
} else {
138+
// For SYCLBIN device globals we do not have a host pointer to copy from,
139+
// so instead we fill the USM memory with 0's.
140+
std::vector<unsigned char> ImmBuff(MDeviceGlobalTSize,
141+
static_cast<unsigned char>(0));
142+
MemoryManager::context_copy_usm(ImmBuff.data(), &CtxImpl,
143+
MDeviceGlobalTSize, NewAlloc.MPtr);
144+
}
145+
146+
// Only device globals with host variables need to be registered with the
147+
// context. The rest will be managed by their kernel bundles and cleaned up
148+
// accordingly.
149+
if (MDeviceGlobalPtr)
150+
CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr);
127151
return NewAlloc;
128152
}
129153

@@ -150,6 +174,30 @@ void DeviceGlobalMapEntry::removeAssociatedResources(
150174
}
151175
}
152176

177+
void DeviceGlobalMapEntry::cleanup() {
178+
std::lock_guard<std::mutex> Lock{MDeviceToUSMPtrMapMutex};
179+
assert(MDeviceGlobalPtr == nullptr &&
180+
"Entry has host variable, so it should be associated with a context "
181+
"and should be cleaned up by its dtor.");
182+
for (auto &USMPtrIt : MDeviceToUSMPtrMap) {
183+
// The context should be alive through the kernel_bundle owning these
184+
// device_global entries.
185+
const context_impl *CtxImpl = USMPtrIt.first.second;
186+
DeviceGlobalUSMMem &USMMem = USMPtrIt.second;
187+
detail::usm::freeInternal(USMMem.MPtr, CtxImpl);
188+
if (USMMem.MInitEvent.has_value())
189+
CtxImpl->getAdapter()->call<UrApiKind::urEventRelease>(
190+
*USMMem.MInitEvent);
191+
#ifndef NDEBUG
192+
// For debugging we set the event and memory to some recognizable values
193+
// to allow us to check that this cleanup happens before erasure.
194+
USMMem.MPtr = nullptr;
195+
USMMem.MInitEvent = {};
196+
#endif
197+
}
198+
MDeviceToUSMPtrMap.clear();
199+
}
200+
153201
} // namespace detail
154202
} // namespace _V1
155203
} // namespace sycl

sycl/source/detail/device_global_map_entry.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,13 @@ struct DeviceGlobalMapEntry {
120120
// Removes resources for device_globals associated with the context.
121121
void removeAssociatedResources(const context_impl *CtxImpl);
122122

123+
// Cleans up the USM memory and intialization events associated with this
124+
// entry. This should only be called when the device global entry is not
125+
// owned by the program manager, as otherwise it will be bound to the lifetime
126+
// of the owner context and will be cleaned up through
127+
// removeAssociatedResources.
128+
void cleanup();
129+
123130
private:
124131
// Map from a device and a context to the associated USM allocation for the
125132
// device_global. This should always be empty if MIsDeviceImageScopeDecorated

sycl/source/detail/device_image_impl.hpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,20 +84,15 @@ class ManagedDeviceGlobalsRegistry {
8484
bool hasDeviceGlobalName(const std::string &Name) const noexcept {
8585
return !MDeviceGlobalNames.empty() &&
8686
std::find(MDeviceGlobalNames.begin(), MDeviceGlobalNames.end(),
87-
mangleDeviceGlobalName(Name)) != MDeviceGlobalNames.end();
87+
Name) != MDeviceGlobalNames.end();
8888
}
8989

9090
DeviceGlobalMapEntry *tryGetDeviceGlobalEntry(const std::string &Name) const {
9191
auto &PM = detail::ProgramManager::getInstance();
92-
return PM.tryGetDeviceGlobalEntry(MPrefix + mangleDeviceGlobalName(Name));
92+
return PM.tryGetDeviceGlobalEntry(MPrefix + Name);
9393
}
9494

9595
private:
96-
static std::string mangleDeviceGlobalName(const std::string &Name) {
97-
// TODO: Support device globals declared in namespaces.
98-
return "_Z" + std::to_string(Name.length()) + Name;
99-
}
100-
10196
void unregisterDeviceGlobalsFromContext() {
10297
if (MDeviceGlobalNames.empty())
10398
return;

0 commit comments

Comments
 (0)