Skip to content

Commit 4d0836d

Browse files
[NFC][SYCL][Graph] Avoid unnecessary shared_ptr in duplicateNodes
Changes are mostly local to the routine, but I've updated `MIDCache` to use raw `node_impl *` while on it too because `duplicateNodes` accounts for 40% of the places that need updates after the change of the `MIDCache`.
1 parent 19d83d5 commit 4d0836d

File tree

4 files changed

+126
-139
lines changed

4 files changed

+126
-139
lines changed

sycl/source/detail/graph/graph_impl.cpp

Lines changed: 78 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -409,10 +409,8 @@ void graph_impl::markCGMemObjs(
409409
}
410410
}
411411

412-
std::shared_ptr<node_impl> graph_impl::add(nodes_range Deps) {
413-
const std::shared_ptr<node_impl> &NodeImpl = std::make_shared<node_impl>();
414-
415-
MNodeStorage.push_back(NodeImpl);
412+
node_impl &graph_impl::add(nodes_range Deps) {
413+
node_impl &NodeImpl = createNode();
416414

417415
addDepsToNode(NodeImpl, Deps);
418416
// Add an event associated with this explicit node for mixed usage
@@ -421,10 +419,9 @@ std::shared_ptr<node_impl> graph_impl::add(nodes_range Deps) {
421419
return NodeImpl;
422420
}
423421

424-
std::shared_ptr<node_impl>
425-
graph_impl::add(std::function<void(handler &)> CGF,
426-
const std::vector<sycl::detail::ArgDesc> &Args,
427-
std::vector<std::shared_ptr<node_impl>> &Deps) {
422+
node_impl &graph_impl::add(std::function<void(handler &)> CGF,
423+
const std::vector<sycl::detail::ArgDesc> &Args,
424+
nodes_range Deps) {
428425
(void)Args;
429426
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
430427
detail::handler_impl HandlerImpl{*this};
@@ -435,7 +432,9 @@ graph_impl::add(std::function<void(handler &)> CGF,
435432

436433
// Pass the node deps to the handler so they are available when processing the
437434
// CGF, need for async_malloc nodes.
438-
Handler.impl->MNodeDeps = Deps;
435+
Handler.impl->MNodeDeps.clear();
436+
for (node_impl &N : Deps)
437+
Handler.impl->MNodeDeps.push_back(N.shared_from_this());
439438

440439
#if XPTI_ENABLE_INSTRUMENTATION
441440
// Save code location if one was set in TLS.
@@ -471,7 +470,7 @@ graph_impl::add(std::function<void(handler &)> CGF,
471470
: ext::oneapi::experimental::detail::getNodeTypeFromCG(
472471
Handler.getType());
473472

474-
auto NodeImpl =
473+
node_impl &NodeImpl =
475474
this->add(NodeType, std::move(Handler.impl->MGraphNodeCG), Deps);
476475

477476
// Add an event associated with this explicit node for mixed usage
@@ -489,44 +488,40 @@ graph_impl::add(std::function<void(handler &)> CGF,
489488
}
490489

491490
for (auto &[DynamicParam, ArgIndex] : DynamicParams) {
492-
DynamicParam->registerNode(NodeImpl, ArgIndex);
491+
DynamicParam->registerNode(NodeImpl.shared_from_this(), ArgIndex);
493492
}
494493

495494
return NodeImpl;
496495
}
497496

498-
std::shared_ptr<node_impl>
499-
graph_impl::add(node_type NodeType,
500-
std::shared_ptr<sycl::detail::CG> CommandGroup,
501-
nodes_range Deps) {
497+
node_impl &graph_impl::add(node_type NodeType,
498+
std::shared_ptr<sycl::detail::CG> CommandGroup,
499+
nodes_range Deps) {
502500

503501
// A unique set of dependencies obtained by checking requirements and events
504502
std::set<std::shared_ptr<node_impl>> UniqueDeps = getCGEdges(CommandGroup);
505503

506504
// Track and mark the memory objects being used by the graph.
507505
markCGMemObjs(CommandGroup);
508506

509-
const std::shared_ptr<node_impl> &NodeImpl =
510-
std::make_shared<node_impl>(NodeType, std::move(CommandGroup));
511-
MNodeStorage.push_back(NodeImpl);
507+
node_impl &NodeImpl = createNode(NodeType, std::move(CommandGroup));
512508

513509
// Add any deps determined from requirements and events into the dependency
514510
// list
515511
addDepsToNode(NodeImpl, Deps);
516512
addDepsToNode(NodeImpl, UniqueDeps);
517513

518514
if (NodeType == node_type::async_free) {
519-
auto AsyncFreeCG =
520-
static_cast<CGAsyncFree *>(NodeImpl->MCommandGroup.get());
515+
auto AsyncFreeCG = static_cast<CGAsyncFree *>(NodeImpl.MCommandGroup.get());
521516
// If this is an async free node mark that it is now available for reuse,
522517
// and pass the async free node for tracking.
523-
MGraphMemPool.markAllocationAsAvailable(AsyncFreeCG->getPtr(), *NodeImpl);
518+
MGraphMemPool.markAllocationAsAvailable(AsyncFreeCG->getPtr(), NodeImpl);
524519
}
525520

526521
return NodeImpl;
527522
}
528523

529-
std::shared_ptr<node_impl>
524+
node_impl &
530525
graph_impl::add(std::shared_ptr<dynamic_command_group_impl> &DynCGImpl,
531526
nodes_range Deps) {
532527
// Set of Dependent nodes based on CG event and accessor dependencies.
@@ -551,15 +546,14 @@ graph_impl::add(std::shared_ptr<dynamic_command_group_impl> &DynCGImpl,
551546
const auto &ActiveKernel = DynCGImpl->getActiveCG();
552547
node_type NodeType =
553548
ext::oneapi::experimental::detail::getNodeTypeFromCG(DynCGImpl->MCGType);
554-
std::shared_ptr<detail::node_impl> NodeImpl =
555-
add(NodeType, ActiveKernel, Deps);
549+
detail::node_impl &NodeImpl = add(NodeType, ActiveKernel, Deps);
556550

557551
// Add an event associated with this explicit node for mixed usage
558552
addEventForNode(sycl::detail::event_impl::create_completed_host_event(),
559553
NodeImpl);
560554

561555
// Track the dynamic command-group used inside the node object
562-
DynCGImpl->MNodes.push_back(NodeImpl);
556+
DynCGImpl->MNodes.push_back(NodeImpl.shared_from_this());
563557

564558
return NodeImpl;
565559
}
@@ -652,7 +646,7 @@ void graph_impl::makeEdge(std::shared_ptr<node_impl> Src,
652646
bool DestWasGraphRoot = Dest->MPredecessors.size() == 0;
653647

654648
// We need to add the edges first before checking for cycles
655-
Src->registerSuccessor(Dest);
649+
Src->registerSuccessor(*Dest);
656650

657651
bool DestLostRootStatus = DestWasGraphRoot && Dest->MPredecessors.size() == 1;
658652
if (DestLostRootStatus) {
@@ -1233,40 +1227,38 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
12331227

12341228
void exec_graph_impl::duplicateNodes() {
12351229
// Map of original modifiable nodes (keys) to new duplicated nodes (values)
1236-
std::map<std::shared_ptr<node_impl>, std::shared_ptr<node_impl>> NodesMap;
1230+
std::map<node_impl *, node_impl *> NodesMap;
12371231

1238-
const std::vector<std::shared_ptr<node_impl>> &ModifiableNodes =
1239-
MGraphImpl->MNodeStorage;
1232+
nodes_range ModifiableNodes{MGraphImpl->MNodeStorage};
12401233
std::deque<std::shared_ptr<node_impl>> NewNodes;
12411234

1242-
for (size_t i = 0; i < ModifiableNodes.size(); i++) {
1243-
auto OriginalNode = ModifiableNodes[i];
1244-
std::shared_ptr<node_impl> NodeCopy =
1245-
std::make_shared<node_impl>(*OriginalNode);
1235+
for (node_impl &OriginalNode : ModifiableNodes) {
1236+
NewNodes.push_back(std::make_shared<node_impl>(OriginalNode));
1237+
node_impl &NodeCopy = *NewNodes.back();
12461238

12471239
// Associate the ID of the original node with the node copy for later quick
12481240
// access
1249-
MIDCache.insert(std::make_pair(OriginalNode->MID, NodeCopy));
1241+
MIDCache.insert(std::make_pair(OriginalNode.MID, &NodeCopy));
12501242

12511243
// Clear edges between nodes so that we can replace with new ones
1252-
NodeCopy->MSuccessors.clear();
1253-
NodeCopy->MPredecessors.clear();
1244+
NodeCopy.MSuccessors.clear();
1245+
NodeCopy.MPredecessors.clear();
12541246
// Push the new node to the front of the stack
1255-
NewNodes.push_back(NodeCopy);
12561247
// Associate the new node with the old one for updating edges
1257-
NodesMap.insert({OriginalNode, NodeCopy});
1248+
NodesMap.insert({&OriginalNode, &NodeCopy});
12581249
}
12591250

12601251
// Now that all nodes have been copied rebuild edges on new nodes. This must
12611252
// be done as a separate step since successors may be out of order.
1262-
for (size_t i = 0; i < ModifiableNodes.size(); i++) {
1263-
auto OriginalNode = ModifiableNodes[i];
1264-
auto NodeCopy = NewNodes[i];
1253+
auto OrigIt = ModifiableNodes.begin(), OrigEnd = ModifiableNodes.end();
1254+
for (auto NewIt = NewNodes.begin(); OrigIt != OrigEnd; ++OrigIt, ++NewIt) {
1255+
node_impl &OriginalNode = *OrigIt;
1256+
node_impl &NodeCopy = **NewIt;
12651257
// Look through all the original node successors, find their copies and
12661258
// register those as successors with the current copied node
1267-
for (node_impl &NextNode : OriginalNode->successors()) {
1268-
auto Successor = NodesMap.at(NextNode.shared_from_this());
1269-
NodeCopy->registerSuccessor(Successor);
1259+
for (node_impl &NextNode : OriginalNode.successors()) {
1260+
node_impl &Successor = *NodesMap.at(&NextNode);
1261+
NodeCopy.registerSuccessor(Successor);
12701262
}
12711263
}
12721264

@@ -1279,48 +1271,47 @@ void exec_graph_impl::duplicateNodes() {
12791271
if (NewNode->MNodeType != node_type::subgraph) {
12801272
continue;
12811273
}
1282-
const std::vector<std::shared_ptr<node_impl>> &SubgraphNodes =
1283-
NewNode->MSubGraphImpl->MNodeStorage;
1274+
nodes_range SubgraphNodes{NewNode->MSubGraphImpl->MNodeStorage};
12841275
std::deque<std::shared_ptr<node_impl>> NewSubgraphNodes{};
12851276

12861277
// Map of original subgraph nodes (keys) to new duplicated nodes (values)
1287-
std::map<std::shared_ptr<node_impl>, std::shared_ptr<node_impl>>
1288-
SubgraphNodesMap;
1278+
std::map<node_impl *, node_impl *> SubgraphNodesMap;
12891279

12901280
// Copy subgraph nodes
1291-
for (size_t i = 0; i < SubgraphNodes.size(); i++) {
1292-
auto SubgraphNode = SubgraphNodes[i];
1293-
auto NodeCopy = std::make_shared<node_impl>(*SubgraphNode);
1281+
for (node_impl &SubgraphNode : SubgraphNodes) {
1282+
NewSubgraphNodes.push_back(std::make_shared<node_impl>(SubgraphNode));
1283+
node_impl &NodeCopy = *NewSubgraphNodes.back();
12941284
// Associate the ID of the original subgraph node with all extracted node
12951285
// copies for future quick access.
1296-
MIDCache.insert(std::make_pair(SubgraphNode->MID, NodeCopy));
1286+
MIDCache.insert(std::make_pair(SubgraphNode.MID, &NodeCopy));
12971287

1298-
NewSubgraphNodes.push_back(NodeCopy);
1299-
SubgraphNodesMap.insert({SubgraphNode, NodeCopy});
1300-
NodeCopy->MSuccessors.clear();
1301-
NodeCopy->MPredecessors.clear();
1288+
SubgraphNodesMap.insert({&SubgraphNode, &NodeCopy});
1289+
NodeCopy.MSuccessors.clear();
1290+
NodeCopy.MPredecessors.clear();
13021291
}
13031292

13041293
// Rebuild edges for new subgraph nodes
1305-
for (size_t i = 0; i < SubgraphNodes.size(); i++) {
1306-
auto SubgraphNode = SubgraphNodes[i];
1307-
auto NodeCopy = NewSubgraphNodes[i];
1308-
1309-
for (node_impl &NextNode : SubgraphNode->successors()) {
1310-
auto Successor = SubgraphNodesMap.at(NextNode.shared_from_this());
1311-
NodeCopy->registerSuccessor(Successor);
1294+
auto OrigIt = SubgraphNodes.begin(), OrigEnd = SubgraphNodes.end();
1295+
for (auto NewIt = NewSubgraphNodes.begin(); OrigIt != OrigEnd;
1296+
++OrigIt, ++NewIt) {
1297+
node_impl &SubgraphNode = *OrigIt;
1298+
node_impl &NodeCopy = **NewIt;
1299+
1300+
for (node_impl &NextNode : SubgraphNode.successors()) {
1301+
node_impl &Successor = *SubgraphNodesMap.at(&NextNode);
1302+
NodeCopy.registerSuccessor(Successor);
13121303
}
13131304
}
13141305

13151306
// Collect input and output nodes for the subgraph
1316-
std::vector<std::shared_ptr<node_impl>> Inputs;
1317-
std::vector<std::shared_ptr<node_impl>> Outputs;
1318-
for (auto &NodeImpl : NewSubgraphNodes) {
1307+
std::vector<node_impl *> Inputs;
1308+
std::vector<node_impl *> Outputs;
1309+
for (std::shared_ptr<node_impl> &NodeImpl : NewSubgraphNodes) {
13191310
if (NodeImpl->MPredecessors.size() == 0) {
1320-
Inputs.push_back(NodeImpl);
1311+
Inputs.push_back(&*NodeImpl);
13211312
}
13221313
if (NodeImpl->MSuccessors.size() == 0) {
1323-
Outputs.push_back(NodeImpl);
1314+
Outputs.push_back(&*NodeImpl);
13241315
}
13251316
}
13261317

@@ -1340,8 +1331,8 @@ void exec_graph_impl::duplicateNodes() {
13401331

13411332
// Add all input nodes from the subgraph as successors for this node
13421333
// instead
1343-
for (auto &Input : Inputs) {
1344-
PredNode.registerSuccessor(Input);
1334+
for (node_impl *Input : Inputs) {
1335+
PredNode.registerSuccessor(*Input);
13451336
}
13461337
}
13471338

@@ -1359,8 +1350,8 @@ void exec_graph_impl::duplicateNodes() {
13591350

13601351
// Add all Output nodes from the subgraph as predecessors for this node
13611352
// instead
1362-
for (auto &Output : Outputs) {
1363-
Output->registerSuccessor(SuccNode.shared_from_this());
1353+
for (node_impl *Output : Outputs) {
1354+
Output->registerSuccessor(SuccNode);
13641355
}
13651356
}
13661357

@@ -1370,15 +1361,18 @@ void exec_graph_impl::duplicateNodes() {
13701361
NewNodes.erase(std::find(NewNodes.begin(), NewNodes.end(), NewNode));
13711362
// Also set the iterator to the newly added nodes so we can continue
13721363
// iterating over all remaining nodes
1373-
auto InsertIt = NewNodes.insert(OldPositionIt, NewSubgraphNodes.begin(),
1374-
NewSubgraphNodes.end());
1364+
auto InsertIt = NewNodes.insert(
1365+
OldPositionIt, std::make_move_iterator(NewSubgraphNodes.begin()),
1366+
std::make_move_iterator(NewSubgraphNodes.end()));
13751367
// Since the new reverse_iterator will be at i - 1 we need to advance it
13761368
// when constructing
13771369
NewNodeIt = std::make_reverse_iterator(std::next(InsertIt));
13781370
}
13791371

13801372
// Store all the new nodes locally
1381-
MNodeStorage.insert(MNodeStorage.begin(), NewNodes.begin(), NewNodes.end());
1373+
MNodeStorage.insert(MNodeStorage.begin(),
1374+
std::make_move_iterator(NewNodes.begin()),
1375+
std::make_move_iterator(NewNodes.end()));
13821376
}
13831377

13841378
void exec_graph_impl::update(std::shared_ptr<graph_impl> GraphImpl) {
@@ -1443,7 +1437,7 @@ void exec_graph_impl::update(std::shared_ptr<graph_impl> GraphImpl) {
14431437

14441438
for (uint32_t i = 0; i < MNodeStorage.size(); ++i) {
14451439
MIDCache.insert(
1446-
std::make_pair(GraphImpl->MNodeStorage[i]->MID, MNodeStorage[i]));
1440+
std::make_pair(GraphImpl->MNodeStorage[i]->MID, MNodeStorage[i].get()));
14471441
}
14481442

14491443
update(GraphImpl->MNodeStorage);
@@ -1716,7 +1710,7 @@ void exec_graph_impl::populateURKernelUpdateStructs(
17161710
auto ExecNode = MIDCache.find(Node->MID);
17171711
assert(ExecNode != MIDCache.end() && "Node ID was not found in ID cache");
17181712

1719-
auto Command = MCommandMap.find(ExecNode->second.get());
1713+
auto Command = MCommandMap.find(ExecNode->second);
17201714
assert(Command != MCommandMap.end());
17211715
UpdateDesc.hCommand = Command->second;
17221716

@@ -1746,7 +1740,7 @@ exec_graph_impl::getURUpdatableNodes(
17461740

17471741
auto ExecNode = MIDCache.find(Node->MID);
17481742
assert(ExecNode != MIDCache.end() && "Node ID was not found in ID cache");
1749-
auto PartitionIndex = MPartitionNodes.find(ExecNode->second.get());
1743+
auto PartitionIndex = MPartitionNodes.find(ExecNode->second);
17501744
assert(PartitionIndex != MPartitionNodes.end());
17511745
PartitionedNodes[PartitionIndex->second].push_back(Node);
17521746
}
@@ -1843,38 +1837,25 @@ node modifiable_command_graph::addImpl(dynamic_command_group &DynCGF,
18431837
"dynamic command-group.");
18441838
}
18451839

1846-
std::vector<std::shared_ptr<detail::node_impl>> DepImpls;
1847-
for (auto &D : Deps) {
1848-
DepImpls.push_back(sycl::detail::getSyclObjImpl(D));
1849-
}
1850-
18511840
graph_impl::WriteLock Lock(impl->MMutex);
1852-
std::shared_ptr<detail::node_impl> NodeImpl = impl->add(DynCGFImpl, DepImpls);
1853-
return sycl::detail::createSyclObjFromImpl<node>(std::move(NodeImpl));
1841+
detail::node_impl &NodeImpl = impl->add(DynCGFImpl, Deps);
1842+
return sycl::detail::createSyclObjFromImpl<node>(NodeImpl);
18541843
}
18551844

18561845
node modifiable_command_graph::addImpl(const std::vector<node> &Deps) {
18571846
impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function");
1858-
std::vector<std::shared_ptr<detail::node_impl>> DepImpls;
1859-
for (auto &D : Deps) {
1860-
DepImpls.push_back(sycl::detail::getSyclObjImpl(D));
1861-
}
18621847

18631848
graph_impl::WriteLock Lock(impl->MMutex);
1864-
std::shared_ptr<detail::node_impl> NodeImpl = impl->add(DepImpls);
1865-
return sycl::detail::createSyclObjFromImpl<node>(std::move(NodeImpl));
1849+
detail::node_impl &NodeImpl = impl->add(Deps);
1850+
return sycl::detail::createSyclObjFromImpl<node>(NodeImpl);
18661851
}
18671852

18681853
node modifiable_command_graph::addImpl(std::function<void(handler &)> CGF,
18691854
const std::vector<node> &Deps) {
18701855
impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function");
1871-
std::vector<std::shared_ptr<detail::node_impl>> DepImpls;
1872-
for (auto &D : Deps) {
1873-
DepImpls.push_back(sycl::detail::getSyclObjImpl(D));
1874-
}
18751856

1876-
std::shared_ptr<detail::node_impl> NodeImpl = impl->add(CGF, {}, DepImpls);
1877-
return sycl::detail::createSyclObjFromImpl<node>(std::move(NodeImpl));
1857+
detail::node_impl &NodeImpl = impl->add(CGF, {}, Deps);
1858+
return sycl::detail::createSyclObjFromImpl<node>(NodeImpl);
18781859
}
18791860

18801861
void modifiable_command_graph::addGraphLeafDependencies(node Node) {

0 commit comments

Comments
 (0)