Skip to content

Commit 4e43183

Browse files
Merge pull request #861 from Devsh-Graphics-Programming/tlas_blas_tracking
TLAS tracking the BLASes they use during build
2 parents cca5e09 + 0264eac commit 4e43183

8 files changed

+249
-35
lines changed

include/nbl/video/IDeferredOperation.h

+10
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,15 @@ class IDeferredOperation : public IBackendObject
3030
{
3131
const auto retval = execute_impl();
3232
if (retval==STATUS::COMPLETED || retval==STATUS::_ERROR)
33+
{
34+
std::lock_guard lock(m_completionMutex);
35+
if (m_callback)
36+
{
37+
m_callback(this);
38+
m_callback = {};
39+
}
3340
m_resourceTracking.clear();
41+
}
3442
return retval;
3543
}
3644

@@ -66,6 +74,8 @@ class IDeferredOperation : public IBackendObject
6674
private:
6775
friend class ILogicalDevice;
6876
// when we improve allocators, etc. we'll stop using STL containers here
77+
std::mutex m_completionMutex;
78+
std::function<void(IDeferredOperation*)> m_callback;
6979
core::vector<core::smart_refctd_ptr<const IReferenceCounted>> m_resourceTracking;
7080
};
7181

include/nbl/video/IGPUAccelerationStructure.h

+70-3
Original file line numberDiff line numberDiff line change
@@ -400,8 +400,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
400400
template<typename T> requires nbl::is_any_of_v<T,std::conditional_t<std::is_same_v<BufferType,IGPUBuffer>,uint32_t,BuildRangeInfo>,BuildRangeInfo>
401401
inline uint32_t valid(const T& buildRangeInfo) const
402402
{
403+
uint32_t retval = trackedBLASes.size();
403404
if constexpr (std::is_same_v<T,uint32_t>)
404-
return valid<BuildRangeInfo>({.instanceCount=buildRangeInfo,.instanceByteOffset=0});
405+
retval += valid<BuildRangeInfo>({.instanceCount=buildRangeInfo,.instanceByteOffset=0});
405406
else
406407
{
407408
if (IGPUAccelerationStructure::BuildInfo<BufferType>::invalid(srcAS,dstAS))
@@ -444,8 +445,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
444445
#endif
445446

446447
// destination, scratch and instanceData are required, source is optional
447-
return Base::isUpdate ? 4u:3u;
448+
retval += Base::isUpdate ? 4u:3u;
448449
}
450+
return retval;
449451
}
450452

451453
inline core::smart_refctd_ptr<const IReferenceCounted>* fillTracking(core::smart_refctd_ptr<const IReferenceCounted>* oit) const
@@ -457,6 +459,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
457459

458460
*(oit++) = core::smart_refctd_ptr<const IReferenceCounted>(instanceData.buffer);
459461

462+
for (const auto& blas : trackedBLASes)
463+
*(oit++) = core::smart_refctd_ptr<const IReferenceCounted>(blas);
464+
460465
return oit;
461466
}
462467

@@ -470,6 +475,8 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
470475
// + an array of `PolymorphicInstance` if our `SCreationParams::flags.hasFlags(MOTION_BIT)`, otherwise
471476
// + an array of `StaticInstance`
472477
asset::SBufferBinding<const BufferType> instanceData = {};
478+
// [optional] Provide info about what BLAS references to hold onto after the build. For performance make sure the list is compact (without repeated elements).
479+
std::span<const IGPUBottomLevelAccelerationStructure*> trackedBLASes = {};
473480
};
474481
using DeviceBuildInfo = BuildInfo<IGPUBuffer>;
475482
using HostBuildInfo = BuildInfo<asset::ICPUBuffer>;
@@ -545,11 +552,71 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr
545552
using HostPolymorphicInstance = PolymorphicInstance<IGPUBottomLevelAccelerationStructure::host_op_ref_t>;
546553
static_assert(sizeof(DevicePolymorphicInstance)==sizeof(HostPolymorphicInstance));
547554

555+
//
556+
using build_ver_t = uint32_t;
557+
// this gets called when execution is sure to happen 100%, e.g. not during command recording but during submission
558+
inline build_ver_t registerNextBuildVer()
559+
{
560+
return m_pendingBuildVer++;
561+
}
562+
//
563+
using blas_smart_ptr_t = core::smart_refctd_ptr<const IGPUBottomLevelAccelerationStructure>;
564+
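// writes the number of tracked BLASes (from `first` onwards) into `*count` if `tracked==nullptr`, otherwise writes at most `*count` tracked BLASes starting from `first` into `tracked` and stores the number written in `*count`; returns the last completed build version (or 0 if `count==nullptr`)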
565+
inline build_ver_t getTrackedBLASes(uint32_t* count, blas_smart_ptr_t* tracked, const uint32_t first=0) const
566+
{
567+
if (!count)
568+
return 0;
569+
// stop multiple threads messing with us
570+
std::lock_guard lk(m_trackingLock);
571+
const uint32_t toWrite = std::min<uint32_t>(std::max<uint32_t>(m_trackedBLASes.size(),first)-first,tracked ? (*count):0xffFFffFFu);
572+
*count = toWrite;
573+
if (tracked && toWrite)
574+
{
575+
auto it = m_trackedBLASes.begin();
576+
// `m_trackedBLASes` is an unordered set, its iterators don't provide `operator+=`, so step them one element at a time
577+
for (auto i=0; i<first; i++)
578+
it++;
579+
for (auto i=0; i<toWrite; i++)
580+
*(tracked++) = *(it++);
581+
}
582+
return m_completedBuildVer;
583+
}
584+
// Useful if the TLAS got built externally as well, returns whether the tracked set was replaced, i.e. `false` when a build with the same or a later version has already set the result
585+
template<typename Iterator>
586+
inline bool setTrackedBLASes(const Iterator begin, const Iterator end, const build_ver_t buildVer)
587+
{
588+
// stop multiple threads messing with us
589+
std::lock_guard lk(m_trackingLock);
590+
// stop out of order callbacks
591+
if (buildVer<=m_completedBuildVer)
592+
return false;
593+
m_completedBuildVer = buildVer;
594+
// release already tracked BLASes
595+
m_trackedBLASes.clear();
596+
// sanity check, TODO: this should be an atomic_max on the `m_pendingBuildVer`
597+
if (m_completedBuildVer>m_pendingBuildVer)
598+
m_pendingBuildVer = m_completedBuildVer;
599+
// now fill the contents
600+
m_trackedBLASes.insert(begin,end);
601+
return true;
602+
}
603+
// a little utility to make sure nothing from this build version and before gets tracked
604+
inline bool clearTrackedBLASes(const build_ver_t buildVer)
605+
{
606+
return setTrackedBLASes<const blas_smart_ptr_t*>(nullptr,nullptr,buildVer);
607+
}
608+
548609
protected:
549610
inline IGPUTopLevelAccelerationStructure(core::smart_refctd_ptr<const ILogicalDevice>&& dev, SCreationParams&& params)
550-
: asset::ITopLevelAccelerationStructure<IGPUAccelerationStructure>(std::move(dev),std::move(params)), m_maxInstanceCount(params.maxInstanceCount) {}
611+
: asset::ITopLevelAccelerationStructure<IGPUAccelerationStructure>(std::move(dev),std::move(params)),
612+
m_maxInstanceCount(params.maxInstanceCount),m_trackedBLASes() {}
551613

552614
const uint32_t m_maxInstanceCount;
615+
// TODO: maybe replace with new readers/writers lock
616+
mutable std::mutex m_trackingLock;
617+
std::atomic<build_ver_t> m_pendingBuildVer = 0;
618+
build_ver_t m_completedBuildVer = 0;
619+
core::unordered_set<blas_smart_ptr_t> m_trackedBLASes;
553620
};
554621

555622
}

include/nbl/video/IGPUCommandBuffer.h

+12
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,9 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject
536536
//! Secondary CommandBuffer execute
537537
bool executeCommands(const uint32_t count, IGPUCommandBuffer* const* const cmdbufs);
538538

539+
// in case you want the commandbuffer to hold onto things as long as it's not RESET
540+
bool recordReferences(const std::span<const IReferenceCounted*> refs);
541+
539542
virtual bool insertDebugMarker(const char* name, const core::vector4df_SIMD& color = core::vector4df_SIMD(1.0, 1.0, 1.0, 1.0)) = 0;
540543
virtual bool beginDebugMarker(const char* name, const core::vector4df_SIMD& color = core::vector4df_SIMD(1.0, 1.0, 1.0, 1.0)) = 0;
541544
virtual bool endDebugMarker() = 0;
@@ -708,6 +711,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject
708711
m_state = STATE::INITIAL;
709712

710713
m_boundDescriptorSetsRecord.clear();
714+
m_TLASToBLASReferenceSets.clear();
711715
m_boundGraphicsPipeline= nullptr;
712716
m_boundComputePipeline= nullptr;
713717
m_boundRayTracingPipeline= nullptr;
@@ -725,6 +729,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject
725729
{
726730
deleteCommandList();
727731
m_boundDescriptorSetsRecord.clear();
732+
m_TLASToBLASReferenceSets.clear();
728733
m_boundGraphicsPipeline= nullptr;
729734
m_boundComputePipeline= nullptr;
730735
m_boundRayTracingPipeline= nullptr;
@@ -858,10 +863,17 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject
858863
template<typename IndirectCommand> requires nbl::is_any_of_v<IndirectCommand, hlsl::DrawArraysIndirectCommand_t, hlsl::DrawElementsIndirectCommand_t>
859864
bool invalidDrawIndirectCount(const asset::SBufferBinding<const IGPUBuffer>& indirectBinding, const asset::SBufferBinding<const IGPUBuffer>& countBinding, const uint32_t maxDrawCount, const uint32_t stride);
860865

866+
861867
// This bound descriptor set record doesn't include the descriptor sets whose layout has _any_ one of its bindings
862868
// created with IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT
863869
// or IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT.
864870
core::unordered_map<const IGPUDescriptorSet*,uint64_t> m_boundDescriptorSetsRecord;
871+
872+
// Used when the user wants the builds to be tracked, i.e. to make the TLAS remember the BLASes that have been built into it.
873+
// NOTE: We know that a TLAS may be rebuilt multiple times per frame on purpose and not only the final BLASes need to be kept alive till submission finishes.
874+
// However, the Command Pool already tracks resources referenced in the Build Infos, so we only need pointers into those records.
875+
core::unordered_map<IGPUTopLevelAccelerationStructure*,std::span<const IGPUTopLevelAccelerationStructure::blas_smart_ptr_t>> m_TLASToBLASReferenceSets;
876+
865877
const IGPUGraphicsPipeline* m_boundGraphicsPipeline;
866878
const IGPUComputePipeline* m_boundComputePipeline;
867879
const IGPURayTracingPipeline* m_boundRayTracingPipeline;

include/nbl/video/IGPUCommandPool.h

+12
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ class IGPUCommandPool : public IBackendObject
140140
class CBlitImageCmd;
141141
class CCopyImageToBufferCmd;
142142
class CExecuteCommandsCmd;
143+
class CCustomReferenceCmd;
143144
class CWaitEventsCmd;
144145
class CCopyImageCmd;
145146
class CResolveImageCmd;
@@ -686,6 +687,17 @@ class IGPUCommandPool::CExecuteCommandsCmd final : public IVariableSizeCommand<C
686687
}
687688
};
688689

690+
class IGPUCommandPool::CCustomReferenceCmd final : public IVariableSizeCommand<CCustomReferenceCmd>
691+
{
692+
public:
693+
CCustomReferenceCmd(const uint32_t count) : IVariableSizeCommand<CCustomReferenceCmd>(count) {}
694+
695+
static uint32_t calc_resources(const uint32_t count)
696+
{
697+
return count;
698+
}
699+
};
700+
689701
class IGPUCommandPool::CWaitEventsCmd final : public IVariableSizeCommand<CWaitEventsCmd>
690702
{
691703
public:

include/nbl/video/ILogicalDevice.h

+30
Original file line numberDiff line numberDiff line change
@@ -539,11 +539,41 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe
539539
// track things created
540540
if (result==DEFERRABLE_RESULT::DEFERRED)
541541
{
542+
constexpr bool IsTLAS = std::is_same_v<AccelerationStructure,IGPUTopLevelAccelerationStructure>;
543+
struct TLASCallback
544+
{
545+
// upon completion set the BLASes tracked
546+
inline void operator()(IDeferredOperation*) const
547+
{
548+
for (const auto& set : m_TLASToBLASReferenceSets)
549+
{
550+
auto tlas = set.first;
551+
// we know the build is completed immediately after performing it, so we get our pending stamp then
552+
tlas->setTrackedBLASes(set.second.begin(),set.second.end(),tlas->registerNextBuildVer());
553+
}
554+
}
555+
556+
// the raw pointers have already been turned into smart pointers by the `fillTracking` call above, these spans only point into what it wrote
557+
core::unordered_map<IGPUTopLevelAccelerationStructure*,std::span<const IGPUTopLevelAccelerationStructure::blas_smart_ptr_t>> m_TLASToBLASReferenceSets;
558+
} callback = {};
559+
542560
auto& tracking = deferredOperation->m_resourceTracking;
543561
tracking.resize(trackingReservation);
544562
auto oit = tracking.data();
545563
for (const auto& info : infos)
564+
{
546565
oit = info.fillTracking(oit);
566+
if constexpr (IsTLAS)
567+
{
568+
const auto blasCount = info.trackedBLASes.size();
569+
if (blasCount)
570+
callback.m_TLASToBLASReferenceSets[info.dstAS] = {reinterpret_cast<const IGPUTopLevelAccelerationStructure::blas_smart_ptr_t*>(oit-blasCount),blasCount};
571+
else
572+
callback.m_TLASToBLASReferenceSets[info.dstAS] = {};
573+
}
574+
}
575+
if constexpr (IsTLAS)
576+
deferredOperation->m_callback = std::move(callback);
547577
}
548578
return result!=DEFERRABLE_RESULT::SOME_ERROR;
549579
}

include/nbl/video/IQueue.h

+25-30
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,12 @@ class IQueue : public core::Interface, public core::Unmovable
9292
std::span<const SSemaphoreInfo> waitSemaphores = {};
9393
std::span<const SCommandBufferInfo> commandBuffers = {};
9494
std::span<const SSemaphoreInfo> signalSemaphores = {};
95+
// No guarantees are given about when it will execute, except that it will execute:
96+
// 1) after the `signalSemaphores.back()` signals
97+
// 2) in order w.r.t. all other submits on this queue
98+
// 3) after all lifetime tracking has been performed (so transient resources will already be dead!)
99+
// NOTE: This `std::function` WILL be copied!
100+
std::function<void()>* completionCallback = nullptr;
95101

96102
inline bool valid() const
97103
{
@@ -116,45 +122,34 @@ class IQueue : public core::Interface, public core::Unmovable
116122
virtual const void* getNativeHandle() const = 0;
117123

118124
// only public because MultiTimelineEventHandlerST needs to know about it
119-
class DeferredSubmitResourceDrop final
125+
class DeferredSubmitCallback final
120126
{
127+
//
128+
struct STLASBuildMetadata
129+
{
130+
core::unordered_set<IGPUTopLevelAccelerationStructure::blas_smart_ptr_t> m_BLASes;
131+
uint32_t m_buildVer;
132+
};
133+
core::unordered_map<IGPUTopLevelAccelerationStructure*,STLASBuildMetadata> m_TLASToBLASReferenceSets;
134+
//
121135
using smart_ptr = core::smart_refctd_ptr<IBackendObject>;
122-
core::smart_refctd_dynamic_array<smart_ptr> m_resources;
136+
core::smart_refctd_dynamic_array<smart_ptr> m_resources;
137+
//
138+
std::function<void()> m_callback;
123139

124140
public:
125-
inline DeferredSubmitResourceDrop(const SSubmitInfo& info)
126-
{
127-
// We could actually not hold any signal semaphore because you're expected to use the signal result somewhere else.
128-
// However it's possible to you might only wait on one from the set and then drop the rest (UB)
129-
m_resources = core::make_refctd_dynamic_array<decltype(m_resources)>(info.signalSemaphores.size()-1+info.commandBuffers.size()+info.waitSemaphores.size());
130-
auto outRes = m_resources->data();
131-
for (const auto& sema : info.waitSemaphores)
132-
*(outRes++) = smart_ptr(sema.semaphore);
133-
for (const auto& cb : info.commandBuffers)
134-
*(outRes++) = smart_ptr(cb.cmdbuf);
135-
// We don't hold the last signal semaphore, because the timeline does as an Event trigger.
136-
for (auto i=0u; i<info.signalSemaphores.size()-1; i++)
137-
*(outRes++) = smart_ptr(info.signalSemaphores[i].semaphore);
138-
}
139-
DeferredSubmitResourceDrop(const DeferredSubmitResourceDrop& other) = delete;
140-
inline DeferredSubmitResourceDrop(DeferredSubmitResourceDrop&& other) : m_resources(nullptr)
141+
DeferredSubmitCallback(const SSubmitInfo& info);
142+
DeferredSubmitCallback(const DeferredSubmitCallback& other) = delete;
143+
inline DeferredSubmitCallback(DeferredSubmitCallback&& other) : m_resources(nullptr)
141144
{
142145
this->operator=(std::move(other));
143146
}
144147

145-
DeferredSubmitResourceDrop& operator=(const DeferredSubmitResourceDrop& other) = delete;
146-
inline DeferredSubmitResourceDrop& operator=(DeferredSubmitResourceDrop&& other)
147-
{
148-
m_resources = std::move(other.m_resources);
149-
other.m_resources = nullptr;
150-
return *this;
151-
}
148+
DeferredSubmitCallback& operator=(const DeferredSubmitCallback& other) = delete;
149+
DeferredSubmitCallback& operator=(DeferredSubmitCallback&& other);
152150

153151
// always exhaustive poll, because we need to get rid of resources ASAP
154-
inline void operator()()
155-
{
156-
m_resources = nullptr;
157-
}
152+
void operator()();
158153
};
159154

160155
protected:
@@ -170,7 +165,7 @@ class IQueue : public core::Interface, public core::Unmovable
170165
virtual RESULT waitIdle_impl() const = 0;
171166

172167
// Refcounts all resources used by Pending Submits, gets occasionally cleared out
173-
std::unique_ptr<MultiTimelineEventHandlerST<DeferredSubmitResourceDrop,false>> m_submittedResources;
168+
std::unique_ptr<MultiTimelineEventHandlerST<DeferredSubmitCallback,false>> m_submittedResources;
174169
const ILogicalDevice* m_originDevice;
175170
const uint32_t m_familyIndex;
176171
const float m_priority;

src/nbl/video/IGPUCommandBuffer.cpp

+30
Original file line numberDiff line numberDiff line change
@@ -816,6 +816,7 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::span<c
816816

817817
if (indirectBuffer)
818818
{
819+
// TODO: maybe hoist the check
819820
if (!features.accelerationStructureIndirectBuild)
820821
{
821822
NBL_LOG_ERROR("'accelerationStructureIndirectBuild' feature not enabled!");
@@ -835,7 +836,18 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::span<c
835836
if (indirectBuffer)
836837
*(oit++) = core::smart_refctd_ptr<const IGPUBuffer>(indirectBuffer);
837838
for (const auto& info : infos)
839+
{
838840
oit = info.fillTracking(oit);
841+
// we still need to clear the BLAS tracking list if the TLAS has nothing to track
842+
if constexpr (std::is_same_v<DeviceBuildInfo,IGPUTopLevelAccelerationStructure::DeviceBuildInfo>)
843+
{
844+
const auto blasCount = info.trackedBLASes.size();
845+
if (blasCount)
846+
m_TLASToBLASReferenceSets[info.dstAS] = {reinterpret_cast<const IGPUTopLevelAccelerationStructure::blas_smart_ptr_t*>(oit-blasCount),blasCount};
847+
else
848+
m_TLASToBLASReferenceSets[info.dstAS] = {};
849+
}
850+
}
839851

840852
return totalGeometries;
841853
}
@@ -2066,4 +2078,22 @@ bool IGPUCommandBuffer::executeCommands(const uint32_t count, IGPUCommandBuffer*
20662078
return executeCommands_impl(count,cmdbufs);
20672079
}
20682080

2081+
bool IGPUCommandBuffer::recordReferences(const std::span<const IReferenceCounted*> refs)
2082+
{
2083+
if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT|queue_flags_t::TRANSFER_BIT|queue_flags_t::SPARSE_BINDING_BIT))
2084+
return false;
2085+
2086+
auto cmd = m_cmdpool->m_commandListPool.emplace<IGPUCommandPool::CCustomReferenceCmd>(m_commandList,refs.size());
2087+
if (!cmd)
2088+
{
2089+
NBL_LOG_ERROR("out of host memory!");
2090+
return false;
2091+
}
2092+
auto oit = cmd->getVariableCountResources();
2093+
for (const auto& ref : refs)
2094+
*(oit++) = core::smart_refctd_ptr<const core::IReferenceCounted>(ref);
2095+
2096+
return true;
2097+
}
2098+
20692099
}

0 commit comments

Comments
 (0)