From a1747c6eac87887c535ddbc9830afc055fa0d89d Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Tue, 16 Jan 2024 21:20:06 -0300 Subject: [PATCH 01/18] Work on HLSL impl of property pools --- .../nbl/builtin/hlsl/property_pool/copy.hlsl | 127 ++++++++++++++++++ .../builtin/hlsl/property_pool/transfer.hlsl | 36 +++++ 2 files changed, 163 insertions(+) create mode 100644 include/nbl/builtin/hlsl/property_pool/copy.hlsl create mode 100644 include/nbl/builtin/hlsl/property_pool/transfer.hlsl diff --git a/include/nbl/builtin/hlsl/property_pool/copy.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.hlsl new file mode 100644 index 0000000000..8d7becefaa --- /dev/null +++ b/include/nbl/builtin/hlsl/property_pool/copy.hlsl @@ -0,0 +1,127 @@ +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" +#include "nbl/builtin/hlsl/property_pool/transfer.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace property_pools +{ +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +template +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() { + return uint32_t3(capability_traits::maxOptimallyResidentWorkgroupInvocations, 1, 1); +} + +[[vk::push_constant]] GlobalPushContants globals; + +template +struct TransferLoop +{ + void iteration(uint propertyId, uint64_t propertySize, uint64_t srcAddr, uint64_t dstAddr, uint invocationIndex) + { + const uint srcOffset = uint64_t(invocationIndex) * (uint64_t(1) << SrcIndexSizeLog2) * propertySize; + const uint dstOffset = uint64_t(invocationIndex) * (uint64_t(1) << DstIndexSizeLog2) * propertySize; + + const uint srcIndexAddress = Fill ? srcAddr + srcOffset : srcAddr; + const uint dstIndexAddress = Fill ? dstAddr + dstOffset : dstAddr; + + const uint srcAddressMapped = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad(srcIndexAddress); + const uint dstAddressMapped = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad(dstIndexAddress); + + if (SrcIndexSizeLog2 == 0) {} // we can't write individual bytes + else if (SrcIndexSizeLog2 == 1) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); + else if (SrcIndexSizeLog2 == 2) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); + else if (SrcIndexSizeLog2 == 3) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); + } + + void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) + { + uint lastInvocation = min(transferRequest.elementCount, gloabls.endOffset); + for (uint invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize) + { + iteration(propertyId, transferRequest.propertySize, transferRequest.srcAddr, transferRequest.dstAddr, invocationIndex); + } + } +} + +// For creating permutations of the functions based on parameters that are constant over the transfer request +// These branches should all be scalar, and because of how templates work, the loops shouldn't have any +// branching within them + +template +struct TransferLoopPermutationSrcIndexSizeLog +{ + void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) + { + if (transferRequest.dstIndexSizeLog2 == 0) TransferLoop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + else if (transferRequest.dstIndexSizeLog2 == 1) TransferLoop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + else if (transferRequest.dstIndexSizeLog2 == 2) TransferLoop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ TransferLoop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + } +} + +template +struct TransferLoopPermutationDstIota +{ + void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) + { + if (transferRequest.srcIndexSizeLog2 == 0) TransferLoopPermutationSrcIndexSizeLog.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + else if (transferRequest.srcIndexSizeLog2 == 1) TransferLoopPermutationSrcIndexSizeLog.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + else if (transferRequest.srcIndexSizeLog2 == 2) TransferLoopPermutationSrcIndexSizeLog.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ TransferLoopPermutationSrcIndexSizeLog.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + } +} + +template +struct TransferLoopPermutationSrcIota +{ + void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) + { + bool dstIota = transferRequest.dstAddr == 0; + if (dstIota) TransferLoopPermutationDstIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + else TransferLoopPermutationDstIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + } +} + +template +struct TransferLoopPermutationFill +{ + void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) + { + bool srcIota = transferRequest.srcAddr == 0; + if (srcIota) TransferLoopPermutationSrcIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + else TransferLoopPermutationSrcIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + } +} + +void main(uint32_t3 dispatchId : SV_DispatchThreadID) +{ + const uint propertyId = dispatchId.y; + const uint invocationIndex = dispatchId.x; + + // Loading transfer request from the pointer (can't use struct + // with BDA on HLSL SPIRV) + const TransferRequest transferRequest; + transferRequest.srcAddr = vk::RawBufferLoad(globals.transferCommandsAddress); + transferRequest.dstAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t)); + transferRequest.srcIndexAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 2); + transferRequest.dstIndexAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 3); + // TODO: These are all part of the same bitfield and shoulbe read with a single RawBufferLoad + transferRequest.elementCount = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 4); + transferRequest.propertySize = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 5); + transferRequest.fill = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 6); + transferRequest.srcIndexSizeLog2 = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 7); + transferRequest.dstIndexSizeLog2 = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 8); + + const uint dispatchSize = capability_traits::maxOptimallyResidentWorkgroupInvocations; + const bool fill = transferRequest.fill == 1; + + if (fill) TransferLoopPermutationFill.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); + else TransferLoopPermutationFill.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); +} + +} +} +} + diff --git a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl new file mode 100644 index 0000000000..cd51e23fd2 --- /dev/null +++ b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl @@ -0,0 +1,36 @@ +namespace nbl +{ +namespace hlsl +{ +namespace property_pools +{ + +struct TransferRequest +{ + // This represents a transfer command/request + uint64_t srcAddr; + uint64_t dstAddr; + uint64_t srcIndexAddr = 0; // IOTA default + uint64_t dstIndexAddr = 0; // IOTA default + uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers + uint64_t propertySize : 24; // all the leftover bits (just use bytes now) + uint64_t fill : 1 = false; + // 0=uint8, 1=uint16, 2=uint32, 3=uint64 + uint64_t srcIndexSizeLog2 : 2 = 1; + uint64_t dstIndexSizeLog2 : 2 = 1; +}; + +struct GlobalPushContants +{ + // Define the range of invocations (X axis) that will be transfered over in this dispatch + // May be sectioned off in the case of overflow or any other situation that doesn't allow + // for a full transfer + uint64_t beginOffset; + uint64_t endOffset; + // BDA address (GPU pointer) into the transfer commands buffer + uint64_t transferCommandsAddress; +}; + +} +} +} From adc4d576d034b3e1fb31356645c0a0915beee817 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Thu, 18 Jan 2024 23:14:06 -0300 Subject: [PATCH 02/18] Fix compilation issues with shader --- .../{copy.hlsl => copy.comp.hlsl} | 47 +++++++++++-------- .../builtin/hlsl/property_pool/transfer.hlsl | 10 ++-- src/nbl/builtin/CMakeLists.txt | 6 ++- 3 files changed, 38 insertions(+), 25 deletions(-) rename include/nbl/builtin/hlsl/property_pool/{copy.hlsl => copy.comp.hlsl} (90%) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl similarity index 90% rename from include/nbl/builtin/hlsl/property_pool/copy.hlsl rename to include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index 8d7becefaa..f60d6c6163 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -1,17 +1,25 @@ #include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/property_pool/transfer.hlsl" +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +template +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() { + return uint32_t3(capability_traits::maxOptimallyResidentWorkgroupInvocations, 1, 1); +} + +[[numthreads(1, 1, 1)] +void main(uint32_t3 dispatchId : SV_DispatchThreadID) +{ + nbl::hlsl::property_pool::main(dispatchId); +} + namespace nbl { namespace hlsl { namespace property_pools { -// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -template -uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() { - return uint32_t3(capability_traits::maxOptimallyResidentWorkgroupInvocations, 1, 1); -} [[vk::push_constant]] GlobalPushContants globals; @@ -37,13 +45,13 @@ struct TransferLoop void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { - uint lastInvocation = min(transferRequest.elementCount, gloabls.endOffset); + uint lastInvocation = min(transferRequest.elementCount, globals.endOffset); for (uint invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize) { iteration(propertyId, transferRequest.propertySize, transferRequest.srcAddr, transferRequest.dstAddr, invocationIndex); } } -} +}; // For creating permutations of the functions based on parameters that are constant over the transfer request // These branches should all be scalar, and because of how templates work, the loops shouldn't have any @@ -59,7 +67,7 @@ struct TransferLoopPermutationSrcIndexSizeLog else if (transferRequest.dstIndexSizeLog2 == 2) TransferLoop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ TransferLoop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } -} +}; template struct TransferLoopPermutationDstIota @@ -71,7 +79,7 @@ struct TransferLoopPermutationDstIota else if (transferRequest.srcIndexSizeLog2 == 2) TransferLoopPermutationSrcIndexSizeLog.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ TransferLoopPermutationSrcIndexSizeLog.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } -} +}; template struct TransferLoopPermutationSrcIota @@ -82,7 +90,7 @@ struct TransferLoopPermutationSrcIota if (dstIota) TransferLoopPermutationDstIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); else TransferLoopPermutationDstIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } -} +}; template struct TransferLoopPermutationFill @@ -93,9 +101,9 @@ struct TransferLoopPermutationFill if (srcIota) TransferLoopPermutationSrcIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); else TransferLoopPermutationSrcIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } -} +}; -void main(uint32_t3 dispatchId : SV_DispatchThreadID) +void main(uint32_t3 dispatchId) { const uint propertyId = dispatchId.y; const uint invocationIndex = dispatchId.x; @@ -107,12 +115,14 @@ void main(uint32_t3 dispatchId : SV_DispatchThreadID) transferRequest.dstAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t)); transferRequest.srcIndexAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 2); transferRequest.dstIndexAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 3); - // TODO: These are all part of the same bitfield and shoulbe read with a single RawBufferLoad - transferRequest.elementCount = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 4); - transferRequest.propertySize = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 5); - transferRequest.fill = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 6); - transferRequest.srcIndexSizeLog2 = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 7); - transferRequest.dstIndexSizeLog2 = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 8); + // Remaining elements are part of the same bitfield + // TODO: Do this only using raw buffer load? + uint64_t bitfieldType = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 4); + transferRequest.elementCount = bitfieldType; + transferRequest.propertySize = bitfieldType >> 35; + transferRequest.fill = bitfieldType >> (35 + 24); + transferRequest.srcIndexSizeLog2 = bitfieldType >> (35 + 24 + 1); + transferRequest.dstIndexSizeLog2 = bitfieldType >> (35 + 24 + 1 + 2); const uint dispatchSize = capability_traits::maxOptimallyResidentWorkgroupInvocations; const bool fill = transferRequest.fill == 1; @@ -124,4 +134,3 @@ void main(uint32_t3 dispatchId : SV_DispatchThreadID) } } } - diff --git a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl index cd51e23fd2..b900dae153 100644 --- a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl @@ -10,14 +10,14 @@ struct TransferRequest // This represents a transfer command/request uint64_t srcAddr; uint64_t dstAddr; - uint64_t srcIndexAddr = 0; // IOTA default - uint64_t dstIndexAddr = 0; // IOTA default + uint64_t srcIndexAddr; // IOTA default + uint64_t dstIndexAddr; // IOTA default uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers uint64_t propertySize : 24; // all the leftover bits (just use bytes now) - uint64_t fill : 1 = false; + uint64_t fill; // 0=uint8, 1=uint16, 2=uint32, 3=uint64 - uint64_t srcIndexSizeLog2 : 2 = 1; - uint64_t dstIndexSizeLog2 : 2 = 1; + uint64_t srcIndexSizeLog2 : 2; + uint64_t dstIndexSizeLog2 : 2; }; struct GlobalPushContants diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index c8abe13460..eeec7fa474 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -300,4 +300,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/broadcast.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") -ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL") \ No newline at end of file +# property pools +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/property_pool/transfer.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/property_pool/copy.comp.hlsl") + +ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL") From d9ddf4125a64c22e1235c53fbb5cf496c8650f0a Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sun, 21 Jan 2024 10:57:42 -0300 Subject: [PATCH 03/18] Nuke code that won't be used (for now) --- .../builtin/hlsl/property_pool/copy.comp.hlsl | 8 +- .../builtin/hlsl/property_pool/transfer.hlsl | 7 +- include/nbl/scene/ITransformTreeManager.h | 2 + .../video/utilities/CPropertyPoolHandler.h | 50 +++------ include/nbl/video/utilities/IPropertyPool.h | 18 ++-- .../video/utilities/CPropertyPoolHandler.cpp | 100 ++++++------------ src/nbl/video/utilities/IPropertyPool.cpp | 2 +- 7 files changed, 71 insertions(+), 116 deletions(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index f60d6c6163..a959a0501e 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -3,10 +3,10 @@ #include "nbl/builtin/hlsl/property_pool/transfer.hlsl" // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -template -uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() { - return uint32_t3(capability_traits::maxOptimallyResidentWorkgroupInvocations, 1, 1); -} +// template +// uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() { +// return uint32_t3(capability_traits::maxOptimallyResidentWorkgroupInvocations, 1, 1); +// } [[numthreads(1, 1, 1)] void main(uint32_t3 dispatchId : SV_DispatchThreadID) diff --git a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl index b900dae153..d89d2ff6d0 100644 --- a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl @@ -1,3 +1,6 @@ +#ifndef _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_ +#define _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_ + namespace nbl { namespace hlsl @@ -30,7 +33,9 @@ struct GlobalPushContants // BDA address (GPU pointer) into the transfer commands buffer uint64_t transferCommandsAddress; }; - } } } + +#endif + diff --git a/include/nbl/scene/ITransformTreeManager.h b/include/nbl/scene/ITransformTreeManager.h index 141bb38566..3045eeaf3d 100644 --- a/include/nbl/scene/ITransformTreeManager.h +++ b/include/nbl/scene/ITransformTreeManager.h @@ -253,6 +253,7 @@ class ITransformTreeManager : public virtual core::IReferenceCounted return true; } +#if 0 // TODO: upstreaming cpropertypoolhandler // struct UpstreamRequestBase : RequestBase { @@ -496,6 +497,7 @@ class ITransformTreeManager : public virtual core::IReferenceCounted waitSemaphoreCount,semaphoresToWaitBeforeOverwrite,stagesToWaitForPerSemaphore,request.logger,maxWaitPoint ); } +#endif diff --git a/include/nbl/video/utilities/CPropertyPoolHandler.h b/include/nbl/video/utilities/CPropertyPoolHandler.h index 0bf60c55d4..6bd533a303 100644 --- a/include/nbl/video/utilities/CPropertyPoolHandler.h +++ b/include/nbl/video/utilities/CPropertyPoolHandler.h @@ -12,16 +12,15 @@ #include "nbl/video/utilities/IDescriptorSetCache.h" #include "nbl/video/utilities/IPropertyPool.h" +#include "glm/glm/glm.hpp" +#include +#include +#include "nbl/builtin/hlsl/property_pool/transfer.hlsl" namespace nbl::video { -#define int int32_t -#define uint uint32_t -#include "nbl/builtin/glsl/property_pool/transfer.glsl" -#undef uint -#undef int -static_assert(NBL_BUILTIN_PROPERTY_POOL_INVALID==IPropertyPool::invalid); +//static_assert(NBL_BUILTIN_PROPERTY_POOL_INVALID==IPropertyPool::invalid); // property pool factory is externally synchronized // TODO: could rename to CSparseStreamingSystem/CSparseStreamingHandler @@ -36,19 +35,10 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ // inline ILogicalDevice* getDevice() {return m_device.get();} - // - inline const uint32_t getMaxPropertiesPerTransferDispatch() {return m_maxPropertiesPerPass;} - - // - inline uint32_t getMaxScratchSize() const {return sizeof(nbl_glsl_property_pool_transfer_t)*m_maxPropertiesPerPass;} - // inline IGPUComputePipeline* getPipeline() {return m_pipeline.get();} inline const IGPUComputePipeline* getPipeline() const {return m_pipeline.get();} - // - inline const IGPUDescriptorSetLayout* getCanonicalLayout() const { return m_dsCache->getCanonicalLayout(); } - // struct TransferRequest { @@ -56,10 +46,11 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ enum E_FLAG : uint16_t { EF_NONE=0, - EF_DOWNLOAD=NBL_BUILTIN_PROPERTY_POOL_TRANSFER_EF_DOWNLOAD, + // this wasn't used anywhere in the hlsl + EF_DOWNLOAD=1, // this flag will make the `srcAddresses ? srcAddresses[0]:0` be used as the source address for all reads, effectively "filling" with uniform value - EF_FILL=NBL_BUILTIN_PROPERTY_POOL_TRANSFER_EF_SRC_FILL, - EF_BIT_COUNT=NBL_BUILTIN_PROPERTY_POOL_TRANSFER_EF_BIT_COUNT + EF_FILL=2, + EF_BIT_COUNT=3 }; // static inline constexpr uint32_t invalid_offset = ~0u; @@ -71,9 +62,6 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ elementSize = pool->getPropertySize(propertyID); } - // - inline bool isDownload() const {return flags&EF_DOWNLOAD;} - // inline uint32_t getSourceElementCount() const { @@ -101,6 +89,7 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ system::logger_opt_ptr logger, const uint32_t baseDWORD=0u, const uint32_t endDWORD=~0ull ); +#if 0 // TODO: Up streaming requests // struct UpStreamingRequest { @@ -189,7 +178,9 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ uint32_t& waitSemaphoreCount, IGPUSemaphore* const*& semaphoresToWaitBeforeOverwrite, const asset::E_PIPELINE_STAGE_FLAGS*& stagesToWaitForPerSemaphore, system::logger_opt_ptr logger, const std::chrono::steady_clock::time_point& maxWaitPoint=std::chrono::steady_clock::now()+std::chrono::microseconds(500u) ); +#endif +#if 0 // TODO: freeing properties // utility to help you fill out the tail move scatter request after the free, properly, returns if you actually need to transfer anything static inline bool freeProperties(IPropertyPool* pool, UpStreamingRequest* requests, const uint32_t* indicesBegin, const uint32_t* indicesEnd, uint32_t* srcAddresses, uint32_t* dstAddresses) { @@ -210,31 +201,18 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ } return false; } +#endif protected: ~CPropertyPoolHandler() {} - static inline constexpr auto MaxPropertiesPerDispatch = NBL_BUILTIN_PROPERTY_POOL_MAX_PROPERTIES_PER_DISPATCH; + static inline constexpr auto MaxPropertiesPerDispatch = 0u; // TODO static inline constexpr auto DescriptorCacheSize = 128u; core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_pipeline; - // TODO: investigate using Push Descriptors for this - class TransferDescriptorSetCache : public IDescriptorSetCache - { - public: - using IDescriptorSetCache::IDescriptorSetCache; - - // - uint32_t acquireSet( - CPropertyPoolHandler* handler, const asset::SBufferBinding& scratch, const asset::SBufferBinding& addresses, - const TransferRequest* requests, const uint32_t propertyCount - ); - }; - core::smart_refctd_ptr m_dsCache; - uint16_t m_maxPropertiesPerPass; uint32_t m_alignment; }; diff --git a/include/nbl/video/utilities/IPropertyPool.h b/include/nbl/video/utilities/IPropertyPool.h index 0f56df622e..3c23ddce24 100644 --- a/include/nbl/video/utilities/IPropertyPool.h +++ b/include/nbl/video/utilities/IPropertyPool.h @@ -11,6 +11,11 @@ #include "nbl/video/ILogicalDevice.h" #include "nbl/video/IGPUDescriptorSetLayout.h" +#include "glm/glm/glm.hpp" +#include +#include +#include "nbl/builtin/hlsl/property_pool/transfer.hlsl" + namespace nbl::video { @@ -21,8 +26,7 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted public: using PropertyAddressAllocator = core::PoolAddressAllocatorST; - static inline constexpr auto invalid = PropertyAddressAllocator::invalid_address; - + static inline constexpr uint64_t invalid = 0; // virtual const asset::SBufferRange& getPropertyMemoryBlock(uint32_t ix) const =0; @@ -34,19 +38,19 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted inline bool isContiguous() const {return m_indexToAddr;} // - inline uint32_t getAllocated() const + inline uint64_t getAllocated() const { return indexAllocator.get_allocated_size(); } // - inline uint32_t getFree() const + inline uint64_t getFree() const { return indexAllocator.get_free_size(); } // - inline uint32_t getCapacity() const + inline uint64_t getCapacity() const { // special case allows us to use `get_total_size`, because the pool allocator has no added offsets return indexAllocator.get_total_size(); @@ -217,8 +221,8 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted static bool validateBlocks(const ILogicalDevice* device, const uint32_t propertyCount, const size_t* propertySizes, const uint32_t capacity, const asset::SBufferRange* _memoryBlocks); PropertyAddressAllocator indexAllocator; - uint32_t* m_indexToAddr; - uint32_t* m_addrToIndex; + uint64_t* m_indexToAddr; + uint64_t* m_addrToIndex; }; diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index 635ed771d7..341dcba1b5 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -6,9 +6,32 @@ using namespace nbl; using namespace video; // -CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr&& device) : m_device(std::move(device)), m_dsCache() +CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr&& device) : m_device(std::move(device)) { - // TODO: rewrite in HLSL! + auto system = m_device->getPhysicalDevice()->getSystem(); + // TODO: Reuse asset manager from elsewhere? + auto assetManager = core::make_smart_refctd_ptr(core::smart_refctd_ptr(system)); + + video::IGPUObjectFromAssetConverter CPU2GPU; + video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams; + cpu2gpuParams.assetManager = assetManager.get(); + cpu2gpuParams.device = m_device.get(); + + auto loadShader = [&](const char* path) + { + asset::IAssetLoader::SAssetLoadParams params = {}; + auto shader = core::smart_refctd_ptr_static_cast(*assetManager->getAsset(path, params).getContents().begin()); + shader->setSpecializationInfo(asset::ISpecializedShader::SInfo(nullptr, nullptr, "main")); + assert(shader); + + auto gpuShaders = CPU2GPU.getGPUObjectsFromAssets(&shader, &shader + 1u, cpu2gpuParams); + auto gpuShader = gpuShaders->begin()[0u]; + assert(gpuShader); + + return gpuShader; + }; + auto shader = loadShader("../../../include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl"); + #if 0 const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); m_maxPropertiesPerPass = core::min((deviceLimits.maxPerStageDescriptorSSBOs-2u)/2u,MaxPropertiesPerDispatch); @@ -66,6 +89,7 @@ bool CPropertyPoolHandler::transferProperties( system::logger_opt_ptr logger, const uint32_t baseDWORD, const uint32_t endDWORD ) { +#if 0 if (requestsBegin==requestsEnd) return true; if (!scratch.buffer || !scratch.buffer->getCreationParams().usage.hasFlags(IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF)) @@ -180,8 +204,12 @@ bool CPropertyPoolHandler::transferProperties( result = copyPass(requests,leftOverProps)&&result; return result; +#endif + return false; } +#if 0 // TODO: up streaming requests + uint32_t CPropertyPoolHandler::transferProperties( StreamingTransientDataBufferMT<>* const upBuff, IGPUCommandBuffer* const cmdbuf, IGPUFence* const fence, IGPUQueue* const queue, const asset::SBufferBinding& scratch, UpStreamingRequest* &requests, const uint32_t requestCount, @@ -189,6 +217,7 @@ uint32_t CPropertyPoolHandler::transferProperties( system::logger_opt_ptr logger, const std::chrono::steady_clock::time_point& maxWaitPoint ) { +#if 0 if (!requestCount) return 0u; @@ -526,72 +555,9 @@ uint32_t CPropertyPoolHandler::transferProperties( const auto leftOverProps = requestCount-fullPasses*m_maxPropertiesPerPass; if (leftOverProps) return copyPass(requests,leftOverProps); +#endif return 0u; } -uint32_t CPropertyPoolHandler::TransferDescriptorSetCache::acquireSet( - CPropertyPoolHandler* handler, const asset::SBufferBinding& scratch, const asset::SBufferBinding& addresses, - const TransferRequest* requests, const uint32_t propertyCount -) -{ - auto retval = IDescriptorSetCache::acquireSet(); - if (retval==IDescriptorSetCache::invalid_index) - return IDescriptorSetCache::invalid_index; - - - auto device = handler->getDevice(); - const auto maxPropertiesPerPass = handler->getMaxPropertiesPerTransferDispatch(); - - - IGPUDescriptorSet::SDescriptorInfo infos[MaxPropertiesPerDispatch*2u+2u]; - infos[0] = scratch; - infos[0].info.buffer.size = sizeof(nbl_glsl_property_pool_transfer_t)*propertyCount; - infos[1] = addresses; - auto* inDescInfo = infos+2; - auto* outDescInfo = infos+2+maxPropertiesPerPass; - for (uint32_t i=0u; iupdateDescriptorSets(4u, writes, 0u, nullptr); - - return retval; -} \ No newline at end of file +#endif diff --git a/src/nbl/video/utilities/IPropertyPool.cpp b/src/nbl/video/utilities/IPropertyPool.cpp index 2aec9387f8..683954ee55 100644 --- a/src/nbl/video/utilities/IPropertyPool.cpp +++ b/src/nbl/video/utilities/IPropertyPool.cpp @@ -20,7 +20,7 @@ IPropertyPool::IPropertyPool(uint32_t capacity, void* reserved, bool contiguous) { if (contiguous) { - m_indexToAddr = reinterpret_cast(reinterpret_cast(reserved)+getReservedSize(capacity)); + m_indexToAddr = reinterpret_cast(reinterpret_cast(reserved)+getReservedSize(capacity)); m_addrToIndex = m_indexToAddr+capacity; std::fill_n(m_indexToAddr,capacity,invalid); From 1707158f16ffae5c475d53ffe26cb1eb044aac98 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sun, 21 Jan 2024 13:37:46 -0300 Subject: [PATCH 04/18] Fix compilation problems --- .../builtin/hlsl/property_pool/copy.comp.hlsl | 68 +++++++++---------- .../builtin/hlsl/property_pool/transfer.hlsl | 2 +- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index a959a0501e..49fbf34f8b 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -2,18 +2,6 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/property_pool/transfer.hlsl" -// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -// template -// uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() { -// return uint32_t3(capability_traits::maxOptimallyResidentWorkgroupInvocations, 1, 1); -// } - -[[numthreads(1, 1, 1)] -void main(uint32_t3 dispatchId : SV_DispatchThreadID) -{ - nbl::hlsl::property_pool::main(dispatchId); -} - namespace nbl { namespace hlsl @@ -28,14 +16,14 @@ struct TransferLoop { void iteration(uint propertyId, uint64_t propertySize, uint64_t srcAddr, uint64_t dstAddr, uint invocationIndex) { - const uint srcOffset = uint64_t(invocationIndex) * (uint64_t(1) << SrcIndexSizeLog2) * propertySize; - const uint dstOffset = uint64_t(invocationIndex) * (uint64_t(1) << DstIndexSizeLog2) * propertySize; + const uint64_t srcOffset = uint64_t(invocationIndex) * (uint64_t(1) << SrcIndexSizeLog2) * propertySize; + const uint64_t dstOffset = uint64_t(invocationIndex) * (uint64_t(1) << DstIndexSizeLog2) * propertySize; - const uint srcIndexAddress = Fill ? srcAddr + srcOffset : srcAddr; - const uint dstIndexAddress = Fill ? dstAddr + dstOffset : dstAddr; + const uint64_t srcIndexAddress = Fill ? srcAddr + srcOffset : srcAddr; + const uint64_t dstIndexAddress = Fill ? dstAddr + dstOffset : dstAddr; - const uint srcAddressMapped = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad(srcIndexAddress); - const uint dstAddressMapped = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad(dstIndexAddress); + const uint64_t srcAddressMapped = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad(srcIndexAddress); + const uint64_t dstAddressMapped = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad(dstIndexAddress); if (SrcIndexSizeLog2 == 0) {} // we can't write individual bytes else if (SrcIndexSizeLog2 == 1) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); @@ -51,10 +39,10 @@ struct TransferLoop iteration(propertyId, transferRequest.propertySize, transferRequest.srcAddr, transferRequest.dstAddr, invocationIndex); } } -}; +}; // For creating permutations of the functions based on parameters that are constant over the transfer request -// These branches should all be scalar, and because of how templates work, the loops shouldn't have any +// These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any // branching within them template @@ -62,10 +50,10 @@ struct TransferLoopPermutationSrcIndexSizeLog { void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { - if (transferRequest.dstIndexSizeLog2 == 0) TransferLoop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); - else if (transferRequest.dstIndexSizeLog2 == 1) TransferLoop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); - else if (transferRequest.dstIndexSizeLog2 == 2) TransferLoop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); - else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ TransferLoop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + if (transferRequest.dstIndexSizeLog2 == 0) { TransferLoop loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else if (transferRequest.dstIndexSizeLog2 == 1) { TransferLoop loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else if (transferRequest.dstIndexSizeLog2 == 2) { TransferLoop loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ { TransferLoop loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } } }; @@ -74,10 +62,10 @@ struct TransferLoopPermutationDstIota { void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { - if (transferRequest.srcIndexSizeLog2 == 0) TransferLoopPermutationSrcIndexSizeLog.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); - else if (transferRequest.srcIndexSizeLog2 == 1) TransferLoopPermutationSrcIndexSizeLog.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); - else if (transferRequest.srcIndexSizeLog2 == 2) TransferLoopPermutationSrcIndexSizeLog.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); - else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ TransferLoopPermutationSrcIndexSizeLog.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + if (transferRequest.srcIndexSizeLog2 == 0) { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else if (transferRequest.srcIndexSizeLog2 == 1) { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else if (transferRequest.srcIndexSizeLog2 == 2) { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } } }; @@ -87,8 +75,8 @@ struct TransferLoopPermutationSrcIota void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { bool dstIota = transferRequest.dstAddr == 0; - if (dstIota) TransferLoopPermutationDstIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); - else TransferLoopPermutationDstIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + if (dstIota) { TransferLoopPermutationDstIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else { TransferLoopPermutationDstIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } } }; @@ -98,11 +86,12 @@ struct TransferLoopPermutationFill void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { bool srcIota = transferRequest.srcAddr == 0; - if (srcIota) TransferLoopPermutationSrcIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); - else TransferLoopPermutationSrcIota.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); + if (srcIota) { TransferLoopPermutationSrcIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else { TransferLoopPermutationSrcIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } } }; +template void main(uint32_t3 dispatchId) { const uint propertyId = dispatchId.y; @@ -110,7 +99,7 @@ void main(uint32_t3 dispatchId) // Loading transfer request from the pointer (can't use struct // with BDA on HLSL SPIRV) - const TransferRequest transferRequest; + TransferRequest transferRequest; transferRequest.srcAddr = vk::RawBufferLoad(globals.transferCommandsAddress); transferRequest.dstAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t)); transferRequest.srcIndexAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 2); @@ -124,13 +113,20 @@ void main(uint32_t3 dispatchId) transferRequest.srcIndexSizeLog2 = bitfieldType >> (35 + 24 + 1); transferRequest.dstIndexSizeLog2 = bitfieldType >> (35 + 24 + 1 + 2); - const uint dispatchSize = capability_traits::maxOptimallyResidentWorkgroupInvocations; + const uint dispatchSize = nbl::hlsl::device_capabilities_traits::maxOptimallyResidentWorkgroupInvocations; const bool fill = transferRequest.fill == 1; - if (fill) TransferLoopPermutationFill.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); - else TransferLoopPermutationFill.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); + if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } + else { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } } } } } + +[numthreads(1,1,1)] +void main(uint32_t3 dispatchId : SV_DispatchThreadID) +{ + nbl::hlsl::property_pools::main(dispatchId); +} + diff --git a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl index d89d2ff6d0..917f6ed98d 100644 --- a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl @@ -17,7 +17,7 @@ struct TransferRequest uint64_t dstIndexAddr; // IOTA default uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers uint64_t propertySize : 24; // all the leftover bits (just use bytes now) - uint64_t fill; + uint64_t fill : 1; // 0=uint8, 1=uint16, 2=uint32, 3=uint64 uint64_t srcIndexSizeLog2 : 2; uint64_t dstIndexSizeLog2 : 2; From 279c220747215d778e4f2ee9701f0f225df72924 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sun, 21 Jan 2024 13:53:41 -0300 Subject: [PATCH 05/18] Temp fix for last compilation issue --- .../builtin/hlsl/property_pool/copy.comp.hlsl | 17 ++++++++++------- .../builtin/hlsl/property_pool/transfer.hlsl | 19 +++++++++++++------ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index 49fbf34f8b..bb4679098d 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -33,7 +33,9 @@ struct TransferLoop void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { - uint lastInvocation = min(transferRequest.elementCount, globals.endOffset); + uint64_t elementCount = uint64_t(transferRequest.elementCount32) + | uint64_t(transferRequest.elementCountExtra) << 32; + uint lastInvocation = min(elementCount, globals.endOffset); for (uint invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize) { iteration(propertyId, transferRequest.propertySize, transferRequest.srcAddr, transferRequest.dstAddr, invocationIndex); @@ -106,12 +108,13 @@ void main(uint32_t3 dispatchId) transferRequest.dstIndexAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 3); // Remaining elements are part of the same bitfield // TODO: Do this only using raw buffer load? - uint64_t bitfieldType = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 4); - transferRequest.elementCount = bitfieldType; - transferRequest.propertySize = bitfieldType >> 35; - transferRequest.fill = bitfieldType >> (35 + 24); - transferRequest.srcIndexSizeLog2 = bitfieldType >> (35 + 24 + 1); - transferRequest.dstIndexSizeLog2 = bitfieldType >> (35 + 24 + 1 + 2); + uint2 bitfieldType = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 4); + transferRequest.elementCount32 = bitfieldType; + transferRequest.elementCountExtra = bitfieldType; + transferRequest.propertySize = bitfieldType >> 3; + transferRequest.fill = bitfieldType >> (3 + 24); + transferRequest.srcIndexSizeLog2 = bitfieldType >> (3 + 24 + 1); + transferRequest.dstIndexSizeLog2 = bitfieldType >> (3 + 24 + 1 + 2); const uint dispatchSize = nbl::hlsl::device_capabilities_traits::maxOptimallyResidentWorkgroupInvocations; const bool fill = transferRequest.fill == 1; diff --git a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl index 917f6ed98d..f6ec3e78dd 100644 --- a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl @@ -15,12 +15,19 @@ struct TransferRequest uint64_t dstAddr; uint64_t srcIndexAddr; // IOTA default uint64_t dstIndexAddr; // IOTA default - uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers - uint64_t propertySize : 24; // all the leftover bits (just use bytes now) - uint64_t fill : 1; - // 0=uint8, 1=uint16, 2=uint32, 3=uint64 - uint64_t srcIndexSizeLog2 : 2; - uint64_t dstIndexSizeLog2 : 2; + // TODO: go back to this ideal layout when things work + //uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers + //uint64_t propertySize : 24; // all the leftover bits (just use bytes now) + //uint64_t fill : 1; + //// 0=uint8, 1=uint16, 2=uint32, 3=uint64 + //uint64_t srcIndexSizeLog2 : 2; + //uint64_t dstIndexSizeLog2 : 2; + uint32_t elementCount32; // 32 first bits + uint32_t elementCountExtra : 3; // 3 last bits + uint32_t propertySize : 24; + uint32_t fill: 1; + uint32_t srcIndexSizeLog2 : 2; + uint32_t dstIndexSizeLog2 : 2; }; struct GlobalPushContants From 4be1a3ce9fe690631a9c7855186725b051fcfaab Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Mon, 22 Jan 2024 14:43:38 -0300 Subject: [PATCH 06/18] Fix implementation problems in HLSL and write initial transferProperties function --- .../builtin/hlsl/property_pool/copy.comp.hlsl | 34 +++++---- .../builtin/hlsl/property_pool/transfer.hlsl | 5 ++ .../video/utilities/CPropertyPoolHandler.h | 8 +-- .../video/utilities/CPropertyPoolHandler.cpp | 70 +++++++++++++++++++ 4 files changed, 101 insertions(+), 16 deletions(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index bb4679098d..671c385438 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -14,16 +14,22 @@ namespace property_pools template struct TransferLoop { - void iteration(uint propertyId, uint64_t propertySize, uint64_t srcAddr, uint64_t dstAddr, uint invocationIndex) + void iteration(uint propertyId, TransferRequest transferRequest, uint invocationIndex) { - const uint64_t srcOffset = uint64_t(invocationIndex) * (uint64_t(1) << SrcIndexSizeLog2) * propertySize; - const uint64_t dstOffset = uint64_t(invocationIndex) * (uint64_t(1) << DstIndexSizeLog2) * propertySize; + const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2; + const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2; + + const uint64_t srcOffset = uint64_t(invocationIndex) * srcIndexSize * transferRequest.propertySize; + const uint64_t dstOffset = uint64_t(invocationIndex) * dstIndexSize * transferRequest.propertySize; - const uint64_t srcIndexAddress = Fill ? srcAddr + srcOffset : srcAddr; - const uint64_t dstIndexAddress = Fill ? dstAddr + dstOffset : dstAddr; + const uint64_t srcIndexAddress = Fill ? transferRequest.srcIndexAddr + srcOffset : transferRequest.srcIndexAddr; + const uint64_t dstIndexAddress = Fill ? transferRequest.dstIndexAddr + dstOffset : transferRequest.dstIndexAddr; + + const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad(srcIndexAddress); + const uint64_t dstAddressBufferOffset = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad(dstIndexAddress); - const uint64_t srcAddressMapped = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad(srcIndexAddress); - const uint64_t dstAddressMapped = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad(dstIndexAddress); + const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize; + const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize; if (SrcIndexSizeLog2 == 0) {} // we can't write individual bytes else if (SrcIndexSizeLog2 == 1) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); @@ -35,10 +41,10 @@ struct TransferLoop { uint64_t elementCount = uint64_t(transferRequest.elementCount32) | uint64_t(transferRequest.elementCountExtra) << 32; - uint lastInvocation = min(elementCount, globals.endOffset); - for (uint invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize) + uint64_t lastInvocation = min(elementCount, globals.endOffset); + for (uint64_t invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize) { - iteration(propertyId, transferRequest.propertySize, transferRequest.srcAddr, transferRequest.dstAddr, invocationIndex); + iteration(propertyId, transferRequest, invocationIndex); } } }; @@ -46,6 +52,10 @@ struct TransferLoop // For creating permutations of the functions based on parameters that are constant over the transfer request // These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any // branching within them +// +// Permutations: +// 2 (fill or not) * 2 (src index iota or not) * 2 (dst index iota or not) * 4 (src index size) * 4 (dst index size) +// Total amount of permutations: 128 template struct TransferLoopPermutationSrcIndexSizeLog @@ -76,7 +86,7 @@ struct TransferLoopPermutationSrcIota { void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { - bool dstIota = transferRequest.dstAddr == 0; + bool dstIota = transferRequest.dstIndexAddr == 0; if (dstIota) { TransferLoopPermutationDstIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } else { TransferLoopPermutationDstIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } } @@ -87,7 +97,7 @@ struct TransferLoopPermutationFill { void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { - bool srcIota = transferRequest.srcAddr == 0; + bool srcIota = transferRequest.srcIndexAddr == 0; if (srcIota) { TransferLoopPermutationSrcIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } else { TransferLoopPermutationSrcIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } } diff --git a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl index f6ec3e78dd..9290f911f6 100644 --- a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl @@ -1,6 +1,8 @@ #ifndef _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_ #define _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + namespace nbl { namespace hlsl @@ -40,6 +42,9 @@ struct GlobalPushContants // BDA address (GPU pointer) into the transfer commands buffer uint64_t transferCommandsAddress; }; + +NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128; + } } } diff --git a/include/nbl/video/utilities/CPropertyPoolHandler.h b/include/nbl/video/utilities/CPropertyPoolHandler.h index 6bd533a303..8ace625e86 100644 --- a/include/nbl/video/utilities/CPropertyPoolHandler.h +++ b/include/nbl/video/utilities/CPropertyPoolHandler.h @@ -74,12 +74,12 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ asset::SBufferRange memblock = {}; E_FLAG flags = EF_NONE; uint16_t elementSize = 0u; - uint32_t elementCount = 0u; + uint64_t elementCount = 0u; // the source or destination buffer depending on the transfer type asset::SBufferBinding buffer = {}; // can be invalid, if invalid, treated like an implicit {0,1,2,3,...} iota view - uint32_t srcAddressesOffset = IPropertyPool::invalid; - uint32_t dstAddressesOffset = IPropertyPool::invalid; + uint64_t srcAddressesOffset = IPropertyPool::invalid; + uint64_t dstAddressesOffset = IPropertyPool::invalid; }; // Fence must be not pending yet, `cmdbuf` must be already in recording state. [[nodiscard]] bool transferProperties( @@ -206,7 +206,7 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ protected: ~CPropertyPoolHandler() {} - static inline constexpr auto MaxPropertiesPerDispatch = 0u; // TODO + static inline constexpr auto MaxPropertiesPerDispatch = nbl::hlsl::property_pools::MaxPropertiesPerDispatch; static inline constexpr auto DescriptorCacheSize = 128u; diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index 341dcba1b5..94bca9c3ad 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -31,6 +31,9 @@ CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptrcreatePipelineLayout(&baseDWORD,&baseDWORD+1u); + m_pipeline = m_device->createComputePipeline(nullptr,std::move(layout),std::move(shader)); #if 0 const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); @@ -89,6 +92,73 @@ bool CPropertyPoolHandler::transferProperties( system::logger_opt_ptr logger, const uint32_t baseDWORD, const uint32_t endDWORD ) { + if (requestsBegin==requestsEnd) + return true; + if (!scratch.buffer || !scratch.buffer->getCreationParams().usage.hasFlags(IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF)) + { + logger.log("CPropertyPoolHandler: Need a valid scratch buffer which can have updates staged from the commandbuffer!",system::ILogger::ELL_ERROR); + return false; + } + // TODO: validate usage flags + uint32_t maxScratchSize = MaxPropertiesPerDispatch * sizeof(nbl::hlsl::property_pools::TransferRequest); + if (scratch.offset + maxScratchSize > scratch.buffer->getSize()) + logger.log("CPropertyPoolHandler: The scratch buffer binding provided might not be big enough in the worst case! (Scratch buffer size: %i Max scratch size: %i)", + system::ILogger::ELL_WARNING, + scratch.buffer->getSize() - scratch.offset, + maxScratchSize); + + const auto totalProps = std::distance(requestsBegin,requestsEnd); + bool success = true; + + uint32_t numberOfPasses = totalProps / MaxPropertiesPerDispatch; + nbl::hlsl::property_pools::TransferRequest transferRequestsData[MaxPropertiesPerDispatch]; + uint64_t scratchBufferDeviceAddr = m_device->getBufferDeviceAddress(scratch.buffer.get()) + scratch.offset; + uint64_t addressBufferDeviceAddr = m_device->getBufferDeviceAddress(addresses.buffer.get()) + addresses.offset; + + for (uint32_t transferPassRequestsIndex = 0; transferPassRequestsIndex < totalProps; transferPassRequestsIndex += MaxPropertiesPerDispatch) + { + const TransferRequest* transferPassRequests = requestsBegin + transferPassRequestsIndex; + uint32_t requestsThisPass = core::min(std::distance(transferPassRequests, requestsEnd), MaxPropertiesPerDispatch); + uint64_t maxElements = 0; + for (uint32_t i = 0; i < requestsThisPass; i ++) + { + auto& transferRequest = transferRequestsData[i]; + auto srcRequest = transferPassRequests + i; + transferRequest.srcAddr = m_device->getBufferDeviceAddress(srcRequest->memblock.buffer.get()) + srcRequest->memblock.offset; + transferRequest.dstAddr = m_device->getBufferDeviceAddress(srcRequest->buffer.buffer.get()) + srcRequest->buffer.offset; + transferRequest.srcIndexAddr = srcRequest->srcAddressesOffset ? addressBufferDeviceAddr + srcRequest->srcAddressesOffset : 0; + transferRequest.dstIndexAddr = srcRequest->dstAddressesOffset ? addressBufferDeviceAddr + srcRequest->dstAddressesOffset : 0; + transferRequest.elementCount32 = uint32_t(srcRequest->elementCount & (uint64_t(1) << 32) - 1); + transferRequest.elementCountExtra = uint32_t(srcRequest->elementCount >> 32); + transferRequest.propertySize = srcRequest->elementSize; + transferRequest.fill = 0; // TODO + transferRequest.srcIndexSizeLog2 = 1u; // TODO + transferRequest.dstIndexSizeLog2 = 1u; // TODO + + maxElements = core::max(maxElements, srcRequest->elementCount); + } + cmdbuf->updateBuffer(scratch.buffer.get(),scratch.offset,sizeof(TransferRequest)*requestsThisPass,transferRequestsData); + // TODO: pipeline barrier + cmdbuf->bindComputePipeline(m_pipeline.get()); + + nbl::hlsl::property_pools::GlobalPushContants pushConstants; + { + pushConstants.beginOffset = baseDWORD; + pushConstants.endOffset = endDWORD; + pushConstants.transferCommandsAddress = scratchBufferDeviceAddr; + } + cmdbuf->pushConstants(m_pipeline->getLayout(), asset::IShader::ESS_COMPUTE, 0u, sizeof(nbl::hlsl::property_pools::GlobalPushContants), &pushConstants); + + // dispatch + { + const auto& limits = m_device->getPhysicalDevice()->getLimits(); + const auto invocationCoarseness = limits.maxOptimallyResidentWorkgroupInvocations * requestsThisPass; + cmdbuf->dispatch(limits.computeOptimalPersistentWorkgroupDispatchSize(maxElements,invocationCoarseness), requestsThisPass, 1u); + } + // TODO: pipeline barrier + } + + return success; #if 0 if (requestsBegin==requestsEnd) return true; From c44bb49031916c07d9fa06c9db21aa34aba8b529 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sat, 27 Jan 2024 11:20:08 -0300 Subject: [PATCH 07/18] Fix merge build issues --- include/nbl/video/utilities/CPropertyPoolHandler.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/utilities/CPropertyPoolHandler.h b/include/nbl/video/utilities/CPropertyPoolHandler.h index ad38cdfece..2c6f9eeac5 100644 --- a/include/nbl/video/utilities/CPropertyPoolHandler.h +++ b/include/nbl/video/utilities/CPropertyPoolHandler.h @@ -81,7 +81,7 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ }; // Fence must be not pending yet, `cmdbuf` must be already in recording state. [[nodiscard]] bool transferProperties( - IGPUCommandBuffer* const cmdbuf, IGPUFence* const fence, + IGPUCommandBuffer* const cmdbuf, //IGPUFence* const fence, const asset::SBufferBinding& scratch, const asset::SBufferBinding& addresses, const TransferRequest* const requestsBegin, const TransferRequest* const requestsEnd, system::logger_opt_ptr logger, const uint32_t baseDWORD=0u, const uint32_t endDWORD=~0ull @@ -177,8 +177,9 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ system::logger_opt_ptr logger, const std::chrono::steady_clock::time_point& maxWaitPoint=std::chrono::steady_clock::now()+std::chrono::microseconds(500u) ); #endif - -#if 0 // TODO: freeing properties + +// TODO: freeing properties +#if 0 // utility to help you fill out the tail move scatter request after the free, properly, returns if you actually need to transfer anything static inline bool freeProperties(IPropertyPool* pool, UpStreamingRequest* requests, const uint32_t* indicesBegin, const uint32_t* indicesEnd, uint32_t* srcAddresses, uint32_t* dstAddresses) { @@ -213,7 +214,7 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ uint32_t m_alignment; }; -#endif } -#endif \ No newline at end of file + +#endif From 9460e245a1148173585cdfe68593dc23ba3502e5 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sat, 27 Jan 2024 18:38:32 -0300 Subject: [PATCH 08/18] Fix vulkan_1_3 incompatibilities --- .../video/utilities/CPropertyPoolHandler.cpp | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index cc494fef46..7714a01a1d 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -12,28 +12,28 @@ CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr(core::smart_refctd_ptr(system)); - video::IGPUObjectFromAssetConverter CPU2GPU; - video::IGPUObjectFromAssetConverter::SParams cpu2gpuParams; - cpu2gpuParams.assetManager = assetManager.get(); - cpu2gpuParams.device = m_device.get(); - auto loadShader = [&](const char* path) - { - asset::IAssetLoader::SAssetLoadParams params = {}; - auto shader = core::smart_refctd_ptr_static_cast(*assetManager->getAsset(path, params).getContents().begin()); - shader->setSpecializationInfo(asset::ISpecializedShader::SInfo(nullptr, nullptr, "main")); - assert(shader); + { + asset::IAssetLoader::SAssetLoadParams params = {}; + auto assetBundle = assetManager->getAsset(path, params); + auto assets = assetBundle.getContents(); + assert(!assets.empty()); + + auto cpuShader = asset::IAsset::castDown(assets[0]); + auto shader = m_device->createShader(cpuShader.get()); + return shader; + }; + auto shader = loadShader("../../../include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl"); + const asset::SPushConstantRange baseDWORD = { asset::IShader::ESS_COMPUTE,0u,sizeof(nbl::hlsl::property_pools::GlobalPushContants) }; + auto layout = m_device->createPipelineLayout({ &baseDWORD,1u }); - auto gpuShaders = CPU2GPU.getGPUObjectsFromAssets(&shader, &shader + 1u, cpu2gpuParams); - auto gpuShader = gpuShaders->begin()[0u]; - assert(gpuShader); + { + video::IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = shader.get(); - return gpuShader; - }; - auto shader = loadShader("../../../include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl"); - const asset::SPushConstantRange baseDWORD = {asset::IShader::ESS_COMPUTE,0u,sizeof(nbl::hlsl::property_pools::GlobalPushContants)}; - auto layout = m_device->createPipelineLayout(&baseDWORD,&baseDWORD+1u); - m_pipeline = m_device->createComputePipeline(nullptr,std::move(layout),std::move(shader)); + m_device->createComputePipelines(nullptr, { ¶ms, 1 }, &m_pipeline); + } #if 0 const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); @@ -86,7 +86,7 @@ CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr& scratch, const asset::SBufferBinding& addresses, const TransferRequest* const requestsBegin, const TransferRequest* const requestsEnd, system::logger_opt_ptr logger, const uint32_t baseDWORD, const uint32_t endDWORD @@ -112,8 +112,8 @@ bool CPropertyPoolHandler::transferProperties( uint32_t numberOfPasses = totalProps / MaxPropertiesPerDispatch; nbl::hlsl::property_pools::TransferRequest transferRequestsData[MaxPropertiesPerDispatch]; - uint64_t scratchBufferDeviceAddr = m_device->getBufferDeviceAddress(scratch.buffer.get()) + scratch.offset; - uint64_t addressBufferDeviceAddr = m_device->getBufferDeviceAddress(addresses.buffer.get()) + addresses.offset; + uint64_t scratchBufferDeviceAddr = scratch.buffer.get()->getDeviceAddress() + scratch.offset; + uint64_t addressBufferDeviceAddr = addresses.buffer.get()->getDeviceAddress() + addresses.offset; for (uint32_t transferPassRequestsIndex = 0; transferPassRequestsIndex < totalProps; transferPassRequestsIndex += MaxPropertiesPerDispatch) { @@ -124,8 +124,8 @@ bool CPropertyPoolHandler::transferProperties( { auto& transferRequest = transferRequestsData[i]; auto srcRequest = transferPassRequests + i; - transferRequest.srcAddr = m_device->getBufferDeviceAddress(srcRequest->memblock.buffer.get()) + srcRequest->memblock.offset; - transferRequest.dstAddr = m_device->getBufferDeviceAddress(srcRequest->buffer.buffer.get()) + srcRequest->buffer.offset; + transferRequest.srcAddr = srcRequest->memblock.buffer.get()->getDeviceAddress() + srcRequest->memblock.offset; + transferRequest.dstAddr = srcRequest->buffer.buffer.get()->getDeviceAddress() + srcRequest->buffer.offset; transferRequest.srcIndexAddr = srcRequest->srcAddressesOffset ? addressBufferDeviceAddr + srcRequest->srcAddressesOffset : 0; transferRequest.dstIndexAddr = srcRequest->dstAddressesOffset ? addressBufferDeviceAddr + srcRequest->dstAddressesOffset : 0; transferRequest.elementCount32 = uint32_t(srcRequest->elementCount & (uint64_t(1) << 32) - 1); @@ -137,7 +137,7 @@ bool CPropertyPoolHandler::transferProperties( maxElements = core::max(maxElements, srcRequest->elementCount); } - cmdbuf->updateBuffer(scratch.buffer.get(),scratch.offset,sizeof(TransferRequest)*requestsThisPass,transferRequestsData); + cmdbuf->updateBuffer({ scratch.offset,sizeof(TransferRequest) * requestsThisPass, core::smart_refctd_ptr(scratch.buffer) }, transferRequestsData); // TODO: pipeline barrier cmdbuf->bindComputePipeline(m_pipeline.get()); From 88d1d001717058f1fb5d5dcf5c951f3705a3b763 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sat, 27 Jan 2024 21:35:27 -0300 Subject: [PATCH 09/18] Add temporary fill for testing --- include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index 671c385438..3d2a88a2d9 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -129,8 +129,9 @@ void main(uint32_t3 dispatchId) const uint dispatchSize = nbl::hlsl::device_capabilities_traits::maxOptimallyResidentWorkgroupInvocations; const bool fill = transferRequest.fill == 1; - if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } - else { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } + vk::RawBufferStore(transferRequest.dstAddr, 69); + // if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } + // else { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } } } From 52d6972c6f6beff0fa152805c3616ad8c8f767cf Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Mon, 29 Jan 2024 15:11:39 -0300 Subject: [PATCH 10/18] WIP testing --- .../builtin/hlsl/property_pool/copy.comp.hlsl | 30 ++++++++++--------- .../builtin/hlsl/property_pool/transfer.hlsl | 8 +++-- .../video/utilities/CPropertyPoolHandler.cpp | 2 +- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index 3d2a88a2d9..28d613d5f4 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -14,13 +14,13 @@ namespace property_pools template struct TransferLoop { - void iteration(uint propertyId, TransferRequest transferRequest, uint invocationIndex) + void iteration(uint propertyId, TransferRequest transferRequest, uint64_t invocationIndex) { const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2; const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2; - const uint64_t srcOffset = uint64_t(invocationIndex) * srcIndexSize * transferRequest.propertySize; - const uint64_t dstOffset = uint64_t(invocationIndex) * dstIndexSize * transferRequest.propertySize; + const uint64_t srcOffset = invocationIndex * srcIndexSize * transferRequest.propertySize; + const uint64_t dstOffset = invocationIndex * dstIndexSize * transferRequest.propertySize; const uint64_t srcIndexAddress = Fill ? transferRequest.srcIndexAddr + srcOffset : transferRequest.srcIndexAddr; const uint64_t dstIndexAddress = Fill ? transferRequest.dstIndexAddr + dstOffset : transferRequest.dstIndexAddr; @@ -112,26 +112,28 @@ void main(uint32_t3 dispatchId) // Loading transfer request from the pointer (can't use struct // with BDA on HLSL SPIRV) TransferRequest transferRequest; - transferRequest.srcAddr = vk::RawBufferLoad(globals.transferCommandsAddress); + transferRequest.srcAddr = vk::RawBufferLoad(globals.transferCommandsAddress) | vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint)) << 32; transferRequest.dstAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t)); transferRequest.srcIndexAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 2); transferRequest.dstIndexAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 3); // Remaining elements are part of the same bitfield // TODO: Do this only using raw buffer load? - uint2 bitfieldType = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 4); - transferRequest.elementCount32 = bitfieldType; - transferRequest.elementCountExtra = bitfieldType; - transferRequest.propertySize = bitfieldType >> 3; - transferRequest.fill = bitfieldType >> (3 + 24); - transferRequest.srcIndexSizeLog2 = bitfieldType >> (3 + 24 + 1); - transferRequest.dstIndexSizeLog2 = bitfieldType >> (3 + 24 + 1 + 2); + uint64_t bitfieldType = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 4); + transferRequest.elementCount32 = uint32_t(bitfieldType); + transferRequest.elementCountExtra = uint32_t(bitfieldType); + transferRequest.propertySize = uint32_t(bitfieldType >> 3); + transferRequest.fill = uint32_t(bitfieldType >> (3 + 24)); + transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (3 + 24 + 1)); + transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (3 + 24 + 1 + 2)); const uint dispatchSize = nbl::hlsl::device_capabilities_traits::maxOptimallyResidentWorkgroupInvocations; const bool fill = transferRequest.fill == 1; - vk::RawBufferStore(transferRequest.dstAddr, 69); - // if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } - // else { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } + vk::RawBufferStore(globals.transferCommandsAddress + 40 * 3, transferRequest.srcAddr); + vk::RawBufferStore(globals.transferCommandsAddress + 40 * 4, transferRequest.dstAddr); + vk::RawBufferStore(globals.transferCommandsAddress + 40 * 5, vk::RawBufferLoad(transferRequest.srcAddr + sizeof(uint16_t) * 3)); + //if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } + //else { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } } } diff --git a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl index 9290f911f6..9805d2e4d4 100644 --- a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl @@ -18,6 +18,10 @@ struct TransferRequest uint64_t srcIndexAddr; // IOTA default uint64_t dstIndexAddr; // IOTA default // TODO: go back to this ideal layout when things work + // (Getting a fatal error from DXC when using 64-bit bitfields:) + // fatal error: generated SPIR-V is invalid: [VUID-StandaloneSpirv-Base-04781] Expected 32-bit int type for Base operand: BitFieldInsert + // %58 = OpBitFieldInsert %ulong %42 %57 %uint_0 %uint_35 + // //uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers //uint64_t propertySize : 24; // all the leftover bits (just use bytes now) //uint64_t fill : 1; @@ -34,13 +38,13 @@ struct TransferRequest struct GlobalPushContants { + // BDA address (GPU pointer) into the transfer commands buffer + uint64_t transferCommandsAddress; // Define the range of invocations (X axis) that will be transfered over in this dispatch // May be sectioned off in the case of overflow or any other situation that doesn't allow // for a full transfer uint64_t beginOffset; uint64_t endOffset; - // BDA address (GPU pointer) into the transfer commands buffer - uint64_t transferCommandsAddress; }; NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128; diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index 7714a01a1d..79dd9091ef 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -147,7 +147,7 @@ bool CPropertyPoolHandler::transferProperties( pushConstants.endOffset = endDWORD; pushConstants.transferCommandsAddress = scratchBufferDeviceAddr; } - cmdbuf->pushConstants(m_pipeline->getLayout(), asset::IShader::ESS_COMPUTE, 0u, sizeof(nbl::hlsl::property_pools::GlobalPushContants), &pushConstants); + cmdbuf->pushConstants(m_pipeline->getLayout(), asset::IShader::ESS_COMPUTE, 0u, sizeof(pushConstants), &pushConstants); // dispatch { From 706000dfbf2b5a88a7fc71ed6a7ab30d7a0d4281 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Tue, 30 Jan 2024 17:13:46 -0300 Subject: [PATCH 11/18] Update to latest testing --- include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index 28d613d5f4..5911194495 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -131,7 +131,7 @@ void main(uint32_t3 dispatchId) vk::RawBufferStore(globals.transferCommandsAddress + 40 * 3, transferRequest.srcAddr); vk::RawBufferStore(globals.transferCommandsAddress + 40 * 4, transferRequest.dstAddr); - vk::RawBufferStore(globals.transferCommandsAddress + 40 * 5, vk::RawBufferLoad(transferRequest.srcAddr + sizeof(uint16_t) * 3)); + //vk::RawBufferStore(globals.transferCommandsAddress + 40 * 5, vk::RawBufferLoad(transferRequest.srcAddr + sizeof(uint16_t) * 3)); //if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } //else { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } } From b625153b1d7137f20303b80f451dc1a019ffde07 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Tue, 30 Jan 2024 17:15:07 -0300 Subject: [PATCH 12/18] Diff that breaks things --- include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index 5911194495..28d613d5f4 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -131,7 +131,7 @@ void main(uint32_t3 dispatchId) vk::RawBufferStore(globals.transferCommandsAddress + 40 * 3, transferRequest.srcAddr); vk::RawBufferStore(globals.transferCommandsAddress + 40 * 4, transferRequest.dstAddr); - //vk::RawBufferStore(globals.transferCommandsAddress + 40 * 5, vk::RawBufferLoad(transferRequest.srcAddr + sizeof(uint16_t) * 3)); + vk::RawBufferStore(globals.transferCommandsAddress + 40 * 5, vk::RawBufferLoad(transferRequest.srcAddr + sizeof(uint16_t) * 3)); //if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } //else { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } } From ef4b779202c6fe30a0cecb7e85cc4db8368ed4c2 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Mon, 12 Feb 2024 17:43:56 -0300 Subject: [PATCH 13/18] WIP Suballocated descriptor set --- .../video/alloc/SubAllocatedDescriptorSet.h | 120 ++++++++++++++++++ .../video/utilities/CPropertyPoolHandler.cpp | 8 ++ 2 files changed, 128 insertions(+) create mode 100644 include/nbl/video/alloc/SubAllocatedDescriptorSet.h diff --git a/include/nbl/video/alloc/SubAllocatedDescriptorSet.h b/include/nbl/video/alloc/SubAllocatedDescriptorSet.h new file mode 100644 index 0000000000..2bf9a37022 --- /dev/null +++ b/include/nbl/video/alloc/SubAllocatedDescriptorSet.h @@ -0,0 +1,120 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_VIDEO_SUB_ALLOCATED_DESCRIPTOR_SET_H_ +#define _NBL_VIDEO_SUB_ALLOCATED_DESCRIPTOR_SET_H + +#include "nbl/video/alloc/IBufferAllocator.h" + +#include + +namespace nbl::video +{ + +// address allocator gives offsets +// reserved allocator allocates memory to keep the address allocator state inside +template> +class SubAllocatedDescriptorSet : public core::IReferenceCounted +{ + public: + using AddressAllocator = AddrAllocator; + using ReservedAllocator = ReservAllocator; + using size_type = typename AddressAllocator::size_type; + using value_type = typename AddressAllocator::size_type; + static constexpr value_type invalid_value = AddressAllocator::invalid_address; + + // constructors + template + inline SubAllocatedDescriptorSet(const std::span bindings, + ReservedAllocator&& _reservedAllocator, const value_type maxAllocatableAlignment, Args&&... args) + { + auto allocatableDescriptors = 0; + m_allocatableRanges.reserve(bindings.size()); + + for (auto& binding : bindings) + { + SubAllocDescriptorSetRange range; + range.offset = allocatableDescriptors; + range.binding = binding; + // Only bindings with these flags will be allocatable + if (binding.createFlags.hasFlags(core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) + | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT + | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT)) + { + allocatableDescriptors += binding.count; + } + m_allocatableRanges.push_back(range); + } + + m_addressAllocator = AddrAllocator( + _reservedAllocator.allocate(AddressAllocator::reserved_size(maxAllocatableAlignment, static_cast(allocatableDescriptors), args...), _NBL_SIMD_ALIGNMENT), + static_cast(allocatableDescriptors), 0u, maxAllocatableAlignment, static_cast(allocatableDescriptors), std::forward(args)... + ); + m_reservedAllocator = ReservedAllocator(std::move(_reservedAllocator)); + m_reservedSize = allocatableDescriptors; + } + // version with default constructed reserved allocator + template + explicit inline SubAllocatedDescriptorSet(const std::span bindings, + const value_type maxAllocatableAlignment, Args&&... args) : + SubAllocatedDescriptorSet(bindings,ReservedAllocator(),maxAllocatableAlignment,std::forward(args)...) + { + } + ~SubAllocatedDescriptorSet() + { + auto ptr = reinterpret_cast(core::address_allocator_traits::getReservedSpacePtr(m_addressAllocator)); + m_reservedAllocator.deallocate(const_cast(ptr),m_reservedSize); + } + + // anyone gonna use it? + inline const AddressAllocator& getAddressAllocator() const {return m_addressAllocator;} + + // + inline ReservedAllocator& getReservedAllocator() {return m_reservedAllocator;} + + // main methods + + //! Warning `outAddresses` needs to be primed with `invalid_value` values, otherwise no allocation happens for elements not equal to `invalid_value` + template + inline void multi_allocate(uint32_t count, value_type* outAddresses, const size_type* byteSizes, const size_type* alignments, const Args&... args) + { + core::address_allocator_traits::multi_alloc_addr(m_addressAllocator,count,outAddresses,byteSizes,alignments,args...); + } + template + inline void multi_deallocate(Args&&... args) + { + core::address_allocator_traits::multi_free_addr(m_addressAllocator,std::forward(args)...); + } + + // to conform to IBufferAllocator concept + template + inline value_type allocate(const size_type bytes, const size_type alignment, const Args&... args) + { + value_type retval = invalid_value; + multi_allocate(&retval,&bytes,&alignment,args...); + return retval; + } + template + inline void deallocate(value_type& allocation, Args&&... args) + { + multi_deallocate(std::forward(args)...); + allocation = invalid_value; + } + + protected: + AddressAllocator m_addressAllocator; + ReservedAllocator m_reservedAllocator; + size_t m_reservedSize; // FIXME: uninitialized variable + + struct SubAllocDescriptorSetRange { + uint32_t offset; + video::IGPUDescriptorSetLayout::SBinding binding; + }; + std::vector m_allocatableRanges = {}; +}; + +} + +#endif + diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index 79dd9091ef..c1d2d7ee5b 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -84,6 +84,10 @@ CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr(maxElements, srcRequest->elementCount); } @@ -147,6 +153,8 @@ bool CPropertyPoolHandler::transferProperties( pushConstants.endOffset = endDWORD; pushConstants.transferCommandsAddress = scratchBufferDeviceAddr; } + assert(getAlignment(scratchBufferDeviceAddr) == 0); + assert(getAlignment(sizeof(TransferRequest)) == 0); cmdbuf->pushConstants(m_pipeline->getLayout(), asset::IShader::ESS_COMPUTE, 0u, sizeof(pushConstants), &pushConstants); // dispatch From b8db8c98a1539f580409409cd63d04e1f291192f Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Tue, 13 Feb 2024 10:23:37 -0300 Subject: [PATCH 14/18] Work on sub allocated descriptor set --- .../nbl/core/alloc/address_allocator_traits.h | 20 +++++++++++++++++++ .../video/alloc/SubAllocatedDescriptorSet.h | 11 +++++----- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/include/nbl/core/alloc/address_allocator_traits.h b/include/nbl/core/alloc/address_allocator_traits.h index 293dc3503e..f6c522bc53 100644 --- a/include/nbl/core/alloc/address_allocator_traits.h +++ b/include/nbl/core/alloc/address_allocator_traits.h @@ -53,6 +53,18 @@ namespace nbl::core } } + static inline void multi_alloc_addr(AddressAlloc& alloc, uint32_t count, size_type* outAddresses, const size_type* bytes, + const size_type alignment, const size_type* hint=nullptr) noexcept + { + for (uint32_t i=0; i::value>::multi_alloc_addr( + alloc,std::min(count-i,maxMultiOps),outAddresses+i,bytes+i,alignment,hint ? (hint+i):nullptr); + } + static inline void multi_free_addr(AddressAlloc& alloc, uint32_t count, const size_type* addr, const size_type* bytes) noexcept { for (uint32_t i=0; i(allocatableDescriptors), args...), _NBL_SIMD_ALIGNMENT), - static_cast(allocatableDescriptors), 0u, maxAllocatableAlignment, static_cast(allocatableDescriptors), std::forward(args)... + static_cast(0), 0u, maxAllocatableAlignment, static_cast(allocatableDescriptors), std::forward(args)... ); m_reservedAllocator = ReservedAllocator(std::move(_reservedAllocator)); m_reservedSize = allocatableDescriptors; @@ -77,14 +77,13 @@ class SubAllocatedDescriptorSet : public core::IReferenceCounted //! Warning `outAddresses` needs to be primed with `invalid_value` values, otherwise no allocation happens for elements not equal to `invalid_value` template - inline void multi_allocate(uint32_t count, value_type* outAddresses, const size_type* byteSizes, const size_type* alignments, const Args&... args) + inline void multi_allocate(uint32_t count, value_type* outAddresses, const size_type* sizes, const Args&... args) { - core::address_allocator_traits::multi_alloc_addr(m_addressAllocator,count,outAddresses,byteSizes,alignments,args...); + core::address_allocator_traits::multi_alloc_addr(m_addressAllocator,count,outAddresses,sizes,1,args...); } - template - inline void multi_deallocate(Args&&... args) + inline void multi_deallocate(uint32_t count, const size_type* addr, const size_type* sizes) { - core::address_allocator_traits::multi_free_addr(m_addressAllocator,std::forward(args)...); + core::address_allocator_traits::multi_free_addr(m_addressAllocator,count,addr,sizes); } // to conform to IBufferAllocator concept From 1a0c998c95d04f2ed0b23fe191b0c999d6ae716f Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 14 Feb 2024 16:31:28 -0300 Subject: [PATCH 15/18] Apply fixes to property pool stuff --- .../builtin/hlsl/property_pool/copy.comp.hlsl | 61 ++++++++++++------- .../video/utilities/CPropertyPoolHandler.cpp | 17 +++++- 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index 28d613d5f4..470a7ff5f5 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -19,18 +19,20 @@ struct TransferLoop const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2; const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2; - const uint64_t srcOffset = invocationIndex * srcIndexSize * transferRequest.propertySize; - const uint64_t dstOffset = invocationIndex * dstIndexSize * transferRequest.propertySize; + // Fill: Always use offset 0 on src + const uint64_t srcOffset = Fill ? 0 : invocationIndex * transferRequest.propertySize; + const uint64_t dstOffset = invocationIndex * transferRequest.propertySize; - const uint64_t srcIndexAddress = Fill ? transferRequest.srcIndexAddr + srcOffset : transferRequest.srcIndexAddr; - const uint64_t dstIndexAddress = Fill ? transferRequest.dstIndexAddr + dstOffset : transferRequest.dstIndexAddr; - - const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad(srcIndexAddress); - const uint64_t dstAddressBufferOffset = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad(dstIndexAddress); + // IOTA: Use the index as the fetching offset + // Non IOTA: Read the address buffer ("index buffer") to select fetching offset + const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcOffset : vk::RawBufferLoad(transferRequest.srcIndexAddr + srcOffset * sizeof(uint32_t)); + const uint64_t dstAddressBufferOffset = DstIndexIota ? dstOffset : vk::RawBufferLoad(transferRequest.dstIndexAddr + dstOffset * sizeof(uint32_t)); const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize; const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize; + //vk::RawBufferStore(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2, srcAddressMapped,8); + //vk::RawBufferStore(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2 + sizeof(uint64_t), dstAddressMapped,8); if (SrcIndexSizeLog2 == 0) {} // we can't write individual bytes else if (SrcIndexSizeLog2 == 1) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); else if (SrcIndexSizeLog2 == 2) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); @@ -111,36 +113,49 @@ void main(uint32_t3 dispatchId) // Loading transfer request from the pointer (can't use struct // with BDA on HLSL SPIRV) + uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof(TransferRequest) * propertyId; TransferRequest transferRequest; - transferRequest.srcAddr = vk::RawBufferLoad(globals.transferCommandsAddress) | vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint)) << 32; - transferRequest.dstAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t)); - transferRequest.srcIndexAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 2); - transferRequest.dstIndexAddr = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 3); + transferRequest.srcAddr = vk::RawBufferLoad(transferCmdAddr,8); + transferRequest.dstAddr = vk::RawBufferLoad(transferCmdAddr + sizeof(uint64_t),8); + transferRequest.srcIndexAddr = vk::RawBufferLoad(transferCmdAddr + sizeof(uint64_t) * 2,8); + transferRequest.dstIndexAddr = vk::RawBufferLoad(transferCmdAddr + sizeof(uint64_t) * 3,8); // Remaining elements are part of the same bitfield // TODO: Do this only using raw buffer load? - uint64_t bitfieldType = vk::RawBufferLoad(globals.transferCommandsAddress + sizeof(uint64_t) * 4); + uint64_t bitfieldType = vk::RawBufferLoad(transferCmdAddr + sizeof(uint64_t) * 4,8); transferRequest.elementCount32 = uint32_t(bitfieldType); - transferRequest.elementCountExtra = uint32_t(bitfieldType); - transferRequest.propertySize = uint32_t(bitfieldType >> 3); - transferRequest.fill = uint32_t(bitfieldType >> (3 + 24)); - transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (3 + 24 + 1)); - transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (3 + 24 + 1 + 2)); + transferRequest.elementCountExtra = uint32_t(bitfieldType >> 32); + transferRequest.propertySize = uint32_t(bitfieldType >> (32 + 3)); + transferRequest.fill = uint32_t(bitfieldType >> (32 + 3 + 24)); + transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1)); + transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1 + 2)); const uint dispatchSize = nbl::hlsl::device_capabilities_traits::maxOptimallyResidentWorkgroupInvocations; const bool fill = transferRequest.fill == 1; - vk::RawBufferStore(globals.transferCommandsAddress + 40 * 3, transferRequest.srcAddr); - vk::RawBufferStore(globals.transferCommandsAddress + 40 * 4, transferRequest.dstAddr); - vk::RawBufferStore(globals.transferCommandsAddress + 40 * 5, vk::RawBufferLoad(transferRequest.srcAddr + sizeof(uint16_t) * 3)); - //if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } - //else { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } + //uint64_t debugWriteAddr = transferRequest.dstAddr + sizeof(uint64_t) * 9 * propertyId; + //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 0, transferRequest.srcAddr,8); + //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 1, transferRequest.dstAddr,8); + //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 2, transferRequest.srcIndexAddr,8); + //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 3, transferRequest.dstIndexAddr,8); + //uint64_t elementCount = uint64_t(transferRequest.elementCount32) + // | uint64_t(transferRequest.elementCountExtra) << 32; + //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 4, elementCount,8); + //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 5, transferRequest.propertySize,4); + //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 6, transferRequest.fill,4); + //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 7, transferRequest.srcIndexSizeLog2,4); + //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 8, transferRequest.dstIndexSizeLog2,4); + //vk::RawBufferStore(transferRequest.dstAddr + sizeof(uint64_t) * invocationIndex, invocationIndex,8); + + if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } + else { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } } } } } -[numthreads(1,1,1)] +// TODO: instead use some sort of replace function for getting optimal size? +[numthreads(512,1,1)] void main(uint32_t3 dispatchId : SV_DispatchThreadID) { nbl::hlsl::property_pools::main(dispatchId); diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index c1d2d7ee5b..065f1ed4da 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -130,8 +130,8 @@ bool CPropertyPoolHandler::transferProperties( auto srcRequest = transferPassRequests + i; transferRequest.srcAddr = srcRequest->memblock.buffer.get()->getDeviceAddress() + srcRequest->memblock.offset; transferRequest.dstAddr = srcRequest->buffer.buffer.get()->getDeviceAddress() + srcRequest->buffer.offset; - transferRequest.srcIndexAddr = srcRequest->srcAddressesOffset ? addressBufferDeviceAddr + srcRequest->srcAddressesOffset : 0; - transferRequest.dstIndexAddr = srcRequest->dstAddressesOffset ? addressBufferDeviceAddr + srcRequest->dstAddressesOffset : 0; + transferRequest.srcIndexAddr = srcRequest->srcAddressesOffset != IPropertyPool::invalid ? addressBufferDeviceAddr + srcRequest->srcAddressesOffset : 0; + transferRequest.dstIndexAddr = srcRequest->dstAddressesOffset != IPropertyPool::invalid ? addressBufferDeviceAddr + srcRequest->dstAddressesOffset : 0; transferRequest.elementCount32 = uint32_t(srcRequest->elementCount & (uint64_t(1) << 32) - 1); transferRequest.elementCountExtra = uint32_t(srcRequest->elementCount >> 32); transferRequest.propertySize = srcRequest->elementSize; @@ -144,7 +144,18 @@ bool CPropertyPoolHandler::transferProperties( maxElements = core::max(maxElements, srcRequest->elementCount); } cmdbuf->updateBuffer({ scratch.offset,sizeof(TransferRequest) * requestsThisPass, core::smart_refctd_ptr(scratch.buffer) }, transferRequestsData); - // TODO: pipeline barrier + + const asset::SMemoryBarrier barriers[1] = { { + .srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = asset::ACCESS_FLAGS::SHADER_READ_BITS + } }; + cmdbuf->pipelineBarrier(asset::EDF_NONE,IGPUCommandBuffer::SPipelineBarrierDependencyInfo{ + .memBarriers = barriers + // TODO: .bufBarriers = instead + }); + cmdbuf->bindComputePipeline(m_pipeline.get()); nbl::hlsl::property_pools::GlobalPushContants pushConstants; From 99d80a7b656071a6f3d28f11d56bf2ca22db0e72 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 14 Feb 2024 22:25:45 -0300 Subject: [PATCH 16/18] PR review fixes --- .../builtin/hlsl/property_pool/copy.comp.hlsl | 89 +++++++++---------- .../builtin/hlsl/property_pool/transfer.hlsl | 8 +- .../video/utilities/CPropertyPoolHandler.h | 3 +- include/nbl/video/utilities/IPropertyPool.h | 3 +- .../video/utilities/CPropertyPoolHandler.cpp | 15 ++-- 5 files changed, 57 insertions(+), 61 deletions(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index 470a7ff5f5..f9fe2ecac3 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -9,7 +9,7 @@ namespace hlsl namespace property_pools { -[[vk::push_constant]] GlobalPushContants globals; +[[vk::push_constant]] TransferDispatchInfo globals; template struct TransferLoop @@ -39,12 +39,12 @@ struct TransferLoop else if (SrcIndexSizeLog2 == 3) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); } - void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) + void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { uint64_t elementCount = uint64_t(transferRequest.elementCount32) | uint64_t(transferRequest.elementCountExtra) << 32; - uint64_t lastInvocation = min(elementCount, globals.endOffset); - for (uint64_t invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize) + uint64_t lastInvocation = min(elementCount, dispatchInfo.endOffset); + for (uint64_t invocationIndex = dispatchInfo.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize) { iteration(propertyId, transferRequest, invocationIndex); } @@ -62,58 +62,53 @@ struct TransferLoop template struct TransferLoopPermutationSrcIndexSizeLog { - void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) + void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { - if (transferRequest.dstIndexSizeLog2 == 0) { TransferLoop loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } - else if (transferRequest.dstIndexSizeLog2 == 1) { TransferLoop loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } - else if (transferRequest.dstIndexSizeLog2 == 2) { TransferLoop loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } - else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ { TransferLoop loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + if (transferRequest.dstIndexSizeLog2 == 0) { TransferLoop loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else if (transferRequest.dstIndexSizeLog2 == 1) { TransferLoop loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else if (transferRequest.dstIndexSizeLog2 == 2) { TransferLoop loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ { TransferLoop loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } } }; template struct TransferLoopPermutationDstIota { - void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) + void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { - if (transferRequest.srcIndexSizeLog2 == 0) { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } - else if (transferRequest.srcIndexSizeLog2 == 1) { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } - else if (transferRequest.srcIndexSizeLog2 == 2) { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } - else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + if (transferRequest.srcIndexSizeLog2 == 0) { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else if (transferRequest.srcIndexSizeLog2 == 1) { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else if (transferRequest.srcIndexSizeLog2 == 2) { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ { TransferLoopPermutationSrcIndexSizeLog loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } } }; template struct TransferLoopPermutationSrcIota { - void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) + void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { bool dstIota = transferRequest.dstIndexAddr == 0; - if (dstIota) { TransferLoopPermutationDstIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } - else { TransferLoopPermutationDstIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + if (dstIota) { TransferLoopPermutationDstIota loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else { TransferLoopPermutationDstIota loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } } }; template struct TransferLoopPermutationFill { - void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) + void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) { bool srcIota = transferRequest.srcIndexAddr == 0; - if (srcIota) { TransferLoopPermutationSrcIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } - else { TransferLoopPermutationSrcIota loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + if (srcIota) { TransferLoopPermutationSrcIota loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } + else { TransferLoopPermutationSrcIota loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } } }; -template -void main(uint32_t3 dispatchId) -{ - const uint propertyId = dispatchId.y; - const uint invocationIndex = dispatchId.x; - - // Loading transfer request from the pointer (can't use struct - // with BDA on HLSL SPIRV) - uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof(TransferRequest) * propertyId; +// Loading transfer request from the pointer (can't use struct +// with BDA on HLSL SPIRV) +static TransferRequest TransferRequest::newFromAddress(const uint64_t transferCmdAddr) +{ TransferRequest transferRequest; transferRequest.srcAddr = vk::RawBufferLoad(transferCmdAddr,8); transferRequest.dstAddr = vk::RawBufferLoad(transferCmdAddr + sizeof(uint64_t),8); @@ -129,35 +124,31 @@ void main(uint32_t3 dispatchId) transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1)); transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1 + 2)); - const uint dispatchSize = nbl::hlsl::device_capabilities_traits::maxOptimallyResidentWorkgroupInvocations; + return transferRequest; +} + +template +void main(uint32_t3 dispatchId, const uint dispatchSize) +{ + const uint propertyId = dispatchId.y; + const uint invocationIndex = dispatchId.x; + + uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof(TransferRequest) * propertyId; + TransferRequest transferRequest = TransferRequest::newFromAddress(transferCmdAddr); + const bool fill = transferRequest.fill == 1; - //uint64_t debugWriteAddr = transferRequest.dstAddr + sizeof(uint64_t) * 9 * propertyId; - //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 0, transferRequest.srcAddr,8); - //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 1, transferRequest.dstAddr,8); - //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 2, transferRequest.srcIndexAddr,8); - //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 3, transferRequest.dstIndexAddr,8); - //uint64_t elementCount = uint64_t(transferRequest.elementCount32) - // | uint64_t(transferRequest.elementCountExtra) << 32; - //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 4, elementCount,8); - //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 5, transferRequest.propertySize,4); - //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 6, transferRequest.fill,4); - //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 7, transferRequest.srcIndexSizeLog2,4); - //vk::RawBufferStore(debugWriteAddr + sizeof(uint64_t) * 8, transferRequest.dstIndexSizeLog2,4); - //vk::RawBufferStore(transferRequest.dstAddr + sizeof(uint64_t) * invocationIndex, invocationIndex,8); - - if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } - else { TransferLoopPermutationFill loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); } + if (fill) { TransferLoopPermutationFill loop; loop.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize); } + else { TransferLoopPermutationFill loop; loop.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize); } } } } } -// TODO: instead use some sort of replace function for getting optimal size? -[numthreads(512,1,1)] +[numthreads(nbl::hlsl::property_pools::OptimalDispatchSize,1,1)] void main(uint32_t3 dispatchId : SV_DispatchThreadID) { - nbl::hlsl::property_pools::main(dispatchId); + nbl::hlsl::property_pools::main(dispatchId, nbl::hlsl::property_pools::OptimalDispatchSize); } diff --git a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl index 9805d2e4d4..d83a3453c7 100644 --- a/include/nbl/builtin/hlsl/property_pool/transfer.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/transfer.hlsl @@ -34,9 +34,12 @@ struct TransferRequest uint32_t fill: 1; uint32_t srcIndexSizeLog2 : 2; uint32_t dstIndexSizeLog2 : 2; + + // Reads a TransferRequest from a BDA + static TransferRequest newFromAddress(const uint64_t address); }; -struct GlobalPushContants +struct TransferDispatchInfo { // BDA address (GPU pointer) into the transfer commands buffer uint64_t transferCommandsAddress; @@ -49,6 +52,9 @@ struct GlobalPushContants NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128; +// TODO: instead use some sort of replace function for getting optimal size? +NBL_CONSTEXPR uint32_t OptimalDispatchSize = 256; + } } } diff --git a/include/nbl/video/utilities/CPropertyPoolHandler.h b/include/nbl/video/utilities/CPropertyPoolHandler.h index 2c6f9eeac5..f1a6e6da2e 100644 --- a/include/nbl/video/utilities/CPropertyPoolHandler.h +++ b/include/nbl/video/utilities/CPropertyPoolHandler.h @@ -13,8 +13,7 @@ #include "nbl/video/utilities/IPropertyPool.h" #include "glm/glm/glm.hpp" -#include -#include +#include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/property_pool/transfer.hlsl" namespace nbl::video diff --git a/include/nbl/video/utilities/IPropertyPool.h b/include/nbl/video/utilities/IPropertyPool.h index 3c23ddce24..2e98af158b 100644 --- a/include/nbl/video/utilities/IPropertyPool.h +++ b/include/nbl/video/utilities/IPropertyPool.h @@ -12,8 +12,7 @@ #include "nbl/video/IGPUDescriptorSetLayout.h" #include "glm/glm/glm.hpp" -#include -#include +#include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/property_pool/transfer.hlsl" namespace nbl::video diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index 065f1ed4da..72bab24089 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -24,8 +24,8 @@ CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptrcreatePipelineLayout({ &baseDWORD,1u }); + const asset::SPushConstantRange transferInfoPushConstants = { asset::IShader::ESS_COMPUTE,0u,sizeof(nbl::hlsl::property_pools::TransferDispatchInfo) }; + auto layout = m_device->createPipelineLayout({ &transferInfoPushConstants,1u }); { video::IGPUComputePipeline::SCreationParams params = {}; @@ -93,7 +93,7 @@ bool CPropertyPoolHandler::transferProperties( IGPUCommandBuffer* const cmdbuf, //IGPUFence* const fence, const asset::SBufferBinding& scratch, const asset::SBufferBinding& addresses, const TransferRequest* const requestsBegin, const TransferRequest* const requestsEnd, - system::logger_opt_ptr logger, const uint32_t baseDWORD, const uint32_t endDWORD + system::logger_opt_ptr logger, const uint32_t baseOffsetBytes, const uint32_t endOffsetBytes ) { if (requestsBegin==requestsEnd) @@ -158,10 +158,11 @@ bool CPropertyPoolHandler::transferProperties( cmdbuf->bindComputePipeline(m_pipeline.get()); - nbl::hlsl::property_pools::GlobalPushContants pushConstants; + nbl::hlsl::property_pools::TransferDispatchInfo pushConstants; { - pushConstants.beginOffset = baseDWORD; - pushConstants.endOffset = endDWORD; + // TODO: Should the offset bytes be handled elsewhere? + pushConstants.beginOffset = baseOffsetBytes; + pushConstants.endOffset = endOffsetBytes; pushConstants.transferCommandsAddress = scratchBufferDeviceAddr; } assert(getAlignment(scratchBufferDeviceAddr) == 0); @@ -172,7 +173,7 @@ bool CPropertyPoolHandler::transferProperties( { const auto& limits = m_device->getPhysicalDevice()->getLimits(); const auto invocationCoarseness = limits.maxOptimallyResidentWorkgroupInvocations * requestsThisPass; - cmdbuf->dispatch(limits.computeOptimalPersistentWorkgroupDispatchSize(maxElements,invocationCoarseness), requestsThisPass, 1u); + cmdbuf->dispatch((maxElements - 1) / nbl::hlsl::property_pools::OptimalDispatchSize + 1, requestsThisPass, 1u); } // TODO: pipeline barrier } From 61604ee31acf96c9cbdee7e914e43d4358413366 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Thu, 15 Feb 2024 16:59:15 -0300 Subject: [PATCH 17/18] More PR reviws --- include/nbl/video/utilities/IPropertyPool.h | 7 ++--- .../video/utilities/CPropertyPoolHandler.cpp | 26 ++++++++++++++++--- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/include/nbl/video/utilities/IPropertyPool.h b/include/nbl/video/utilities/IPropertyPool.h index 2e98af158b..86c4d02f47 100644 --- a/include/nbl/video/utilities/IPropertyPool.h +++ b/include/nbl/video/utilities/IPropertyPool.h @@ -26,6 +26,7 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted using PropertyAddressAllocator = core::PoolAddressAllocatorST; static inline constexpr uint64_t invalid = 0; + using value_type = PropertyAddressAllocator::size_type; // virtual const asset::SBufferRange& getPropertyMemoryBlock(uint32_t ix) const =0; @@ -37,19 +38,19 @@ class NBL_API2 IPropertyPool : public core::IReferenceCounted inline bool isContiguous() const {return m_indexToAddr;} // - inline uint64_t getAllocated() const + inline value_type getAllocated() const { return indexAllocator.get_allocated_size(); } // - inline uint64_t getFree() const + inline value_type getFree() const { return indexAllocator.get_free_size(); } // - inline uint64_t getCapacity() const + inline value_type getCapacity() const { // special case allows us to use `get_total_size`, because the pool allocator has no added offsets return indexAllocator.get_total_size(); diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index 72bab24089..5d53ffdd71 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -138,8 +138,28 @@ bool CPropertyPoolHandler::transferProperties( transferRequest.fill = 0; // TODO transferRequest.srcIndexSizeLog2 = 1u; // TODO transferRequest.dstIndexSizeLog2 = 1u; // TODO - assert(getAlignment(transferRequest.srcAddr) == 0); - assert(getAlignment(transferRequest.dstAddr) == 0); + if (getAlignment(transferRequest.srcAddr) != 0) + { + logger.log("CPropertyPoolHandler: memblock.buffer BDA address %I64i is not aligned to 8 byte (64 bit)",system::ILogger::ELL_ERROR, transferRequest.srcAddr); + } + if (getAlignment(transferRequest.dstAddr) != 0) + { + logger.log("CPropertyPoolHandler: buffer.buffer BDA address %I64i is not aligned to 8 byte (64 bit)",system::ILogger::ELL_ERROR, transferRequest.dstAddr); + } + if (getAlignment(transferRequest.propertySize) != 0) + { + logger.log("CPropertyPoolHandler: propertySize %i is not aligned to 8 byte (64 bit)",system::ILogger::ELL_ERROR, srcRequest->elementSize); + } + if (transferRequest.srcIndexSizeLog2 < 1 || transferRequest.srcIndexSizeLog2 > 3) + { + auto srcIndexSizeLog2 = transferRequest.srcIndexSizeLog2; + logger.log("CPropertyPoolHandler: srcIndexSizeLog2 %i (%i bit values) are unsupported",system::ILogger::ELL_ERROR, srcIndexSizeLog2, (1 << transferRequest.srcIndexSizeLog2) * sizeof(uint8_t)); + } + if (transferRequest.dstIndexSizeLog2 < 1 || transferRequest.dstIndexSizeLog2 > 3) + { + auto dstIndexSizeLog2 = transferRequest.dstIndexSizeLog2; + logger.log("CPropertyPoolHandler: dstIndexSizeLog2 %i (%i bit values) are unsupported",system::ILogger::ELL_ERROR, dstIndexSizeLog2, (1 << transferRequest.srcIndexSizeLog2) * sizeof(uint8_t)); + } maxElements = core::max(maxElements, srcRequest->elementCount); } @@ -163,7 +183,7 @@ bool CPropertyPoolHandler::transferProperties( // TODO: Should the offset bytes be handled elsewhere? pushConstants.beginOffset = baseOffsetBytes; pushConstants.endOffset = endOffsetBytes; - pushConstants.transferCommandsAddress = scratchBufferDeviceAddr; + pushConstants.transferCommandsAddress = scratchBufferDeviceAddr + transferPassRequestsIndex * sizeof(TransferRequest); } assert(getAlignment(scratchBufferDeviceAddr) == 0); assert(getAlignment(sizeof(TransferRequest)) == 0); From 7ac728b8bdf406a6e30f467b13dd60b674f684e4 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Fri, 16 Feb 2024 15:39:07 -0300 Subject: [PATCH 18/18] PR reviews --- .../builtin/hlsl/property_pool/copy.comp.hlsl | 29 ++++++++++++++----- .../video/utilities/CPropertyPoolHandler.cpp | 3 +- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl index f9fe2ecac3..5d9bce06da 100644 --- a/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl +++ b/include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl @@ -25,18 +25,31 @@ struct TransferLoop // IOTA: Use the index as the fetching offset // Non IOTA: Read the address buffer ("index buffer") to select fetching offset - const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcOffset : vk::RawBufferLoad(transferRequest.srcIndexAddr + srcOffset * sizeof(uint32_t)); - const uint64_t dstAddressBufferOffset = DstIndexIota ? dstOffset : vk::RawBufferLoad(transferRequest.dstIndexAddr + dstOffset * sizeof(uint32_t)); + uint64_t srcAddressBufferOffset; + uint64_t dstAddressBufferOffset; + + if (SrcIndexIota) srcAddressBufferOffset = srcOffset; + else + { + if (SrcIndexSizeLog2 == 0) {} // we can't read individual byte + else if (SrcIndexSizeLog2 == 1) srcAddressBufferOffset = vk::RawBufferLoad(transferRequest.srcIndexAddr + srcOffset * sizeof(uint16_t)); + else if (SrcIndexSizeLog2 == 2) srcAddressBufferOffset = vk::RawBufferLoad(transferRequest.srcIndexAddr + srcOffset * sizeof(uint32_t)); + else if (SrcIndexSizeLog2 == 3) srcAddressBufferOffset = vk::RawBufferLoad(transferRequest.srcIndexAddr + srcOffset * sizeof(uint64_t)); + } + + if (DstIndexIota) dstAddressBufferOffset = dstOffset; + else + { + if (DstIndexSizeLog2 == 0) {} // we can't read individual byte + else if (DstIndexSizeLog2 == 1) dstAddressBufferOffset = vk::RawBufferLoad(transferRequest.dstIndexAddr + dstOffset * sizeof(uint16_t)); + else if (DstIndexSizeLog2 == 2) dstAddressBufferOffset = vk::RawBufferLoad(transferRequest.dstIndexAddr + dstOffset * sizeof(uint32_t)); + else if (DstIndexSizeLog2 == 3) dstAddressBufferOffset = vk::RawBufferLoad(transferRequest.dstIndexAddr + dstOffset * sizeof(uint64_t)); + } const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize; const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize; - //vk::RawBufferStore(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2, srcAddressMapped,8); - //vk::RawBufferStore(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2 + sizeof(uint64_t), dstAddressMapped,8); - if (SrcIndexSizeLog2 == 0) {} // we can't write individual bytes - else if (SrcIndexSizeLog2 == 1) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); - else if (SrcIndexSizeLog2 == 2) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); - else if (SrcIndexSizeLog2 == 3) vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); + vk::RawBufferStore(dstAddressMapped, vk::RawBufferLoad(srcAddressMapped)); } void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) diff --git a/src/nbl/video/utilities/CPropertyPoolHandler.cpp b/src/nbl/video/utilities/CPropertyPoolHandler.cpp index 5d53ffdd71..1dcc82540e 100644 --- a/src/nbl/video/utilities/CPropertyPoolHandler.cpp +++ b/src/nbl/video/utilities/CPropertyPoolHandler.cpp @@ -193,7 +193,8 @@ bool CPropertyPoolHandler::transferProperties( { const auto& limits = m_device->getPhysicalDevice()->getLimits(); const auto invocationCoarseness = limits.maxOptimallyResidentWorkgroupInvocations * requestsThisPass; - cmdbuf->dispatch((maxElements - 1) / nbl::hlsl::property_pools::OptimalDispatchSize + 1, requestsThisPass, 1u); + const auto dispatchElements = (maxElements - 1) / requestsThisPass + 1; + cmdbuf->dispatch(limits.computeOptimalPersistentWorkgroupDispatchSize(dispatchElements,invocationCoarseness), requestsThisPass, 1u); } // TODO: pipeline barrier }