From dddf5755d6a7bef8d78aba37949b8badd85594fa Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Tue, 16 Jan 2024 21:34:53 -0300 Subject: [PATCH 01/13] Work on property pools example --- 66_PropertyPools/CMakeLists.txt | 24 ++ 66_PropertyPools/app_resources/common.hlsl | 22 ++ .../app_resources/shader.comp.hlsl | 33 ++ 66_PropertyPools/config.json.template | 28 ++ 66_PropertyPools/main.cpp | 292 ++++++++++++++++++ 66_PropertyPools/pipeline.groovy | 50 +++ CMakeLists.txt | 1 + 7 files changed, 450 insertions(+) create mode 100644 66_PropertyPools/CMakeLists.txt create mode 100644 66_PropertyPools/app_resources/common.hlsl create mode 100644 66_PropertyPools/app_resources/shader.comp.hlsl create mode 100644 66_PropertyPools/config.json.template create mode 100644 66_PropertyPools/main.cpp create mode 100644 66_PropertyPools/pipeline.groovy diff --git a/66_PropertyPools/CMakeLists.txt b/66_PropertyPools/CMakeLists.txt new file mode 100644 index 000000000..bc1624875 --- /dev/null +++ b/66_PropertyPools/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/66_PropertyPools/app_resources/common.hlsl b/66_PropertyPools/app_resources/common.hlsl new file mode 100644 index 000000000..6f339aa13 --- /dev/null +++ b/66_PropertyPools/app_resources/common.hlsl @@ -0,0 +1,22 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +// Unfortunately not every piece of C++14 metaprogramming syntax is available in HLSL 202x +// https://github.com/microsoft/DirectXShaderCompiler/issues/5751#issuecomment-1800847954 +typedef nbl::hlsl::float32_t3 input_t; +typedef nbl::hlsl::float32_t output_t; + +NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxPossibleElementCount = 1 << 20; + +struct PushConstantData +{ + uint64_t inputAddress; + uint64_t outputAddress; + uint32_t dataElementCount; +}; + +NBL_CONSTEXPR uint32_t WorkgroupSize = 256; + +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" + +// Yes we do have our own re-creation of C++'s STL in HLSL2021 ! 
+#include "nbl/builtin/hlsl/limits.hlsl" \ No newline at end of file diff --git a/66_PropertyPools/app_resources/shader.comp.hlsl b/66_PropertyPools/app_resources/shader.comp.hlsl new file mode 100644 index 000000000..4aeef0e0f --- /dev/null +++ b/66_PropertyPools/app_resources/shader.comp.hlsl @@ -0,0 +1,33 @@ +#include "common.hlsl" + +// just a small test +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +[[vk::push_constant]] PushConstantData pushConstants; + +// does absolutely nothing, a later example will show how it gets used +template +void dummyTraitTest() {} + +[numthreads(WorkgroupSize,1,1)] +void main(uint32_t3 ID : SV_DispatchThreadID) +{ + dummyTraitTest(); + if (ID.x>=pushConstants.dataElementCount) + return; + + const input_t self = vk::RawBufferLoad(pushConstants.inputAddress+sizeof(input_t)*ID.x); + + nbl::hlsl::Xoroshiro64StarStar rng = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(pushConstants.dataElementCount,ID.x)^0xdeadbeefu); + + float32_t acc = nbl::hlsl::numeric_limits::max; + const static uint32_t OthersToTest = 15; + [[unroll(OthersToTest)]] + for (uint32_t i=0; i(pushConstants.inputAddress+sizeof(input_t)*offset); + acc = min(length(other-self),acc); + } + vk::RawBufferStore(pushConstants.outputAddress+sizeof(float32_t)*ID.x,acc); +} \ No newline at end of file diff --git a/66_PropertyPools/config.json.template b/66_PropertyPools/config.json.template new file mode 100644 index 000000000..717d05d53 --- /dev/null +++ b/66_PropertyPools/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp new file mode 100644 index 000000000..155ece55b --- /dev/null +++ b/66_PropertyPools/main.cpp @@ -0,0 +1,292 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + + +// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. +#include "../common/MonoDeviceApplication.hpp" +#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +using namespace nbl; +using namespace core; +using namespace system; +using namespace asset; +using namespace video; + + +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" + + +// In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants +class PropertyPoolsApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::MonoDeviceApplication; + using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; + + // This is the first example that submits multiple workloads in-flight. + // What the shader does is it computes the minimum distance of each point against K other random input points. 
+		// Having the GPU randomly access parts of the buffer requires it to be DEVICE_LOCAL for performance.
+		// Then the CPU downloads the results and finds the median minimum distance via quick-select.
+		// This bizarre synthetic workload was specifically chosen for its unfriendliness towards simple buffer usage.
+		// The fact that we have variable sized workloads and run them in a loop means we either have to dynamically
+		// suballocate from a single buffer or have K worst-case sized buffers we round robin for K workloads in flight.
+		// Creating and destroying buffers at runtime is not an option as those are very expensive operations.
+		// Also since the CPU needs to heapify the outputs, we need to have the GPU write them into RAM not VRAM.
+		smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
+
+		// The Utility class has lots of methods to handle staging without relying on ReBAR or EXT_host_image_copy as well as more complex methods we'll cover later.
+		// Until EXT_host_image_copy becomes ubiquitous across all Nabla Core Profile devices, you need to stage image copies from an IGPUBuffer to an IGPUImage.
+		// Why use Staging for buffers in the age of ReBAR? While GPU workloads overlap the CPU, individual GPU workloads' execution might not overlap each other
+		// but their data might. In this case you want to "precisely" time the data update on the GPU timeline between the end and start of a workload.
+		// For very small updates you could use the commandbuffer updateBuffer method, but it has a size limit and the data enqueued takes up space in the commandpool.
+		// Sometimes it might be unfeasible to either have multiple copies or update references to those copies without a cascade update.
+		// One example is the transformation graph of nodes in a scene, where a copy-on-write of a node would require updating the offset/pointer held by
+		// any other node that refers to it. This quickly turns into a cascade that would force you to basically create a full copy of the entire data structure
+		// after most updates. Whereas with staging you'd "queue up" the much smaller set of updates to apply between each computation step which uses the graph.
+		// Another example is UBO and SSBO bindings, where once you run out of dynamic bindings, you can no longer easily change offsets without introducing extra indirection in shaders.
+		// Actually staging can help you re-use a commandbuffer because you don't need to re-record it if you don't need to change the offsets at which you bind!
+		// Finally, ReBAR is a precious resource: my 8GB RTX 3070 only reports a 214MB Heap backing the HOST_VISIBLE and DEVICE_LOCAL memory type.
+		smart_refctd_ptr<IUtilities> m_utils;
+
+		// We call them downstreaming and upstreaming, simply by how we used them so far.
+		// Meaning that upstreaming is uncached and usually ReBAR (DEVICE_LOCAL), for simple memcpy-like sequential writes.
+		// While the downstreaming is CACHED and not DEVICE_LOCAL for fast random access by the CPU.
+		// However there are cases when you'd want to use a buffer with flags identical to the default downstreaming buffer for uploads,
+		// one such case is when the CPU needs to build a data-structure in-place (due to memory constraints) before the GPU accesses it,
+		// one example being Host Acceleration Structure builds (BVH building requires lots of repeated memory accesses).
+		// When choosing the memory properties of a mapped buffer consider which processor (CPU or GPU) needs faster access in the event of a cache-miss.
+		nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer;
+		StreamingTransientDataBufferMT<>* m_downStreamingBuffer;
+		// These are Buffer Device Addresses
+		uint64_t m_upStreamingBufferAddress;
+		uint64_t m_downStreamingBufferAddress;
+
+		// You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!)
+		uint32_t m_alignment;
+
+		// The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished.
+		// Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools.
+		smart_refctd_ptr m_poolCache;
+
+		// We'll run the iterations in reverse, easier to write "keep running"
+		uint32_t m_iteration = 200;
+
+	public:
+		// Yay thanks to multiple inheritance we cannot forward ctors anymore
+		PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+			system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {}
+
+		// we stuff all our work here because it's a "single shot" app
+		bool onAppInitialized(smart_refctd_ptr&& system) override
+		{
+			// Remember to call the base class initialization!
+			if (!device_base_t::onAppInitialized(std::move(system)))
+				return false;
+			if (!asset_base_t::onAppInitialized(std::move(system)))
+				return false;
+
+			// this time we load a shader directly from a file
+			smart_refctd_ptr shader;
+			{
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = ""; // virtual root
+				auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+					return logFail("Could not load shader!");
+
+				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+				auto source = IAsset::castDown(assets[0]);
+				// The down-cast should not fail!
+				assert(source);
+
+				IGPUObjectFromAssetConverter::SParams conversionParams = {};
+				conversionParams.device = m_device.get();
+				conversionParams.assetManager = m_assetMgr.get();
+				created_gpu_object_array convertedGPUObjects = std::make_unique()->getGPUObjectsFromAssets(&source,&source+1,conversionParams);
+				if (convertedGPUObjects->empty() || !convertedGPUObjects->front())
+					return logFail("Conversion of a CPU Specialized Shader to GPU failed!");
+
+				shader = convertedGPUObjects->front();
+			}
+
+			// The StreamingTransientDataBuffers are actually composed on top of another useful utility called `CAsyncSingleBufferSubAllocator`
+			// The difference is that the streaming ones are made on top of ranges of `IGPUBuffer`s backed by mappable memory, whereas the
+			// `CAsyncSingleBufferSubAllocator` just allows you to suballocate subranges of any `IGPUBuffer` range with deferred/latched frees.
+			constexpr uint32_t DownstreamBufferSize = sizeof(output_t)<<24;
+			constexpr uint32_t UpstreamBufferSize = sizeof(input_t)<<24;
+			m_utils = make_smart_refctd_ptr<IUtilities>(smart_refctd_ptr(m_device),smart_refctd_ptr(m_logger),DownstreamBufferSize,UpstreamBufferSize);
+			if (!m_utils)
+				return logFail("Failed to create Utilities!");
+			m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
+			m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
+			m_upStreamingBufferAddress = m_device->getBufferDeviceAddress(m_upStreamingBuffer->getBuffer());
+			m_downStreamingBufferAddress = m_device->getBufferDeviceAddress(m_downStreamingBuffer->getBuffer());
+
+			// People love Reflection but I prefer Shader Sources instead!
+			const nbl::asset::SPushConstantRange pcRange = {.stageFlags=IShader::ESS_COMPUTE,.offset=0,.size=sizeof(PushConstantData)};
+
+			// This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size
+			// and using traditional SSBO bindings would force us to update the Descriptor Set every frame.
+			// I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic
+			// only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding.
+			// Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size.
+			m_pipeline = m_device->createComputePipeline(nullptr,m_device->createPipelineLayout(&pcRange,&pcRange+1),std::move(shader));
+
+			const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
+			// The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices
+			// which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets.
+			// Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc.
+			// there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those.
+			// We'll align to max of coherent atom size even if the memory is coherent,
+			// and we also need to take into account BDA shader loads need to be aligned to the type being loaded.
+			m_alignment = core::max(deviceLimits.nonCoherentAtomSize,alignof(float));
+
+			// We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are
+			// the amount of memory in the streaming buffers and the number of commandpools we can use simultaenously.
+ constexpr auto MaxConcurrency = 64; + // Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag + m_poolCache = make_smart_refctd_ptr(m_device.get(),getComputeQueue()->getFamilyIndex(), IGPUCommandPool::ECF_NONE, MaxConcurrency); + + return true; + } + + // Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script) + bool keepRunning() override { return m_iteration; } + + // Finally the first actual work-loop + void workLoopBody() override + { + m_iteration--; + IGPUQueue* const queue = getComputeQueue(); + + // Note that I'm using the sample struct with methods that have identical code which compiles as both C++ and HLSL + auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({m_iteration^0xdeadbeefu,std::hash()(_NBL_APP_NAME_)}); + + // we dynamically choose the number of elements for each iteration + const auto elementCount = rng()%MaxPossibleElementCount; + const uint32_t inputSize = sizeof(input_t)*elementCount; + + // The allocators can do multiple allocations at once for efficiency + const uint32_t AllocationCount = 1; + // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value + // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. + auto inputOffset = m_upStreamingBuffer->invalid_value; + + // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) + // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). + std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); + // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly + m_upStreamingBuffer->multi_allocate(waitTill,AllocationCount,&inputOffset,&inputSize,&m_alignment); + + // Generate our data in-place on the allocated staging buffer + { + auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer())+inputOffset); + for (auto j=0; j::max); + } + // Always remember to flush! 
+ if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) + { + const IDeviceMemoryAllocation::MappedMemoryRange range(m_upStreamingBuffer->getBuffer()->getBoundMemory(),inputOffset,inputSize); + m_device->flushMappedMemoryRanges(1,&range); + } + } + + // Obtain our command pool once one gets recycled + uint32_t poolIx; + do + { + poolIx = m_poolCache->acquirePool(); + } while (poolIx==ICommandPoolCache::invalid_index); + + // finally allocate our output range + const uint32_t outputSize = sizeof(output_t)*elementCount; + auto outputOffset = m_downStreamingBuffer->invalid_value; + m_downStreamingBuffer->multi_allocate(waitTill,AllocationCount,&outputOffset,&outputSize,&m_alignment); + + smart_refctd_ptr cmdbuf; + { + m_device->createCommandBuffers(m_poolCache->getPool(poolIx),IGPUCommandBuffer::EL_PRIMARY,1,&cmdbuf); + // lets record, its still a one time submit because we have to re-record with different push constants each time + cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); + cmdbuf->bindComputePipeline(m_pipeline.get()); + // This is the new fun part, pushing constants + const PushConstantData pc = { + .inputAddress=m_upStreamingBufferAddress+inputOffset, + .outputAddress=m_downStreamingBufferAddress+outputOffset, + .dataElementCount=elementCount + }; + cmdbuf->pushConstants(m_pipeline->getLayout(),IShader::ESS_COMPUTE,0u,sizeof(pc),&pc); + // Good old trick to get rounded up divisions, in case you're not familiar + cmdbuf->dispatch((elementCount-1)/WorkgroupSize+1,1,1); + cmdbuf->end(); + } + + // TODO: redo with a single timeline semaphore + auto fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); + { + IGPUQueue::SSubmitInfo submitInfo = {}; + submitInfo.commandBufferCount = 1; + submitInfo.commandBuffers = &cmdbuf.get(); + + queue->startCapture(); + queue->submit(1u,&submitInfo,fence.get()); + queue->endCapture(); + } + + // We can also actually latch our Command Pool reset and its return to the pool of free pools! + m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx); + + // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled + // You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation. + m_upStreamingBuffer->multi_deallocate(AllocationCount,&inputOffset,&inputSize,smart_refctd_ptr(fence)); + + // Because C++17 and C++20 can't make their mind up about what to do with `this` in event of a [=] capture, lets triple ensure the m_iteration is captured by value. + const auto savedIterNum = m_iteration; + + // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. + // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. + // Its nice because it will also remember to invalidate our memory mapping if its not coherent. + auto latchedConsumer = make_smart_refctd_ptr( + IDeviceMemoryAllocation::MemoryRange(outputOffset,outputSize), + // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals + [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void + { + // The unused variable is used for letting the consumer know the subsection of the output we've managed to download + // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. 
+ assert(dstOffset==0 && size==outputSize); + + // I can const cast, we know the mapping is just a pointer + output_t* const data = reinterpret_cast(const_cast(bufSrc)); + auto median = data+elementCount/2; + std::nth_element(data,median,data+elementCount); + + m_logger->log("Iteration %d Median of Minimum Distances is %f",ILogger::ELL_PERFORMANCE,savedIterNum,*median); + }, + // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it + // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. + // It could also be latched in the upstreaming deallocate, because its the same fence. + std::move(cmdbuf),m_downStreamingBuffer + ); + // We put a function we want to execute + m_downStreamingBuffer->multi_deallocate(AllocationCount,&outputOffset,&outputSize,std::move(fence),&latchedConsumer.get()); + } + + bool onAppTerminated() override + { + // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` + // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) + while (m_downStreamingBuffer->cull_frees()) {} + + return device_base_t::onAppTerminated(); + } +}; + + +NBL_MAIN_FUNC(PropertyPoolsApp) \ No newline at end of file diff --git a/66_PropertyPools/pipeline.groovy b/66_PropertyPools/pipeline.groovy new file mode 100644 index 000000000..1a7b043a4 --- /dev/null +++ b/66_PropertyPools/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CStreamingAndBufferDeviceAddressBuilder extends IBuilder +{ + public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a20a33a9..09a73bfe0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,5 +65,6 @@ if(NBL_BUILD_EXAMPLES) #add_subdirectory(61_UI EXCLUDE_FROM_ALL) add_subdirectory(62_CAD EXCLUDE_FROM_ALL) add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL) + add_subdirectory(66_PropertyPools EXCLUDE_FROM_ALL) add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42 endif() \ No newline at end of file From 43d95c8cca36441dfdd754ba66f24b88ae18426b Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sun, 21 Jan 2024 10:58:54 -0300 Subject: [PATCH 02/13] Add creation of property pool handler to example --- 66_PropertyPools/main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff 
--git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index 155ece55b..941536751 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -62,6 +62,8 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex uint64_t m_upStreamingBufferAddress; uint64_t m_downStreamingBufferAddress; + smart_refctd_ptr m_propertyPoolHandler; + // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) uint32_t m_alignment; @@ -86,12 +88,15 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex if (!asset_base_t::onAppInitialized(std::move(system))) return false; + m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); + // this time we load a shader directly from a file smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); lp.workingDirectory = ""; // virtual root + auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp); const auto assets = assetBundle.getContents(); if (assets.empty()) From 66e93fbb23c374e445ab3af66848a836b34052c1 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 24 Jan 2024 21:43:22 -0300 Subject: [PATCH 03/13] Work on doing transferProperties on example --- 66_PropertyPools/main.cpp | 129 +++++++++++++------------------------- 1 file changed, 45 insertions(+), 84 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index 941536751..e59f6385a 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -63,6 +63,11 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex uint64_t m_downStreamingBufferAddress; smart_refctd_ptr m_propertyPoolHandler; + smart_refctd_ptr m_scratchBuffer; + smart_refctd_ptr m_addressBuffer; + smart_refctd_ptr m_transferSrcBuffer; + smart_refctd_ptr m_transferDstBuffer; + std::vector m_data; // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) 
uint32_t m_alignment; @@ -74,6 +79,9 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex // We'll run the iterations in reverse, easier to write "keep running" uint32_t m_iteration = 200; + static constexpr uint64_t TransfersAmount = 1024; + static constexpr uint64_t MaxValuesPerTransfer = 512; + public: // Yay thanks to multiple inheritance we cannot forward ctors anymore PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : @@ -90,6 +98,27 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); + auto createBuffer = [&](uint64_t size) + { + video::IGPUBuffer::SCreationParams creationParams; + creationParams.size = size; + creationParams.usage = core::bitflag(asset::IBuffer::EUF_STORAGE_BUFFER_BIT) | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; + + auto buffer = m_device->createBuffer(std::move(creationParams)); + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); + m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + + return buffer; + }; + + m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount); + m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer); + m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); + m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); + + for (uint16_t i = 0; i < uint16_t((uint32_t(1) << 16) - 1); i++) + m_data.push_back(i); + // this time we load a shader directly from a file smart_refctd_ptr shader; { @@ -167,42 +196,6 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex m_iteration--; IGPUQueue* const queue = getComputeQueue(); - // Note that I'm using the sample struct with methods that have identical code which compiles as both C++ and HLSL - auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({m_iteration^0xdeadbeefu,std::hash()(_NBL_APP_NAME_)}); - - // we dynamically choose the number of elements for each iteration - const auto elementCount = rng()%MaxPossibleElementCount; - const uint32_t inputSize = sizeof(input_t)*elementCount; - - // The allocators can do multiple allocations at once for efficiency - const uint32_t AllocationCount = 1; - // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value - // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. - auto inputOffset = m_upStreamingBuffer->invalid_value; - - // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) - // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). 
- std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); - // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly - m_upStreamingBuffer->multi_allocate(waitTill,AllocationCount,&inputOffset,&inputSize,&m_alignment); - - // Generate our data in-place on the allocated staging buffer - { - auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer())+inputOffset); - for (auto j=0; j::max); - } - // Always remember to flush! - if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) - { - const IDeviceMemoryAllocation::MappedMemoryRange range(m_upStreamingBuffer->getBuffer()->getBoundMemory(),inputOffset,inputSize); - m_device->flushMappedMemoryRanges(1,&range); - } - } - // Obtain our command pool once one gets recycled uint32_t poolIx; do @@ -210,26 +203,28 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex poolIx = m_poolCache->acquirePool(); } while (poolIx==ICommandPoolCache::invalid_index); - // finally allocate our output range - const uint32_t outputSize = sizeof(output_t)*elementCount; - auto outputOffset = m_downStreamingBuffer->invalid_value; - m_downStreamingBuffer->multi_allocate(waitTill,AllocationCount,&outputOffset,&outputSize,&m_alignment); - smart_refctd_ptr cmdbuf; { m_device->createCommandBuffers(m_poolCache->getPool(poolIx),IGPUCommandBuffer::EL_PRIMARY,1,&cmdbuf); // lets record, its still a one time submit because we have to re-record with different push constants each time cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); cmdbuf->bindComputePipeline(m_pipeline.get()); - // This is the new fun part, pushing constants - const PushConstantData pc = { - .inputAddress=m_upStreamingBufferAddress+inputOffset, - .outputAddress=m_downStreamingBufferAddress+outputOffset, - .dataElementCount=elementCount - }; - cmdbuf->pushConstants(m_pipeline->getLayout(),IShader::ESS_COMPUTE,0u,sizeof(pc),&pc); - // Good old trick to get rounded up divisions, in case you're not familiar - cmdbuf->dispatch((elementCount-1)/WorkgroupSize+1,1,1); + + // COMMAND RECORDING + cmdbuf->updateBuffer(m_transferSrcBuffer.get(), 0, sizeof(uint16_t) * m_data.size(), &m_data[0]); + CPropertyPoolHandler::TransferRequest transferRequest; + transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; + transferRequest.elementSize = m_data.size(); + transferRequest.elementCount = 1; + transferRequest.buffer = asset::SBufferBinding { 0, core::smart_refctd_ptr(m_transferDstBuffer) }; + + m_propertyPoolHandler->transferProperties(cmdbuf.get(), nullptr, + asset::SBufferBinding{0, core::smart_refctd_ptr(m_scratchBuffer)}, + asset::SBufferBinding{0, core::smart_refctd_ptr(m_addressBuffer)}, + &transferRequest, &transferRequest + 1, + m_logger.get(), 0, MaxValuesPerTransfer + ); + cmdbuf->end(); } @@ -247,40 +242,6 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex // We can also actually latch our Command Pool reset and its return to the pool of free pools! m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx); - - // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled - // You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation. 
- m_upStreamingBuffer->multi_deallocate(AllocationCount,&inputOffset,&inputSize,smart_refctd_ptr(fence)); - - // Because C++17 and C++20 can't make their mind up about what to do with `this` in event of a [=] capture, lets triple ensure the m_iteration is captured by value. - const auto savedIterNum = m_iteration; - - // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. - // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. - // Its nice because it will also remember to invalidate our memory mapping if its not coherent. - auto latchedConsumer = make_smart_refctd_ptr( - IDeviceMemoryAllocation::MemoryRange(outputOffset,outputSize), - // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals - [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void - { - // The unused variable is used for letting the consumer know the subsection of the output we've managed to download - // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. - assert(dstOffset==0 && size==outputSize); - - // I can const cast, we know the mapping is just a pointer - output_t* const data = reinterpret_cast(const_cast(bufSrc)); - auto median = data+elementCount/2; - std::nth_element(data,median,data+elementCount); - - m_logger->log("Iteration %d Median of Minimum Distances is %f",ILogger::ELL_PERFORMANCE,savedIterNum,*median); - }, - // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it - // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. - // It could also be latched in the upstreaming deallocate, because its the same fence. - std::move(cmdbuf),m_downStreamingBuffer - ); - // We put a function we want to execute - m_downStreamingBuffer->multi_deallocate(AllocationCount,&outputOffset,&outputSize,std::move(fence),&latchedConsumer.get()); } bool onAppTerminated() override From 56f855debea003f6ef80a55bd2a8ec5b6975226e Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 24 Jan 2024 23:12:21 -0300 Subject: [PATCH 04/13] Work on property pool example --- 66_PropertyPools/main.cpp | 183 +++++++++++++++++++++++++++++++++++--- 1 file changed, 172 insertions(+), 11 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index e59f6385a..e1ab9d7b3 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -3,13 +3,147 @@ // For conditions of distribution and use, see copyright notice in nabla.h -// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. 
-#include "../common/MonoDeviceApplication.hpp" +#include "nbl/video/surface/CSurfaceVulkan.h" + +#include "../common/BasicMultiQueueApplication.hpp" #include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" +namespace nbl::examples +{ + +using namespace nbl; +using namespace core; +using namespace system; +using namespace ui; +using namespace asset; +using namespace video; + +// Virtual Inheritance because apps might end up doing diamond inheritance +class WindowedApplication : public virtual BasicMultiQueueApplication +{ + using base_t = BasicMultiQueueApplication; + + public: + using base_t::base_t; + + virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override + { + auto retval = base_t::getAPIFeaturesToEnable(); + // We only support one swapchain mode, surface, the other one is Display which we have not implemented yet. + retval.swapchainMode = video::E_SWAPCHAIN_MODE::ESM_SURFACE; + return retval; + } + + // New function, we neeed to know about surfaces to create ahead of time + virtual core::vector getSurfaces() const = 0; + + virtual core::set filterDevices(const core::SRange& physicalDevices) const + { + const auto firstFilter = base_t::filterDevices(physicalDevices); + + video::SPhysicalDeviceFilter deviceFilter = {}; + + const auto surfaces = getSurfaces(); + deviceFilter.requiredSurfaceCompatibilities = surfaces.data(); + deviceFilter.requiredSurfaceCompatibilitiesCount = surfaces.size(); + + return deviceFilter(physicalDevices); + } + + virtual bool onAppInitialized(smart_refctd_ptr&& system) + { + // Remember to call the base class initialization! + if (!base_t::onAppInitialized(std::move(system))) + return false; + + #ifdef _NBL_PLATFORM_WINDOWS_ + m_winMgr = nbl::ui::IWindowManagerWin32::create(); + #else + #error "Unimplemented!" + #endif + } + + core::smart_refctd_ptr m_winMgr; +}; + + +// Before we get onto creating a window, we need to discuss how Nabla handles input, clipboards and cursor control +class IWindowClosedCallback : public virtual nbl::ui::IWindow::IEventCallback +{ + public: + IWindowClosedCallback() : m_gotWindowClosedMsg(false) {} + + // unless you create a separate callback per window, both will "trip" this condition + bool windowGotClosed() const {return m_gotWindowClosedMsg;} + + private: + bool onWindowClosed_impl() override + { + m_gotWindowClosedMsg = true; + return true; + } + + bool m_gotWindowClosedMsg; +}; + +// We inherit from an application that tries to find Graphics and Compute queues +// because applications with presentable images often want to perform Graphics family operations +// Virtual Inheritance because apps might end up doing diamond inheritance +class SingleNonResizableWindowApplication : public virtual WindowedApplication +{ + using base_t = WindowedApplication; + + protected: + virtual IWindow::SCreationParams getWindowCreationParams() const + { + IWindow::SCreationParams params = {}; + params.callback = make_smart_refctd_ptr(); + params.width = 640; + params.height = 480; + params.x = 32; + params.y = 32; + params.flags = IWindow::ECF_NONE; + params.windowCaption = "SingleNonResizableWindowApplication"; + return params; + } + + core::smart_refctd_ptr m_window; + core::smart_refctd_ptr m_surface; + + public: + using base_t::base_t; + + virtual bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! 
+ if (!base_t::onAppInitialized(std::move(system))) + return false; + + m_window = m_winMgr->createWindow(getWindowCreationParams()); + m_surface = video::CSurfaceVulkanWin32::create(core::smart_refctd_ptr(m_api),core::smart_refctd_ptr_static_cast(m_window)); + return true; + } + + virtual core::vector getSurfaces() const + { + return {{m_surface.get()/*,EQF_NONE*/}}; + } + + virtual bool keepRunning() override + { + if (!m_window || reinterpret_cast(m_window->getEventCallback())->windowGotClosed()) + return false; + + return true; + } +}; +} + + using namespace nbl; using namespace core; using namespace system; +using namespace ui; using namespace asset; using namespace video; @@ -19,7 +153,7 @@ using namespace video; // In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants -class PropertyPoolsApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = examples::MonoDeviceApplication; using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; @@ -98,23 +232,29 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); - auto createBuffer = [&](uint64_t size) + auto createBuffer = [&](uint64_t size, core::bitflag flags, const char* name, bool hostVisible) { video::IGPUBuffer::SCreationParams creationParams; - creationParams.size = size; - creationParams.usage = core::bitflag(asset::IBuffer::EUF_STORAGE_BUFFER_BIT) | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; + creationParams.size = ((size + 3) / 4) * 4; // Align + creationParams.usage = flags + | asset::IBuffer::EUF_STORAGE_BUFFER_BIT + | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT + | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; auto buffer = m_device->createBuffer(std::move(creationParams)); nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); + if (hostVisible) + reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + buffer->setObjectDebugName(name); return buffer; }; - m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount); - m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer); - m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); - m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); + m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", false); + m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_addressBuffer", false); + m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_transferSrcBuffer", false); + m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, 
core::bitflag(asset::IBuffer::EUF_NONE), "m_transferDstBuffer", true); for (uint16_t i = 0; i < uint16_t((uint32_t(1) << 16) - 1); i++) m_data.push_back(i); @@ -211,7 +351,12 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex cmdbuf->bindComputePipeline(m_pipeline.get()); // COMMAND RECORDING - cmdbuf->updateBuffer(m_transferSrcBuffer.get(), 0, sizeof(uint16_t) * m_data.size(), &m_data[0]); + uint32_t dataSize = (((sizeof(uint16_t) * m_data.size()) + 3) / 4) * 4; + uint32_t maxUpload = 65536; + for (uint32_t offset = 0; offset < dataSize; offset += maxUpload) + { + cmdbuf->updateBuffer(m_transferSrcBuffer.get(), offset, maxUpload, &m_data[offset / sizeof(uint16_t)]); + } CPropertyPoolHandler::TransferRequest transferRequest; transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; transferRequest.elementSize = m_data.size(); @@ -239,6 +384,22 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex queue->submit(1u,&submitInfo,fence.get()); queue->endCapture(); } + + { + // Readback ds + auto mem = m_transferDstBuffer->getBoundMemory(); + assert(mem->isMappable()); + auto ptr = m_device->mapMemory(nbl::video::IDeviceMemoryAllocation::MappedMemoryRange(mem, 0, mem->getAllocationSize()), video::IDeviceMemoryAllocation::EMCAF_READ); + auto uint16_t_ptr = static_cast(ptr); + + for (uint32_t i = 0; i < 128; i++) + { + uint16_t value = uint16_t_ptr[i]; + std::printf("%i, ", value); + } + std::printf("\n"); + m_device->unmapMemory(mem); + } // We can also actually latch our Command Pool reset and its return to the pool of free pools! m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx); From 3adca44133c70815bea718b3c925197c7ff52f63 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sat, 27 Jan 2024 18:38:46 -0300 Subject: [PATCH 05/13] Fix vulkan_1_3 incompatibilities --- 66_PropertyPools/main.cpp | 141 +++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 61 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index e1ab9d7b3..d3d9822cd 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -44,8 +44,7 @@ class WindowedApplication : public virtual BasicMultiQueueApplication video::SPhysicalDeviceFilter deviceFilter = {}; const auto surfaces = getSurfaces(); - deviceFilter.requiredSurfaceCompatibilities = surfaces.data(); - deviceFilter.requiredSurfaceCompatibilitiesCount = surfaces.size(); + deviceFilter.requiredSurfaceCompatibilities = { surfaces.data(), surfaces.size() }; return deviceFilter(physicalDevices); } @@ -210,8 +209,10 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools. smart_refctd_ptr m_poolCache; - // We'll run the iterations in reverse, easier to write "keep running" - uint32_t m_iteration = 200; + // This example really lets the advantages of a timeline semaphore shine through! 
+ smart_refctd_ptr m_timeline; + uint64_t m_iteration = 0; + constexpr static inline uint64_t MaxIterations = 200; static constexpr uint64_t TransfersAmount = 1024; static constexpr uint64_t MaxValuesPerTransfer = 512; @@ -234,21 +235,21 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat auto createBuffer = [&](uint64_t size, core::bitflag flags, const char* name, bool hostVisible) { - video::IGPUBuffer::SCreationParams creationParams; - creationParams.size = ((size + 3) / 4) * 4; // Align - creationParams.usage = flags - | asset::IBuffer::EUF_STORAGE_BUFFER_BIT - | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT - | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; - - auto buffer = m_device->createBuffer(std::move(creationParams)); - nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); - if (hostVisible) - reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); - m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); - buffer->setObjectDebugName(name); - - return buffer; + video::IGPUBuffer::SCreationParams creationParams; + creationParams.size = ((size + 3) / 4) * 4; // Align + creationParams.usage = flags + | asset::IBuffer::EUF_STORAGE_BUFFER_BIT + | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT + | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; + + auto buffer = m_device->createBuffer(std::move(creationParams)); + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); + if (hostVisible) + reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + buffer->setObjectDebugName(name); + + return buffer; }; m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", false); @@ -260,30 +261,25 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat m_data.push_back(i); // this time we load a shader directly from a file - smart_refctd_ptr shader; + smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); lp.workingDirectory = ""; // virtual root - auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp); const auto assets = assetBundle.getContents(); if (assets.empty()) return logFail("Could not load shader!"); // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto source = IAsset::castDown(assets[0]); + auto source = IAsset::castDown(assets[0]); // The down-cast should not fail! 
assert(source); - IGPUObjectFromAssetConverter::SParams conversionParams = {}; - conversionParams.device = m_device.get(); - conversionParams.assetManager = m_assetMgr.get(); - created_gpu_object_array convertedGPUObjects = std::make_unique()->getGPUObjectsFromAssets(&source,&source+1,conversionParams); - if (convertedGPUObjects->empty() || !convertedGPUObjects->front()) - return logFail("Conversion of a CPU Specialized Shader to GPU failed!"); - - shader = convertedGPUObjects->front(); + // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple + shader = m_device->createShader(source.get()); + if (!shader) + return logFail("Creation of a GPU Shader to from CPU Shader source failed!"); } // The StreamingTransientDataBuffers are actually composed on top of another useful utility called `CAsyncSingleBufferSubAllocator` @@ -296,8 +292,8 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat return logFail("Failed to create Utilities!"); m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); - m_upStreamingBufferAddress = m_device->getBufferDeviceAddress(m_upStreamingBuffer->getBuffer()); - m_downStreamingBufferAddress = m_device->getBufferDeviceAddress(m_downStreamingBuffer->getBuffer()); + m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); + m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); // People love Reflection but I prefer Shader Sources instead! const nbl::asset::SPushConstantRange pcRange = {.stageFlags=IShader::ESS_COMPUTE,.offset=0,.size=sizeof(PushConstantData)}; @@ -307,7 +303,14 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic // only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding. // Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size. - m_pipeline = m_device->createComputePipeline(nullptr,m_device->createPipelineLayout(&pcRange,&pcRange+1),std::move(shader)); + { + auto layout = m_device->createPipelineLayout({&pcRange,1}); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = shader.get(); + if (!m_device->createComputePipelines(nullptr,{¶ms,1},&m_pipeline)) + return logFail("Failed to create compute pipeline!\n"); + } const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices @@ -321,9 +324,12 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are // the amount of memory in the streaming buffers and the number of commandpools we can use simultaenously. 
constexpr auto MaxConcurrency = 64; + // Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag - m_poolCache = make_smart_refctd_ptr(m_device.get(),getComputeQueue()->getFamilyIndex(), IGPUCommandPool::ECF_NONE, MaxConcurrency); + m_poolCache = ICommandPoolCache::create(core::smart_refctd_ptr(m_device),getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::NONE,MaxConcurrency); + // In contrast to fences, we just need one semaphore to rule all dispatches + m_timeline = m_device->createSemaphore(m_iteration); return true; } @@ -334,7 +340,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat void workLoopBody() override { m_iteration--; - IGPUQueue* const queue = getComputeQueue(); + IQueue* const queue = getComputeQueue(); // Obtain our command pool once one gets recycled uint32_t poolIx; @@ -345,9 +351,9 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat smart_refctd_ptr cmdbuf; { - m_device->createCommandBuffers(m_poolCache->getPool(poolIx),IGPUCommandBuffer::EL_PRIMARY,1,&cmdbuf); + m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger)); // lets record, its still a one time submit because we have to re-record with different push constants each time - cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); cmdbuf->bindComputePipeline(m_pipeline.get()); // COMMAND RECORDING @@ -355,7 +361,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat uint32_t maxUpload = 65536; for (uint32_t offset = 0; offset < dataSize; offset += maxUpload) { - cmdbuf->updateBuffer(m_transferSrcBuffer.get(), offset, maxUpload, &m_data[offset / sizeof(uint16_t)]); + cmdbuf->updateBuffer({ offset, maxUpload, core::smart_refctd_ptr(m_transferSrcBuffer) }, &m_data[offset / sizeof(uint16_t)]); } CPropertyPoolHandler::TransferRequest transferRequest; transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; @@ -363,7 +369,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat transferRequest.elementCount = 1; transferRequest.buffer = asset::SBufferBinding { 0, core::smart_refctd_ptr(m_transferDstBuffer) }; - m_propertyPoolHandler->transferProperties(cmdbuf.get(), nullptr, + m_propertyPoolHandler->transferProperties(cmdbuf.get(), asset::SBufferBinding{0, core::smart_refctd_ptr(m_scratchBuffer)}, asset::SBufferBinding{0, core::smart_refctd_ptr(m_addressBuffer)}, &transferRequest, &transferRequest + 1, @@ -373,36 +379,49 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat cmdbuf->end(); } - // TODO: redo with a single timeline semaphore - auto fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); + + const auto savedIterNum = m_iteration++; { - IGPUQueue::SSubmitInfo submitInfo = {}; - submitInfo.commandBufferCount = 1; - submitInfo.commandBuffers = &cmdbuf.get(); + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = + { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = + { + .semaphore = m_timeline.get(), + .value = m_iteration, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + // Generally speaking we don't need to wait on any semaphore because in this example every dispatch gets its own 
clean piece of memory to use + // from the point of view of the GPU. Implicit domain operations between Host and Device happen upon a submit and a semaphore/fence signal operation, + // this ensures we can touch the input and get accurate values from the output memory using the CPU before and after respectively, each submit becoming PENDING. + // If we actually cared about this submit seeing the memory accesses of a previous dispatch we could add a semaphore wait + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {}, + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signalInfo,1} + }; queue->startCapture(); - queue->submit(1u,&submitInfo,fence.get()); + queue->submit({ &submitInfo,1 }); queue->endCapture(); } { - // Readback ds - auto mem = m_transferDstBuffer->getBoundMemory(); - assert(mem->isMappable()); - auto ptr = m_device->mapMemory(nbl::video::IDeviceMemoryAllocation::MappedMemoryRange(mem, 0, mem->getAllocationSize()), video::IDeviceMemoryAllocation::EMCAF_READ); - auto uint16_t_ptr = static_cast(ptr); - - for (uint32_t i = 0; i < 128; i++) - { - uint16_t value = uint16_t_ptr[i]; - std::printf("%i, ", value); - } - std::printf("\n"); - m_device->unmapMemory(mem); + //// Readback ds + //auto mem = m_transferDstBuffer->getBoundMemory(); + //assert(mem->isMappable()); + //auto ptr = m_device->mapMemory(nbl::video::IDeviceMemoryAllocation::MappedMemoryRange(mem, 0, mem->getAllocationSize()), video::IDeviceMemoryAllocation::EMCAF_READ); + //auto uint16_t_ptr = static_cast(ptr); + + //for (uint32_t i = 0; i < 128; i++) + //{ + // uint16_t value = uint16_t_ptr[i]; + // std::printf("%i, ", value); + //} + //std::printf("\n"); + //m_device->unmapMemory(mem); } - - // We can also actually latch our Command Pool reset and its return to the pool of free pools! 
- m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx); } bool onAppTerminated() override From e8e512f027614057749fd6ff483c8e98be407a15 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sat, 27 Jan 2024 21:35:00 -0300 Subject: [PATCH 06/13] Update property pool example for vulkan_1_3 --- 66_PropertyPools/main.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index d3d9822cd..ff1e47b77 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -334,12 +334,11 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat } // Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script) - bool keepRunning() override { return m_iteration; } + bool keepRunning() override { return m_iterationgetBoundMemory(); - //assert(mem->isMappable()); - //auto ptr = m_device->mapMemory(nbl::video::IDeviceMemoryAllocation::MappedMemoryRange(mem, 0, mem->getAllocationSize()), video::IDeviceMemoryAllocation::EMCAF_READ); - //auto uint16_t_ptr = static_cast(ptr); - - //for (uint32_t i = 0; i < 128; i++) - //{ - // uint16_t value = uint16_t_ptr[i]; - // std::printf("%i, ", value); - //} - //std::printf("\n"); - //m_device->unmapMemory(mem); + // Readback ds + auto mem = m_transferDstBuffer->getBoundMemory(); + void* ptr = mem.memory->map({ mem.offset, mem.memory->getAllocationSize() }); + + auto uint16_t_ptr = reinterpret_cast(ptr); + + for (uint32_t i = 0; i < 128; i++) + { + uint16_t value = uint16_t_ptr[i]; + std::printf("%i, ", value); + } + std::printf("\n"); + bool success = mem.memory->unmap(); + assert(success); } } From f8340306a6a29089fce5a9a106bca20a5ad336c1 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Mon, 29 Jan 2024 15:12:00 -0300 Subject: [PATCH 07/13] WIP testing --- 66_PropertyPools/main.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index ff1e47b77..5230ae552 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -252,7 +252,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat return buffer; }; - m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", false); + m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", true); m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_addressBuffer", false); m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_transferSrcBuffer", false); m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_transferDstBuffer", true); @@ -408,17 +408,18 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat { // Readback ds - auto mem = m_transferDstBuffer->getBoundMemory(); + auto mem = m_scratchBuffer->getBoundMemory(); void* ptr = mem.memory->map({ mem.offset, mem.memory->getAllocationSize() }); - auto uint16_t_ptr = reinterpret_cast(ptr); - - for (uint32_t i = 0; i 
< 128; i++) + for (uint32_t i = 0; i < sizeof(nbl::hlsl::property_pools::TransferRequest) * 10; i++) { - uint16_t value = uint16_t_ptr[i]; + uint16_t value = reinterpret_cast(ptr)[i]; std::printf("%i, ", value); } std::printf("\n"); + std::printf("should be %I64i: %I64i\n", m_transferSrcBuffer->getDeviceAddress(), reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); + std::printf("should be %I64i: %I64i\n", m_transferDstBuffer->getDeviceAddress(), reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); + std::printf("should be 3: %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 5)[0]); bool success = mem.memory->unmap(); assert(success); } From 9682dee73e84b105b0df8a09504cdbea7532a312 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Mon, 12 Feb 2024 17:44:06 -0300 Subject: [PATCH 08/13] WIP suballocated descriptor set --- 66_PropertyPools/main.cpp | 42 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index 5230ae552..443979b02 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -4,6 +4,7 @@ #include "nbl/video/surface/CSurfaceVulkan.h" +#include "nbl/video/alloc/SubAllocatedDescriptorSet.h" #include "../common/BasicMultiQueueApplication.hpp" #include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" @@ -202,6 +203,8 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat smart_refctd_ptr m_transferDstBuffer; std::vector m_data; + smart_refctd_ptr>> m_subAllocDescriptorSet; + // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) uint32_t m_alignment; @@ -217,6 +220,10 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat static constexpr uint64_t TransfersAmount = 1024; static constexpr uint64_t MaxValuesPerTransfer = 512; + constexpr static inline uint32_t maxDescriptorSetAllocationAlignment = 64u*1024u; // if you need larger alignments then you're not right in the head + constexpr static inline uint32_t minDescriptorSetAllocationSize = 1u; + + public: // Yay thanks to multiple inheritance we cannot forward ctors anymore PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : @@ -225,6 +232,8 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // we stuff all our work here because its a "single shot" app bool onAppInitialized(smart_refctd_ptr&& system) override { + using nbl::video::IGPUDescriptorSetLayout; + // Remember to call the base class initialization! 
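A small portability note on the %I64i diagnostics in the hunk above: that length prefix is MSVC specific. If these debug prints ever need to compile elsewhere, the <cinttypes> macros express the same width; a possible equivalent, where the srcAddr label and hex formatting are only illustrative:

	#include <cinttypes> // PRIx64 / PRIu64

	std::printf("srcAddr 0x%" PRIx64 " (low 3 bits: %" PRIu64 ")\n",
		m_transferSrcBuffer->getDeviceAddress(),
		m_transferSrcBuffer->getDeviceAddress() & 7ull);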
if (!device_base_t::onAppInitialized(std::move(system))) return false; @@ -330,6 +339,35 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // In contrast to fences, we just need one semaphore to rule all dispatches m_timeline = m_device->createSemaphore(m_iteration); + + + // Descriptor set sub allocator + + video::IGPUDescriptorSetLayout::SBinding bindings[1]; + { + bindings[0].binding = 0; + bindings[0].count = 65535u; + bindings[0].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) + | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT + | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT; + bindings[0].type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE; + bindings[0].stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE; + } + + std::span bindingsSpan(bindings); + + // TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1) + auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr>>( + bindings, maxDescriptorSetAllocationAlignment, minDescriptorSetAllocationSize + ); + + uint32_t allocation = -1; + uint32_t size = 10; + uint32_t alignment = 1; + subAllocatedDescriptorSet->multi_allocate(1, &allocation, &size, &alignment); + m_logger->log("Allocation: %d\n", system::ILogger::ELL_ERROR, allocation); + assert(allocation); + return true; } @@ -417,8 +455,8 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat std::printf("%i, ", value); } std::printf("\n"); - std::printf("should be %I64i: %I64i\n", m_transferSrcBuffer->getDeviceAddress(), reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); - std::printf("should be %I64i: %I64i\n", m_transferDstBuffer->getDeviceAddress(), reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); + std::printf("should be %I64i (alignment: %I64i): %I64i\n", m_transferSrcBuffer->getDeviceAddress(), m_transferSrcBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); + std::printf("should be %I64i (alignment: %I64i): %I64i\n", m_transferDstBuffer->getDeviceAddress(), m_transferDstBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); std::printf("should be 3: %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 5)[0]); bool success = mem.memory->unmap(); assert(success); From 48be8e8350826ef75142eb92d07c17c472337fa4 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Tue, 13 Feb 2024 10:23:49 -0300 Subject: [PATCH 09/13] Testing sub allocator descriptor set allocations --- 66_PropertyPools/main.cpp | 48 +++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index 443979b02..b6b82f754 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -346,7 +346,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat video::IGPUDescriptorSetLayout::SBinding bindings[1]; { bindings[0].binding = 0; - bindings[0].count = 65535u; + bindings[0].count = 65536u; bindings[0].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT; @@ -361,12 +361,46 @@ class PropertyPoolsApp final : public 
examples::SingleNonResizableWindowApplicat bindings, maxDescriptorSetAllocationAlignment, minDescriptorSetAllocationSize ); - uint32_t allocation = -1; - uint32_t size = 10; - uint32_t alignment = 1; - subAllocatedDescriptorSet->multi_allocate(1, &allocation, &size, &alignment); - m_logger->log("Allocation: %d\n", system::ILogger::ELL_ERROR, allocation); - assert(allocation); + std::vector allocation, size; + { + for (uint32_t i = 0; i < 512; i++) + { + allocation.push_back(core::GeneralpurposeAddressAllocator::invalid_address); + size.push_back(4); + } + subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]); + for (uint32_t i = 0; i < allocation.size(); i++) + { + m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]); + assert(allocation[i] != core::GeneralpurposeAddressAllocator::invalid_address); + } + } + { + std::vector addr, freeSize; + for (uint32_t i = 0; i < 512; i+=2) + { + addr.push_back(allocation[i]); + freeSize.push_back(4); + } + subAllocatedDescriptorSet->multi_deallocate(addr.size(), &addr[0], &freeSize[0]); + } + + m_logger->log("Freed some allocations", system::ILogger::ELL_INFO); + allocation.clear(); + size.clear(); + { + for (uint32_t i = 0; i < 512; i++) + { + allocation.push_back(core::GeneralpurposeAddressAllocator::invalid_address); + size.push_back(2); + } + subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]); + for (uint32_t i = 0; i < allocation.size(); i++) + { + m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]); + assert(allocation[i] != core::GeneralpurposeAddressAllocator::invalid_address); + } + } return true; } From 7bc9f35bf054711e92f0fef1ed4d4df5f62bcb31 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 14 Feb 2024 16:31:40 -0300 Subject: [PATCH 10/13] Work on property pool example fixes --- 66_PropertyPools/main.cpp | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index b6b82f754..dc16dfeae 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -436,9 +436,11 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat } CPropertyPoolHandler::TransferRequest transferRequest; transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; - transferRequest.elementSize = m_data.size(); - transferRequest.elementCount = 1; + transferRequest.elementSize = 1; + transferRequest.elementCount = m_data.size(); transferRequest.buffer = asset::SBufferBinding { 0, core::smart_refctd_ptr(m_transferDstBuffer) }; + transferRequest.srcAddressesOffset = IPropertyPool::invalid; + transferRequest.dstAddressesOffset = IPropertyPool::invalid; m_propertyPoolHandler->transferProperties(cmdbuf.get(), asset::SBufferBinding{0, core::smart_refctd_ptr(m_scratchBuffer)}, @@ -447,7 +449,8 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat m_logger.get(), 0, MaxValuesPerTransfer ); - cmdbuf->end(); + auto result = cmdbuf->end(); + assert(result); } @@ -474,13 +477,18 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat }; queue->startCapture(); - queue->submit({ &submitInfo,1 }); + auto statusCode = queue->submit({ &submitInfo,1 }); queue->endCapture(); + assert(statusCode == IQueue::RESULT::SUCCESS); } { + ISemaphore::SWaitInfo infos[1] = 
{{.semaphore=m_timeline.get(),.value=m_iteration}}; + m_device->blockForSemaphores(infos); // Readback ds - auto mem = m_scratchBuffer->getBoundMemory(); + // TODO: This should readback the m_transferDstBuffer instead + // (we'll read back the destination buffer and check that copy went through as expected) + auto mem = m_transferDstBuffer->getBoundMemory(); // Scratch buffer has the transfer requests void* ptr = mem.memory->map({ mem.offset, mem.memory->getAllocationSize() }); for (uint32_t i = 0; i < sizeof(nbl::hlsl::property_pools::TransferRequest) * 10; i++) @@ -489,9 +497,15 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat std::printf("%i, ", value); } std::printf("\n"); - std::printf("should be %I64i (alignment: %I64i): %I64i\n", m_transferSrcBuffer->getDeviceAddress(), m_transferSrcBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); - std::printf("should be %I64i (alignment: %I64i): %I64i\n", m_transferDstBuffer->getDeviceAddress(), m_transferDstBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); - std::printf("should be 3: %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 5)[0]); + //std::printf("srcAddr %I64i (alignment: %I64i): %I64i\n", m_transferSrcBuffer->getDeviceAddress(), m_transferSrcBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); + //std::printf("dstAddr %I64i (alignment: %I64i): %I64i\n", m_transferDstBuffer->getDeviceAddress(), m_transferDstBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); + //std::printf("srcIndexAddr %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 5)[0]); + //std::printf("dstIndexAddr %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 6)[0]); + //std::printf("elementCount %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 7)[0]); + //std::printf("propertySize %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 8)[0]); + //std::printf("fill %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 9)[0]); + //std::printf("srcIndexSizeLog2 %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 10)[0]); + //std::printf("dstIndexSizeLog2 %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 11)[0]); bool success = mem.memory->unmap(); assert(success); } From 102aa472c52581ce1297c234df7ac3f73c74c7cf Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 14 Feb 2024 22:26:00 -0300 Subject: [PATCH 11/13] WIP example --- 66_PropertyPools/main.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index dc16dfeae..f17d7cf58 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -446,7 +446,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat asset::SBufferBinding{0, core::smart_refctd_ptr(m_scratchBuffer)}, asset::SBufferBinding{0, core::smart_refctd_ptr(m_addressBuffer)}, &transferRequest, &transferRequest + 1, - m_logger.get(), 0, MaxValuesPerTransfer + m_logger.get(), 0, m_data.size() ); auto result = cmdbuf->end(); @@ -485,27 +485,21 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat { ISemaphore::SWaitInfo infos[1] = {{.semaphore=m_timeline.get(),.value=m_iteration}}; m_device->blockForSemaphores(infos); + // Readback ds - // TODO: This should readback the m_transferDstBuffer instead // (we'll read back the destination buffer and check that copy went through as expected) auto mem = 
m_transferDstBuffer->getBoundMemory(); // Scratch buffer has the transfer requests void* ptr = mem.memory->map({ mem.offset, mem.memory->getAllocationSize() }); - for (uint32_t i = 0; i < sizeof(nbl::hlsl::property_pools::TransferRequest) * 10; i++) + for (uint32_t i = 0; i < 1024; /*m_data.size();*/ i++) { - uint16_t value = reinterpret_cast(ptr)[i]; - std::printf("%i, ", value); + uint16_t expected = reinterpret_cast(ptr)[i]; + uint16_t actual = m_data[i]; + std::printf("%i, ", expected); + //assert(expected == actual); } std::printf("\n"); - //std::printf("srcAddr %I64i (alignment: %I64i): %I64i\n", m_transferSrcBuffer->getDeviceAddress(), m_transferSrcBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); - //std::printf("dstAddr %I64i (alignment: %I64i): %I64i\n", m_transferDstBuffer->getDeviceAddress(), m_transferDstBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); - //std::printf("srcIndexAddr %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 5)[0]); - //std::printf("dstIndexAddr %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 6)[0]); - //std::printf("elementCount %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 7)[0]); - //std::printf("propertySize %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 8)[0]); - //std::printf("fill %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 9)[0]); - //std::printf("srcIndexSizeLog2 %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 10)[0]); - //std::printf("dstIndexSizeLog2 %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 11)[0]); + _NBL_DEBUG_BREAK_IF(true); bool success = mem.memory->unmap(); assert(success); } From ac178253475bf4a7fd172d6cb54d39894c847822 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Fri, 16 Feb 2024 15:46:20 -0300 Subject: [PATCH 12/13] Remove unused things from example 05 --- 66_PropertyPools/app_resources/common.hlsl | 2 - 66_PropertyPools/main.cpp | 187 +-------------------- 2 files changed, 3 insertions(+), 186 deletions(-) diff --git a/66_PropertyPools/app_resources/common.hlsl b/66_PropertyPools/app_resources/common.hlsl index 6f339aa13..456dc6740 100644 --- a/66_PropertyPools/app_resources/common.hlsl +++ b/66_PropertyPools/app_resources/common.hlsl @@ -16,7 +16,5 @@ struct PushConstantData NBL_CONSTEXPR uint32_t WorkgroupSize = 256; -#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" - // Yes we do have our own re-creation of C++'s STL in HLSL2021 ! #include "nbl/builtin/hlsl/limits.hlsl" \ No newline at end of file diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index f17d7cf58..c69a6abef 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -158,55 +158,12 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat using device_base_t = examples::MonoDeviceApplication; using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; - // This is the first example that submits multiple workloads in-flight. - // What the shader does is it computes the minimum distance of each point against K other random input points. - // Having the GPU randomly access parts of the buffer requires it to be DEVICE_LOCAL for performance. - // Then the CPU downloads the results and finds the median minimum distance via quick-select. - // This bizzare synthetic workload was specifically chosen for its unfriendliness towards simple buffer usage. 
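A naming nit on the verification loop introduced a few hunks up: by convention, expected would be the CPU side source data and actual the value read back from the GPU, which is the reverse of how the patch names them. A clearer form of the same check could be the following, where the pointer type in the cast is an assumption mirroring the uint16_t element data:

	const uint16_t expected = m_data[i];                                  // what the CPU uploaded
	const uint16_t actual   = reinterpret_cast<const uint16_t*>(ptr)[i]; // what the GPU transfer wrote
	assert(actual == expected);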
- // The fact we have variable sized workloads and run them in a loop means we either have to dynamically - // suballocate from a single buffer or have K worst-case sized buffers we round robin for K-workloads in flight. - // Creating and destroying buffers at runtime is not an option as those are very expensive operations. - // Also since CPU needs to heapify the outputs, we need to have the GPU write them into RAM not VRAM. - smart_refctd_ptr m_pipeline; - - // The Utility class has lots of methods to handle staging without relying on ReBAR or EXT_host_image_copy as well as more complex methods we'll cover later. - // Until EXT_host_image_copy becomes ubiquitous across all Nabla Core Profile devices, you need to stage image copies from an IGPUBuffer to an IGPUImage. - // Why use Staging for buffers in the age of ReBAR? While GPU workloads overlap the CPU, individual GPU workloads's execution might not overlap each other - // but their data might. In this case you want to "precisely" time the data update on the GPU timeline between the end and start of a workload. - // For very small updates you could use the commandbuffer updateBuffer method, but it has a size limit and the data enqueued takes up space in the commandpool. - // Sometimes it might be unfeasible to either have multiple copies or update references to those copies without a cascade update. - // One example is the transformation graph of nodes in a scene, where a copy-on-write of a node would require the update the offset/pointer held by - // any other node that refers to it. This quickly turns into a cascade that would force you to basically create a full copy of the entire data structure - // after most updates. Whereas with staging you'd "queue up" the much smaller set of updates to apply between each computation step which uses the graph. - // Another example are UBO and SSBO bindings, where once you run out of dynamic bindings, you can no longer easily change offsets without introducting extra indirection in shaders. - // Actually staging can help you re-use a commandbuffer because you don't need to re-record it if you don't need to change the offsets at which you bind! - // Finally ReBAR is a precious resource, my 8GB RTX 3070 only reports a 214MB Heap backing HOST_VISIBLE and DEVICE_LOCAL device local memory type. - smart_refctd_ptr m_utils; - - // We call them downstreaming and upstreaming, simply by how we used them so far. - // Meaning that upstreaming is uncached and usually ReBAR (DEVICE_LOCAL), for simple memcpy like sequential writes. - // While the downstreaming is CACHED and not DEVICE_LOCAL for fast random acess by the CPU. - // However there are cases when you'd want to use a buffer with flags identical to the default downstreaming buffer for uploads, - // such cases is when a CPU needs to build a data-structure in-place (due to memory constraints) before GPU accesses it, - // one example are Host Acceleration Structure builds (BVH building requires lots of repeated memory accesses). - // When choosing the memory properties of a mapped buffer consider which processor (CPU or GPU) needs faster access in event of a cache-miss. 
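In plain Vulkan terms, the upstreaming versus downstreaming distinction drawn above mostly comes down to which memory property flags the backing allocation requests; a rough mapping using the standard flags rather than the Nabla wrappers (assumes the usual vulkan_core.h header):

	#include <vulkan/vulkan_core.h>

	// Upload (upstreaming): CPU writes sequentially, GPU reads; coherent, ideally ReBAR device-local.
	const VkMemoryPropertyFlags uploadFlags   = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
	                                          | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
	                                          | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
	// Readback (downstreaming): GPU writes, CPU reads back at random, so cached host pages matter most.
	const VkMemoryPropertyFlags readbackFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
	                                          | VK_MEMORY_PROPERTY_HOST_CACHED_BIT;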
- nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer; - StreamingTransientDataBufferMT<>* m_downStreamingBuffer; - // These are Buffer Device Addresses - uint64_t m_upStreamingBufferAddress; - uint64_t m_downStreamingBufferAddress; - smart_refctd_ptr m_propertyPoolHandler; smart_refctd_ptr m_scratchBuffer; smart_refctd_ptr m_addressBuffer; smart_refctd_ptr m_transferSrcBuffer; smart_refctd_ptr m_transferDstBuffer; std::vector m_data; - - smart_refctd_ptr>> m_subAllocDescriptorSet; - - // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) - uint32_t m_alignment; // The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished. // Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools. @@ -220,9 +177,6 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat static constexpr uint64_t TransfersAmount = 1024; static constexpr uint64_t MaxValuesPerTransfer = 512; - constexpr static inline uint32_t maxDescriptorSetAllocationAlignment = 64u*1024u; // if you need larger alignments then you're not right in the head - constexpr static inline uint32_t minDescriptorSetAllocationSize = 1u; - public: // Yay thanks to multiple inheritance we cannot forward ctors anymore @@ -269,67 +223,6 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat for (uint16_t i = 0; i < uint16_t((uint32_t(1) << 16) - 1); i++) m_data.push_back(i); - // this time we load a shader directly from a file - smart_refctd_ptr shader; - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = ""; // virtual root - auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - return logFail("Could not load shader!"); - - // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto source = IAsset::castDown(assets[0]); - // The down-cast should not fail! - assert(source); - - // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple - shader = m_device->createShader(source.get()); - if (!shader) - return logFail("Creation of a GPU Shader to from CPU Shader source failed!"); - } - - // The StreamingTransientDataBuffers are actually composed on top of another useful utility called `CAsyncSingleBufferSubAllocator` - // The difference is that the streaming ones are made on top of ranges of `IGPUBuffer`s backed by mappable memory, whereas the - // `CAsyncSingleBufferSubAllocator` just allows you suballocate subranges of any `IGPUBuffer` range with deferred/latched frees. 
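The deferred or latched frees mentioned above are easy to picture with a toy wrapper: a retired range only goes back to the address allocator once the timeline semaphore has passed the value that was still pending when the range was given up. This is an illustration of the idea only, not the CAsyncSingleBufferSubAllocator API, and freeBlock is a hypothetical stand-in for whatever recycles the range:

	#include <cstdint>
	#include <deque>

	void freeBlock(uint32_t offset, uint32_t size); // hypothetical: returns the range to the address allocator

	struct DeferredFree { uint64_t timelineValue; uint32_t offset; uint32_t size; };
	std::deque<DeferredFree> retired;

	// Call with the counter value the timeline semaphore has actually reached.
	void collectRetired(const uint64_t reachedValue)
	{
		while (!retired.empty() && retired.front().timelineValue <= reachedValue)
		{
			freeBlock(retired.front().offset, retired.front().size);
			retired.pop_front();
		}
	}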
- constexpr uint32_t DownstreamBufferSize = sizeof(output_t)<<24; - constexpr uint32_t UpstreamBufferSize = sizeof(input_t)<<24; - m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device),smart_refctd_ptr(m_logger),DownstreamBufferSize,UpstreamBufferSize); - if (!m_utils) - return logFail("Failed to create Utilities!"); - m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); - m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); - m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); - m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); - - // People love Reflection but I prefer Shader Sources instead! - const nbl::asset::SPushConstantRange pcRange = {.stageFlags=IShader::ESS_COMPUTE,.offset=0,.size=sizeof(PushConstantData)}; - - // This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size - // and using traditional SSBO bindings would force us to update the Descriptor Set every frame. - // I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic - // only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding. - // Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size. - { - auto layout = m_device->createPipelineLayout({&pcRange,1}); - IGPUComputePipeline::SCreationParams params = {}; - params.layout = layout.get(); - params.shader.shader = shader.get(); - if (!m_device->createComputePipelines(nullptr,{¶ms,1},&m_pipeline)) - return logFail("Failed to create compute pipeline!\n"); - } - - const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); - // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices - // which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets. - // Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc. - // there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those. - // We'll align to max of coherent atom size even if the memory is coherent, - // and we also need to take into account BDA shader loads need to be aligned to the type being loaded. - m_alignment = core::max(deviceLimits.nonCoherentAtomSize,alignof(float)); - // We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are // the amount of memory in the streaming buffers and the number of commandpools we can use simultaenously. 
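One way to read the MaxConcurrency constant that follows: before recording a new iteration, the CPU only needs to be sure the iteration that used the same pool MaxConcurrency submits ago has retired, and a single timeline semaphore can express that back pressure with the same blockForSemaphores call the readback hunks above use. A sketch, with the exact arithmetic between m_iteration and the signalled value treated as an assumption:

	if (m_iteration > MaxConcurrency)
	{
		const ISemaphore::SWaitInfo backpressure[] = {{
			.semaphore = m_timeline.get(),
			.value = m_iteration - MaxConcurrency // roughly the value signalled MaxConcurrency submits ago
		}};
		m_device->blockForSemaphores(backpressure);
	}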
constexpr auto MaxConcurrency = 64; @@ -339,69 +232,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // In contrast to fences, we just need one semaphore to rule all dispatches m_timeline = m_device->createSemaphore(m_iteration); - - - // Descriptor set sub allocator - - video::IGPUDescriptorSetLayout::SBinding bindings[1]; - { - bindings[0].binding = 0; - bindings[0].count = 65536u; - bindings[0].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) - | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT - | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT; - bindings[0].type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE; - bindings[0].stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE; - } - - std::span bindingsSpan(bindings); - - // TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1) - auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr>>( - bindings, maxDescriptorSetAllocationAlignment, minDescriptorSetAllocationSize - ); - - std::vector allocation, size; - { - for (uint32_t i = 0; i < 512; i++) - { - allocation.push_back(core::GeneralpurposeAddressAllocator::invalid_address); - size.push_back(4); - } - subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]); - for (uint32_t i = 0; i < allocation.size(); i++) - { - m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]); - assert(allocation[i] != core::GeneralpurposeAddressAllocator::invalid_address); - } - } - { - std::vector addr, freeSize; - for (uint32_t i = 0; i < 512; i+=2) - { - addr.push_back(allocation[i]); - freeSize.push_back(4); - } - subAllocatedDescriptorSet->multi_deallocate(addr.size(), &addr[0], &freeSize[0]); - } - - m_logger->log("Freed some allocations", system::ILogger::ELL_INFO); - allocation.clear(); - size.clear(); - { - for (uint32_t i = 0; i < 512; i++) - { - allocation.push_back(core::GeneralpurposeAddressAllocator::invalid_address); - size.push_back(2); - } - subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]); - for (uint32_t i = 0; i < allocation.size(); i++) - { - m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]); - assert(allocation[i] != core::GeneralpurposeAddressAllocator::invalid_address); - } - } - + return true; } @@ -425,7 +256,6 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger)); // lets record, its still a one time submit because we have to re-record with different push constants each time cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdbuf->bindComputePipeline(m_pipeline.get()); // COMMAND RECORDING uint32_t dataSize = (((sizeof(uint16_t) * m_data.size()) + 3) / 4) * 4; @@ -437,7 +267,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat CPropertyPoolHandler::TransferRequest transferRequest; transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; transferRequest.elementSize = 1; - transferRequest.elementCount = m_data.size(); + transferRequest.elementCount = (m_data.size() * sizeof(uint16_t)) / sizeof(uint32_t); transferRequest.buffer = asset::SBufferBinding 
{ 0, core::smart_refctd_ptr(m_transferDstBuffer) }; transferRequest.srcAddressesOffset = IPropertyPool::invalid; transferRequest.dstAddressesOffset = IPropertyPool::invalid; @@ -496,24 +326,13 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat uint16_t expected = reinterpret_cast(ptr)[i]; uint16_t actual = m_data[i]; std::printf("%i, ", expected); - //assert(expected == actual); + assert(expected == actual); } std::printf("\n"); - _NBL_DEBUG_BREAK_IF(true); bool success = mem.memory->unmap(); assert(success); } } - - bool onAppTerminated() override - { - // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` - // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) - while (m_downStreamingBuffer->cull_frees()) {} - - return device_base_t::onAppTerminated(); - } }; - NBL_MAIN_FUNC(PropertyPoolsApp) \ No newline at end of file From e7b1f9bc5236f457eaa27c2771d875f4564c95f7 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Fri, 16 Feb 2024 15:48:18 -0300 Subject: [PATCH 13/13] Remove window app stuff --- 66_PropertyPools/main.cpp | 135 +------------------------------------- 1 file changed, 1 insertion(+), 134 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index c69a6abef..2e28ca527 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -9,137 +9,6 @@ #include "../common/BasicMultiQueueApplication.hpp" #include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" -namespace nbl::examples -{ - -using namespace nbl; -using namespace core; -using namespace system; -using namespace ui; -using namespace asset; -using namespace video; - -// Virtual Inheritance because apps might end up doing diamond inheritance -class WindowedApplication : public virtual BasicMultiQueueApplication -{ - using base_t = BasicMultiQueueApplication; - - public: - using base_t::base_t; - - virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override - { - auto retval = base_t::getAPIFeaturesToEnable(); - // We only support one swapchain mode, surface, the other one is Display which we have not implemented yet. - retval.swapchainMode = video::E_SWAPCHAIN_MODE::ESM_SURFACE; - return retval; - } - - // New function, we neeed to know about surfaces to create ahead of time - virtual core::vector getSurfaces() const = 0; - - virtual core::set filterDevices(const core::SRange& physicalDevices) const - { - const auto firstFilter = base_t::filterDevices(physicalDevices); - - video::SPhysicalDeviceFilter deviceFilter = {}; - - const auto surfaces = getSurfaces(); - deviceFilter.requiredSurfaceCompatibilities = { surfaces.data(), surfaces.size() }; - - return deviceFilter(physicalDevices); - } - - virtual bool onAppInitialized(smart_refctd_ptr&& system) - { - // Remember to call the base class initialization! - if (!base_t::onAppInitialized(std::move(system))) - return false; - - #ifdef _NBL_PLATFORM_WINDOWS_ - m_winMgr = nbl::ui::IWindowManagerWin32::create(); - #else - #error "Unimplemented!" 
- #endif - } - - core::smart_refctd_ptr m_winMgr; -}; - - -// Before we get onto creating a window, we need to discuss how Nabla handles input, clipboards and cursor control -class IWindowClosedCallback : public virtual nbl::ui::IWindow::IEventCallback -{ - public: - IWindowClosedCallback() : m_gotWindowClosedMsg(false) {} - - // unless you create a separate callback per window, both will "trip" this condition - bool windowGotClosed() const {return m_gotWindowClosedMsg;} - - private: - bool onWindowClosed_impl() override - { - m_gotWindowClosedMsg = true; - return true; - } - - bool m_gotWindowClosedMsg; -}; - -// We inherit from an application that tries to find Graphics and Compute queues -// because applications with presentable images often want to perform Graphics family operations -// Virtual Inheritance because apps might end up doing diamond inheritance -class SingleNonResizableWindowApplication : public virtual WindowedApplication -{ - using base_t = WindowedApplication; - - protected: - virtual IWindow::SCreationParams getWindowCreationParams() const - { - IWindow::SCreationParams params = {}; - params.callback = make_smart_refctd_ptr(); - params.width = 640; - params.height = 480; - params.x = 32; - params.y = 32; - params.flags = IWindow::ECF_NONE; - params.windowCaption = "SingleNonResizableWindowApplication"; - return params; - } - - core::smart_refctd_ptr m_window; - core::smart_refctd_ptr m_surface; - - public: - using base_t::base_t; - - virtual bool onAppInitialized(smart_refctd_ptr&& system) override - { - // Remember to call the base class initialization! - if (!base_t::onAppInitialized(std::move(system))) - return false; - - m_window = m_winMgr->createWindow(getWindowCreationParams()); - m_surface = video::CSurfaceVulkanWin32::create(core::smart_refctd_ptr(m_api),core::smart_refctd_ptr_static_cast(m_window)); - return true; - } - - virtual core::vector getSurfaces() const - { - return {{m_surface.get()/*,EQF_NONE*/}}; - } - - virtual bool keepRunning() override - { - if (!m_window || reinterpret_cast(m_window->getEventCallback())->windowGotClosed()) - return false; - - return true; - } -}; -} - - using namespace nbl; using namespace core; using namespace system; @@ -147,13 +16,11 @@ using namespace ui; using namespace asset; using namespace video; - #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/bit.hlsl" - // In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants -class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +class PropertyPoolsApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = examples::MonoDeviceApplication; using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
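With the windowed keepRunning override removed, loop termination rests on the iteration bound that patch 06 switched to; its shape is just a comparison against an upper limit, where MaxIterations is a hypothetical name standing in for the example's real constant:

	// MaxIterations is a placeholder name, not necessarily what the example calls its bound.
	bool keepRunning() override { return m_iteration < MaxIterations; }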