From dddf5755d6a7bef8d78aba37949b8badd85594fa Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Tue, 16 Jan 2024 21:34:53 -0300 Subject: [PATCH 1/9] Work on property pools example --- 66_PropertyPools/CMakeLists.txt | 24 ++ 66_PropertyPools/app_resources/common.hlsl | 22 ++ .../app_resources/shader.comp.hlsl | 33 ++ 66_PropertyPools/config.json.template | 28 ++ 66_PropertyPools/main.cpp | 292 ++++++++++++++++++ 66_PropertyPools/pipeline.groovy | 50 +++ CMakeLists.txt | 1 + 7 files changed, 450 insertions(+) create mode 100644 66_PropertyPools/CMakeLists.txt create mode 100644 66_PropertyPools/app_resources/common.hlsl create mode 100644 66_PropertyPools/app_resources/shader.comp.hlsl create mode 100644 66_PropertyPools/config.json.template create mode 100644 66_PropertyPools/main.cpp create mode 100644 66_PropertyPools/pipeline.groovy diff --git a/66_PropertyPools/CMakeLists.txt b/66_PropertyPools/CMakeLists.txt new file mode 100644 index 000000000..bc1624875 --- /dev/null +++ b/66_PropertyPools/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/66_PropertyPools/app_resources/common.hlsl b/66_PropertyPools/app_resources/common.hlsl new file mode 100644 index 000000000..6f339aa13 --- /dev/null +++ b/66_PropertyPools/app_resources/common.hlsl @@ -0,0 +1,22 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +// Unfortunately not every piece of C++14 metaprogramming syntax is available in HLSL 202x +// https://github.com/microsoft/DirectXShaderCompiler/issues/5751#issuecomment-1800847954 +typedef nbl::hlsl::float32_t3 input_t; +typedef nbl::hlsl::float32_t output_t; + +NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxPossibleElementCount = 1 << 20; + +struct PushConstantData +{ + uint64_t inputAddress; + uint64_t outputAddress; + uint32_t dataElementCount; +}; + +NBL_CONSTEXPR uint32_t WorkgroupSize = 256; + +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" + +// Yes we do have our own re-creation of C++'s STL in HLSL2021 ! 
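+// For example, the include below gives us `nbl::hlsl::numeric_limits<float32_t>::max`, spelled the same way as `std::numeric_limits` in C++,
+// which the shader uses as the "infinity" seed of its min-reduction.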
+#include "nbl/builtin/hlsl/limits.hlsl" \ No newline at end of file diff --git a/66_PropertyPools/app_resources/shader.comp.hlsl b/66_PropertyPools/app_resources/shader.comp.hlsl new file mode 100644 index 000000000..4aeef0e0f --- /dev/null +++ b/66_PropertyPools/app_resources/shader.comp.hlsl @@ -0,0 +1,33 @@ +#include "common.hlsl" + +// just a small test +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +[[vk::push_constant]] PushConstantData pushConstants; + +// does absolutely nothing, a later example will show how it gets used +template +void dummyTraitTest() {} + +[numthreads(WorkgroupSize,1,1)] +void main(uint32_t3 ID : SV_DispatchThreadID) +{ + dummyTraitTest(); + if (ID.x>=pushConstants.dataElementCount) + return; + + const input_t self = vk::RawBufferLoad(pushConstants.inputAddress+sizeof(input_t)*ID.x); + + nbl::hlsl::Xoroshiro64StarStar rng = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(pushConstants.dataElementCount,ID.x)^0xdeadbeefu); + + float32_t acc = nbl::hlsl::numeric_limits::max; + const static uint32_t OthersToTest = 15; + [[unroll(OthersToTest)]] + for (uint32_t i=0; i(pushConstants.inputAddress+sizeof(input_t)*offset); + acc = min(length(other-self),acc); + } + vk::RawBufferStore(pushConstants.outputAddress+sizeof(float32_t)*ID.x,acc); +} \ No newline at end of file diff --git a/66_PropertyPools/config.json.template b/66_PropertyPools/config.json.template new file mode 100644 index 000000000..717d05d53 --- /dev/null +++ b/66_PropertyPools/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp new file mode 100644 index 000000000..155ece55b --- /dev/null +++ b/66_PropertyPools/main.cpp @@ -0,0 +1,292 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + + +// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. +#include "../common/MonoDeviceApplication.hpp" +#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +using namespace nbl; +using namespace core; +using namespace system; +using namespace asset; +using namespace video; + + +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" + + +// In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants +class PropertyPoolsApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::MonoDeviceApplication; + using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; + + // This is the first example that submits multiple workloads in-flight. + // What the shader does is it computes the minimum distance of each point against K other random input points. 
+ // Having the GPU randomly access parts of the buffer requires it to be DEVICE_LOCAL for performance.
+ // Then the CPU downloads the results and finds the median minimum distance via quick-select.
+ // This bizarre synthetic workload was specifically chosen for its unfriendliness towards simple buffer usage.
+ // The fact we have variable-sized workloads and run them in a loop means we either have to dynamically
+ // suballocate from a single buffer or have K worst-case sized buffers we round robin for K-workloads in flight.
+ // Creating and destroying buffers at runtime is not an option as those are very expensive operations.
+ // Also since the CPU needs to heapify the outputs, we need to have the GPU write them into RAM not VRAM.
+ smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
+
+ // The Utility class has lots of methods to handle staging without relying on ReBAR or EXT_host_image_copy as well as more complex methods we'll cover later.
+ // Until EXT_host_image_copy becomes ubiquitous across all Nabla Core Profile devices, you need to stage image copies from an IGPUBuffer to an IGPUImage.
+ // Why use Staging for buffers in the age of ReBAR? While GPU workloads overlap the CPU, individual GPU workloads' execution might not overlap each other
+ // but their data might. In this case you want to "precisely" time the data update on the GPU timeline between the end and start of a workload.
+ // For very small updates you could use the commandbuffer updateBuffer method, but it has a size limit and the data enqueued takes up space in the commandpool.
+ // Sometimes it might be unfeasible to either have multiple copies or update references to those copies without a cascade update.
+ // One example is the transformation graph of nodes in a scene, where a copy-on-write of a node would require updating the offset/pointer held by
+ // any other node that refers to it. This quickly turns into a cascade that would force you to basically create a full copy of the entire data structure
+ // after most updates. Whereas with staging you'd "queue up" the much smaller set of updates to apply between each computation step which uses the graph.
+ // Another example is UBO and SSBO bindings, where once you run out of dynamic bindings, you can no longer easily change offsets without introducing extra indirection in shaders.
+ // Actually staging can help you re-use a commandbuffer because you don't need to re-record it if you don't need to change the offsets at which you bind!
+ // Finally ReBAR is a precious resource, my 8GB RTX 3070 only reports a 214MB Heap backing the HOST_VISIBLE and DEVICE_LOCAL memory type.
+ smart_refctd_ptr<nbl::video::IUtilities> m_utils;
+
+ // We call them downstreaming and upstreaming, simply by how we used them so far.
+ // Meaning that upstreaming is uncached and usually ReBAR (DEVICE_LOCAL), for simple memcpy like sequential writes.
+ // While the downstreaming is CACHED and not DEVICE_LOCAL for fast random access by the CPU.
+ // However there are cases when you'd want to use a buffer with flags identical to the default downstreaming buffer for uploads,
+ // one such case is when the CPU needs to build a data-structure in-place (due to memory constraints) before the GPU accesses it,
+ // one example being Host Acceleration Structure builds (BVH building requires lots of repeated memory accesses).
+ // When choosing the memory properties of a mapped buffer consider which processor (CPU or GPU) needs faster access in the event of a cache-miss.
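+ // A minimal sketch of that decision if you were allocating such a mapped buffer yourself (illustrative only, the
+ // `cpuNeedsFastReadback` flag is made up; `getDownStreamingMemoryTypeBits` is real and used later in this series,
+ // while `getUpStreamingMemoryTypeBits` is assumed to exist by symmetry):
+ // auto reqs = buffer->getMemoryReqs();
+ // reqs.memoryTypeBits &= cpuNeedsFastReadback ? physDev->getDownStreamingMemoryTypeBits():physDev->getUpStreamingMemoryTypeBits();
+ // m_device->allocate(reqs,buffer.get());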
+ nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer;
+ StreamingTransientDataBufferMT<>* m_downStreamingBuffer;
+ // These are Buffer Device Addresses
+ uint64_t m_upStreamingBufferAddress;
+ uint64_t m_downStreamingBufferAddress;
+
+ // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers to give out offsets aligned to a certain multiple (not only Power of Two!)
+ uint32_t m_alignment;
+
+ // The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished.
+ // It's a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools.
+ smart_refctd_ptr<ICommandPoolCache> m_poolCache;
+
+ // We'll run the iterations in reverse, easier to write "keep running"
+ uint32_t m_iteration = 200;
+
+ public:
+ // Yay thanks to multiple inheritance we cannot forward ctors anymore
+ PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+ system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {}
+
+ // we stuff all our work here because it's a "single shot" app
+ bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+ {
+ // Remember to call the base class initialization!
+ if (!device_base_t::onAppInitialized(std::move(system)))
+ return false;
+ if (!asset_base_t::onAppInitialized(std::move(system)))
+ return false;
+
+ // this time we load a shader directly from a file
+ smart_refctd_ptr<IGPUSpecializedShader> shader;
+ {
+ IAssetLoader::SAssetLoadParams lp = {};
+ lp.logger = m_logger.get();
+ lp.workingDirectory = ""; // virtual root
+ auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp);
+ const auto assets = assetBundle.getContents();
+ if (assets.empty())
+ return logFail("Could not load shader!");
+
+ // let's go straight from ICPUSpecializedShader to IGPUSpecializedShader
+ auto source = IAsset::castDown<ICPUSpecializedShader>(assets[0]);
+ // The down-cast should not fail!
+ assert(source);
+
+ IGPUObjectFromAssetConverter::SParams conversionParams = {};
+ conversionParams.device = m_device.get();
+ conversionParams.assetManager = m_assetMgr.get();
+ created_gpu_object_array<ICPUSpecializedShader> convertedGPUObjects = std::make_unique<IGPUObjectFromAssetConverter>()->getGPUObjectsFromAssets(&source,&source+1,conversionParams);
+ if (convertedGPUObjects->empty() || !convertedGPUObjects->front())
+ return logFail("Conversion of a CPU Specialized Shader to GPU failed!");
+
+ shader = convertedGPUObjects->front();
+ }
+
+ // The StreamingTransientDataBuffers are actually composed on top of another useful utility called `CAsyncSingleBufferSubAllocator`
+ // The difference is that the streaming ones are made on top of ranges of `IGPUBuffer`s backed by mappable memory, whereas the
+ // `CAsyncSingleBufferSubAllocator` just allows you to suballocate subranges of any `IGPUBuffer` range with deferred/latched frees.
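+ // The `<<24` below means 2^24 elements of each type, so assuming `sizeof(output_t)==4` and `sizeof(input_t)==12` the
+ // streaming arenas work out to 64MB and 192MB respectively.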
+ constexpr uint32_t DownstreamBufferSize = sizeof(output_t)<<24;
+ constexpr uint32_t UpstreamBufferSize = sizeof(input_t)<<24;
+ m_utils = make_smart_refctd_ptr<nbl::video::IUtilities>(smart_refctd_ptr(m_device),smart_refctd_ptr(m_logger),DownstreamBufferSize,UpstreamBufferSize);
+ if (!m_utils)
+ return logFail("Failed to create Utilities!");
+ m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
+ m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
+ m_upStreamingBufferAddress = m_device->getBufferDeviceAddress(m_upStreamingBuffer->getBuffer());
+ m_downStreamingBufferAddress = m_device->getBufferDeviceAddress(m_downStreamingBuffer->getBuffer());
+
+ // People love Reflection but I prefer Shader Sources instead!
+ const nbl::asset::SPushConstantRange pcRange = {.stageFlags=IShader::ESS_COMPUTE,.offset=0,.size=sizeof(PushConstantData)};
+
+ // This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size
+ // and using traditional SSBO bindings would force us to update the Descriptor Set every frame.
+ // I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic,
+ // only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding.
+ // Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size.
+ m_pipeline = m_device->createComputePipeline(nullptr,m_device->createPipelineLayout(&pcRange,&pcRange+1),std::move(shader));
+
+ const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
+ // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices
+ // which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets.
+ // Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc.
+ // there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those.
+ // We'll align to the max of the non-coherent atom size even if the memory is coherent,
+ // and we also need to take into account that BDA shader loads need to be aligned to the type being loaded.
+ m_alignment = core::max(deviceLimits.nonCoherentAtomSize,alignof(float));
+
+ // We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are
+ // the amount of memory in the streaming buffers and the number of commandpools we can use simultaneously.
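+ // In practice this means up to `MaxConcurrency` iterations can be in flight at once, and the `acquirePool()` polling
+ // loop in `workLoopBody` below is what throttles us when all of them are still pending.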
+ constexpr auto MaxConcurrency = 64;
+ // Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag
+ m_poolCache = make_smart_refctd_ptr<ICommandPoolCache>(m_device.get(),getComputeQueue()->getFamilyIndex(), IGPUCommandPool::ECF_NONE, MaxConcurrency);
+
+ return true;
+ }
+
+ // Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script)
+ bool keepRunning() override { return m_iteration; }
+
+ // Finally the first actual work-loop
+ void workLoopBody() override
+ {
+ m_iteration--;
+ IGPUQueue* const queue = getComputeQueue();
+
+ // Note that I'm using the same struct with methods that have identical code which compiles as both C++ and HLSL
+ auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({m_iteration^0xdeadbeefu,std::hash<std::string>()(_NBL_APP_NAME_)});
+
+ // we dynamically choose the number of elements for each iteration
+ const auto elementCount = rng()%MaxPossibleElementCount;
+ const uint32_t inputSize = sizeof(input_t)*elementCount;
+
+ // The allocators can do multiple allocations at once for efficiency
+ const uint32_t AllocationCount = 1;
+ // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value,
+ // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args.
+ auto inputOffset = m_upStreamingBuffer->invalid_value;
+
+ // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled)
+ // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later).
+ std::chrono::steady_clock::time_point waitTill(std::chrono::years(45));
+ // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly
+ m_upStreamingBuffer->multi_allocate(waitTill,AllocationCount,&inputOffset,&inputSize,&m_alignment);
+
+ // Generate our data in-place on the allocated staging buffer
+ {
+ auto* const inputPtr = reinterpret_cast<input_t*>(reinterpret_cast<uint8_t*>(m_upStreamingBuffer->getBufferPointer())+inputOffset);
+ for (auto j=0; j<elementCount; j++)
+ {
+ inputPtr[j] = input_t(rng(),rng(),rng())/float(nbl::hlsl::numeric_limits<uint32_t>::max);
+ }
+ // Always remember to flush!
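+ // (a flush makes Host writes to non-coherent mapped memory visible to the Device, and the flushed range has to obey
+ // `nonCoherentAtomSize` alignment, which is exactly why `m_alignment` was computed the way it was)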
+ if (m_upStreamingBuffer->needsManualFlushOrInvalidate())
+ {
+ const IDeviceMemoryAllocation::MappedMemoryRange range(m_upStreamingBuffer->getBuffer()->getBoundMemory(),inputOffset,inputSize);
+ m_device->flushMappedMemoryRanges(1,&range);
+ }
+ }
+
+ // Obtain our command pool once one gets recycled
+ uint32_t poolIx;
+ do
+ {
+ poolIx = m_poolCache->acquirePool();
+ } while (poolIx==ICommandPoolCache::invalid_index);
+
+ // finally allocate our output range
+ const uint32_t outputSize = sizeof(output_t)*elementCount;
+ auto outputOffset = m_downStreamingBuffer->invalid_value;
+ m_downStreamingBuffer->multi_allocate(waitTill,AllocationCount,&outputOffset,&outputSize,&m_alignment);
+
+ smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+ {
+ m_device->createCommandBuffers(m_poolCache->getPool(poolIx),IGPUCommandBuffer::EL_PRIMARY,1,&cmdbuf);
+ // let's record, it's still a one time submit because we have to re-record with different push constants each time
+ cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);
+ cmdbuf->bindComputePipeline(m_pipeline.get());
+ // This is the new fun part, pushing constants
+ const PushConstantData pc = {
+ .inputAddress=m_upStreamingBufferAddress+inputOffset,
+ .outputAddress=m_downStreamingBufferAddress+outputOffset,
+ .dataElementCount=elementCount
+ };
+ cmdbuf->pushConstants(m_pipeline->getLayout(),IShader::ESS_COMPUTE,0u,sizeof(pc),&pc);
+ // Good old trick to get rounded up divisions, in case you're not familiar
+ cmdbuf->dispatch((elementCount-1)/WorkgroupSize+1,1,1);
+ cmdbuf->end();
+ }
+
+ // TODO: redo with a single timeline semaphore
+ auto fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED);
+ {
+ IGPUQueue::SSubmitInfo submitInfo = {};
+ submitInfo.commandBufferCount = 1;
+ submitInfo.commandBuffers = &cmdbuf.get();
+
+ queue->startCapture();
+ queue->submit(1u,&submitInfo,fence.get());
+ queue->endCapture();
+ }
+
+ // We can also actually latch our Command Pool reset and its return to the pool of free pools!
+ m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx);
+
+ // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled
+ // You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation.
+ m_upStreamingBuffer->multi_deallocate(AllocationCount,&inputOffset,&inputSize,smart_refctd_ptr(fence));
+
+ // Because C++17 and C++20 can't make their mind up about what to do with `this` in the event of a [=] capture, let's triple ensure the m_iteration is captured by value.
+ const auto savedIterNum = m_iteration;
+
+ // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer.
+ // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory.
+ // It's nice because it will also remember to invalidate our memory mapping if it's not coherent.
+ auto latchedConsumer = make_smart_refctd_ptr<IUtilities::CDownstreamingDataConsumer>(
+ IDeviceMemoryAllocation::MemoryRange(outputOffset,outputSize),
+ // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals
+ [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void
+ {
+ // These parameters are there to let the consumer know the subsection of the output we've managed to download
+ // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves.
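+ // (the `std::nth_element` below is an average-case O(N) quick-select, it only partitions around the median instead of fully sorting)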
+ assert(dstOffset==0 && size==outputSize); + + // I can const cast, we know the mapping is just a pointer + output_t* const data = reinterpret_cast(const_cast(bufSrc)); + auto median = data+elementCount/2; + std::nth_element(data,median,data+elementCount); + + m_logger->log("Iteration %d Median of Minimum Distances is %f",ILogger::ELL_PERFORMANCE,savedIterNum,*median); + }, + // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it + // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. + // It could also be latched in the upstreaming deallocate, because its the same fence. + std::move(cmdbuf),m_downStreamingBuffer + ); + // We put a function we want to execute + m_downStreamingBuffer->multi_deallocate(AllocationCount,&outputOffset,&outputSize,std::move(fence),&latchedConsumer.get()); + } + + bool onAppTerminated() override + { + // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` + // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) + while (m_downStreamingBuffer->cull_frees()) {} + + return device_base_t::onAppTerminated(); + } +}; + + +NBL_MAIN_FUNC(PropertyPoolsApp) \ No newline at end of file diff --git a/66_PropertyPools/pipeline.groovy b/66_PropertyPools/pipeline.groovy new file mode 100644 index 000000000..1a7b043a4 --- /dev/null +++ b/66_PropertyPools/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CStreamingAndBufferDeviceAddressBuilder extends IBuilder +{ + public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a20a33a9..09a73bfe0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,5 +65,6 @@ if(NBL_BUILD_EXAMPLES) #add_subdirectory(61_UI EXCLUDE_FROM_ALL) add_subdirectory(62_CAD EXCLUDE_FROM_ALL) add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL) + add_subdirectory(66_PropertyPools EXCLUDE_FROM_ALL) add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42 endif() \ No newline at end of file From 43d95c8cca36441dfdd754ba66f24b88ae18426b Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sun, 21 Jan 2024 10:58:54 -0300 Subject: [PATCH 2/9] Add creation of property pool handler to example --- 66_PropertyPools/main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git 
a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index 155ece55b..941536751 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -62,6 +62,8 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex uint64_t m_upStreamingBufferAddress; uint64_t m_downStreamingBufferAddress; + smart_refctd_ptr m_propertyPoolHandler; + // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) uint32_t m_alignment; @@ -86,12 +88,15 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex if (!asset_base_t::onAppInitialized(std::move(system))) return false; + m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); + // this time we load a shader directly from a file smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); lp.workingDirectory = ""; // virtual root + auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp); const auto assets = assetBundle.getContents(); if (assets.empty()) From 66e93fbb23c374e445ab3af66848a836b34052c1 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 24 Jan 2024 21:43:22 -0300 Subject: [PATCH 3/9] Work on doing transferProperties on example --- 66_PropertyPools/main.cpp | 129 +++++++++++++------------------------- 1 file changed, 45 insertions(+), 84 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index 941536751..e59f6385a 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -63,6 +63,11 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex uint64_t m_downStreamingBufferAddress; smart_refctd_ptr m_propertyPoolHandler; + smart_refctd_ptr m_scratchBuffer; + smart_refctd_ptr m_addressBuffer; + smart_refctd_ptr m_transferSrcBuffer; + smart_refctd_ptr m_transferDstBuffer; + std::vector m_data; // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) 
uint32_t m_alignment; @@ -74,6 +79,9 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex // We'll run the iterations in reverse, easier to write "keep running" uint32_t m_iteration = 200; + static constexpr uint64_t TransfersAmount = 1024; + static constexpr uint64_t MaxValuesPerTransfer = 512; + public: // Yay thanks to multiple inheritance we cannot forward ctors anymore PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : @@ -90,6 +98,27 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); + auto createBuffer = [&](uint64_t size) + { + video::IGPUBuffer::SCreationParams creationParams; + creationParams.size = size; + creationParams.usage = core::bitflag(asset::IBuffer::EUF_STORAGE_BUFFER_BIT) | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; + + auto buffer = m_device->createBuffer(std::move(creationParams)); + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); + m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + + return buffer; + }; + + m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount); + m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer); + m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); + m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); + + for (uint16_t i = 0; i < uint16_t((uint32_t(1) << 16) - 1); i++) + m_data.push_back(i); + // this time we load a shader directly from a file smart_refctd_ptr shader; { @@ -167,42 +196,6 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex m_iteration--; IGPUQueue* const queue = getComputeQueue(); - // Note that I'm using the sample struct with methods that have identical code which compiles as both C++ and HLSL - auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({m_iteration^0xdeadbeefu,std::hash()(_NBL_APP_NAME_)}); - - // we dynamically choose the number of elements for each iteration - const auto elementCount = rng()%MaxPossibleElementCount; - const uint32_t inputSize = sizeof(input_t)*elementCount; - - // The allocators can do multiple allocations at once for efficiency - const uint32_t AllocationCount = 1; - // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value - // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. - auto inputOffset = m_upStreamingBuffer->invalid_value; - - // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) - // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). 
- std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); - // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly - m_upStreamingBuffer->multi_allocate(waitTill,AllocationCount,&inputOffset,&inputSize,&m_alignment); - - // Generate our data in-place on the allocated staging buffer - { - auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer())+inputOffset); - for (auto j=0; j::max); - } - // Always remember to flush! - if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) - { - const IDeviceMemoryAllocation::MappedMemoryRange range(m_upStreamingBuffer->getBuffer()->getBoundMemory(),inputOffset,inputSize); - m_device->flushMappedMemoryRanges(1,&range); - } - } - // Obtain our command pool once one gets recycled uint32_t poolIx; do @@ -210,26 +203,28 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex poolIx = m_poolCache->acquirePool(); } while (poolIx==ICommandPoolCache::invalid_index); - // finally allocate our output range - const uint32_t outputSize = sizeof(output_t)*elementCount; - auto outputOffset = m_downStreamingBuffer->invalid_value; - m_downStreamingBuffer->multi_allocate(waitTill,AllocationCount,&outputOffset,&outputSize,&m_alignment); - smart_refctd_ptr cmdbuf; { m_device->createCommandBuffers(m_poolCache->getPool(poolIx),IGPUCommandBuffer::EL_PRIMARY,1,&cmdbuf); // lets record, its still a one time submit because we have to re-record with different push constants each time cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); cmdbuf->bindComputePipeline(m_pipeline.get()); - // This is the new fun part, pushing constants - const PushConstantData pc = { - .inputAddress=m_upStreamingBufferAddress+inputOffset, - .outputAddress=m_downStreamingBufferAddress+outputOffset, - .dataElementCount=elementCount - }; - cmdbuf->pushConstants(m_pipeline->getLayout(),IShader::ESS_COMPUTE,0u,sizeof(pc),&pc); - // Good old trick to get rounded up divisions, in case you're not familiar - cmdbuf->dispatch((elementCount-1)/WorkgroupSize+1,1,1); + + // COMMAND RECORDING + cmdbuf->updateBuffer(m_transferSrcBuffer.get(), 0, sizeof(uint16_t) * m_data.size(), &m_data[0]); + CPropertyPoolHandler::TransferRequest transferRequest; + transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; + transferRequest.elementSize = m_data.size(); + transferRequest.elementCount = 1; + transferRequest.buffer = asset::SBufferBinding { 0, core::smart_refctd_ptr(m_transferDstBuffer) }; + + m_propertyPoolHandler->transferProperties(cmdbuf.get(), nullptr, + asset::SBufferBinding{0, core::smart_refctd_ptr(m_scratchBuffer)}, + asset::SBufferBinding{0, core::smart_refctd_ptr(m_addressBuffer)}, + &transferRequest, &transferRequest + 1, + m_logger.get(), 0, MaxValuesPerTransfer + ); + cmdbuf->end(); } @@ -247,40 +242,6 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex // We can also actually latch our Command Pool reset and its return to the pool of free pools! m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx); - - // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled - // You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation. 
- m_upStreamingBuffer->multi_deallocate(AllocationCount,&inputOffset,&inputSize,smart_refctd_ptr(fence)); - - // Because C++17 and C++20 can't make their mind up about what to do with `this` in event of a [=] capture, lets triple ensure the m_iteration is captured by value. - const auto savedIterNum = m_iteration; - - // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. - // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. - // Its nice because it will also remember to invalidate our memory mapping if its not coherent. - auto latchedConsumer = make_smart_refctd_ptr( - IDeviceMemoryAllocation::MemoryRange(outputOffset,outputSize), - // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals - [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void - { - // The unused variable is used for letting the consumer know the subsection of the output we've managed to download - // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. - assert(dstOffset==0 && size==outputSize); - - // I can const cast, we know the mapping is just a pointer - output_t* const data = reinterpret_cast(const_cast(bufSrc)); - auto median = data+elementCount/2; - std::nth_element(data,median,data+elementCount); - - m_logger->log("Iteration %d Median of Minimum Distances is %f",ILogger::ELL_PERFORMANCE,savedIterNum,*median); - }, - // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it - // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. - // It could also be latched in the upstreaming deallocate, because its the same fence. - std::move(cmdbuf),m_downStreamingBuffer - ); - // We put a function we want to execute - m_downStreamingBuffer->multi_deallocate(AllocationCount,&outputOffset,&outputSize,std::move(fence),&latchedConsumer.get()); } bool onAppTerminated() override From 56f855debea003f6ef80a55bd2a8ec5b6975226e Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 24 Jan 2024 23:12:21 -0300 Subject: [PATCH 4/9] Work on property pool example --- 66_PropertyPools/main.cpp | 183 +++++++++++++++++++++++++++++++++++--- 1 file changed, 172 insertions(+), 11 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index e59f6385a..e1ab9d7b3 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -3,13 +3,147 @@ // For conditions of distribution and use, see copyright notice in nabla.h -// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. 
-#include "../common/MonoDeviceApplication.hpp" +#include "nbl/video/surface/CSurfaceVulkan.h" + +#include "../common/BasicMultiQueueApplication.hpp" #include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" +namespace nbl::examples +{ + +using namespace nbl; +using namespace core; +using namespace system; +using namespace ui; +using namespace asset; +using namespace video; + +// Virtual Inheritance because apps might end up doing diamond inheritance +class WindowedApplication : public virtual BasicMultiQueueApplication +{ + using base_t = BasicMultiQueueApplication; + + public: + using base_t::base_t; + + virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override + { + auto retval = base_t::getAPIFeaturesToEnable(); + // We only support one swapchain mode, surface, the other one is Display which we have not implemented yet. + retval.swapchainMode = video::E_SWAPCHAIN_MODE::ESM_SURFACE; + return retval; + } + + // New function, we neeed to know about surfaces to create ahead of time + virtual core::vector getSurfaces() const = 0; + + virtual core::set filterDevices(const core::SRange& physicalDevices) const + { + const auto firstFilter = base_t::filterDevices(physicalDevices); + + video::SPhysicalDeviceFilter deviceFilter = {}; + + const auto surfaces = getSurfaces(); + deviceFilter.requiredSurfaceCompatibilities = surfaces.data(); + deviceFilter.requiredSurfaceCompatibilitiesCount = surfaces.size(); + + return deviceFilter(physicalDevices); + } + + virtual bool onAppInitialized(smart_refctd_ptr&& system) + { + // Remember to call the base class initialization! + if (!base_t::onAppInitialized(std::move(system))) + return false; + + #ifdef _NBL_PLATFORM_WINDOWS_ + m_winMgr = nbl::ui::IWindowManagerWin32::create(); + #else + #error "Unimplemented!" + #endif + } + + core::smart_refctd_ptr m_winMgr; +}; + + +// Before we get onto creating a window, we need to discuss how Nabla handles input, clipboards and cursor control +class IWindowClosedCallback : public virtual nbl::ui::IWindow::IEventCallback +{ + public: + IWindowClosedCallback() : m_gotWindowClosedMsg(false) {} + + // unless you create a separate callback per window, both will "trip" this condition + bool windowGotClosed() const {return m_gotWindowClosedMsg;} + + private: + bool onWindowClosed_impl() override + { + m_gotWindowClosedMsg = true; + return true; + } + + bool m_gotWindowClosedMsg; +}; + +// We inherit from an application that tries to find Graphics and Compute queues +// because applications with presentable images often want to perform Graphics family operations +// Virtual Inheritance because apps might end up doing diamond inheritance +class SingleNonResizableWindowApplication : public virtual WindowedApplication +{ + using base_t = WindowedApplication; + + protected: + virtual IWindow::SCreationParams getWindowCreationParams() const + { + IWindow::SCreationParams params = {}; + params.callback = make_smart_refctd_ptr(); + params.width = 640; + params.height = 480; + params.x = 32; + params.y = 32; + params.flags = IWindow::ECF_NONE; + params.windowCaption = "SingleNonResizableWindowApplication"; + return params; + } + + core::smart_refctd_ptr m_window; + core::smart_refctd_ptr m_surface; + + public: + using base_t::base_t; + + virtual bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! 
+ if (!base_t::onAppInitialized(std::move(system))) + return false; + + m_window = m_winMgr->createWindow(getWindowCreationParams()); + m_surface = video::CSurfaceVulkanWin32::create(core::smart_refctd_ptr(m_api),core::smart_refctd_ptr_static_cast(m_window)); + return true; + } + + virtual core::vector getSurfaces() const + { + return {{m_surface.get()/*,EQF_NONE*/}}; + } + + virtual bool keepRunning() override + { + if (!m_window || reinterpret_cast(m_window->getEventCallback())->windowGotClosed()) + return false; + + return true; + } +}; +} + + using namespace nbl; using namespace core; using namespace system; +using namespace ui; using namespace asset; using namespace video; @@ -19,7 +153,7 @@ using namespace video; // In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants -class PropertyPoolsApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = examples::MonoDeviceApplication; using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; @@ -98,23 +232,29 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); - auto createBuffer = [&](uint64_t size) + auto createBuffer = [&](uint64_t size, core::bitflag flags, const char* name, bool hostVisible) { video::IGPUBuffer::SCreationParams creationParams; - creationParams.size = size; - creationParams.usage = core::bitflag(asset::IBuffer::EUF_STORAGE_BUFFER_BIT) | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; + creationParams.size = ((size + 3) / 4) * 4; // Align + creationParams.usage = flags + | asset::IBuffer::EUF_STORAGE_BUFFER_BIT + | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT + | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; auto buffer = m_device->createBuffer(std::move(creationParams)); nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); + if (hostVisible) + reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + buffer->setObjectDebugName(name); return buffer; }; - m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount); - m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer); - m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); - m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); + m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", false); + m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_addressBuffer", false); + m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_transferSrcBuffer", false); + m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, 
core::bitflag(asset::IBuffer::EUF_NONE), "m_transferDstBuffer", true); for (uint16_t i = 0; i < uint16_t((uint32_t(1) << 16) - 1); i++) m_data.push_back(i); @@ -211,7 +351,12 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex cmdbuf->bindComputePipeline(m_pipeline.get()); // COMMAND RECORDING - cmdbuf->updateBuffer(m_transferSrcBuffer.get(), 0, sizeof(uint16_t) * m_data.size(), &m_data[0]); + uint32_t dataSize = (((sizeof(uint16_t) * m_data.size()) + 3) / 4) * 4; + uint32_t maxUpload = 65536; + for (uint32_t offset = 0; offset < dataSize; offset += maxUpload) + { + cmdbuf->updateBuffer(m_transferSrcBuffer.get(), offset, maxUpload, &m_data[offset / sizeof(uint16_t)]); + } CPropertyPoolHandler::TransferRequest transferRequest; transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; transferRequest.elementSize = m_data.size(); @@ -239,6 +384,22 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex queue->submit(1u,&submitInfo,fence.get()); queue->endCapture(); } + + { + // Readback ds + auto mem = m_transferDstBuffer->getBoundMemory(); + assert(mem->isMappable()); + auto ptr = m_device->mapMemory(nbl::video::IDeviceMemoryAllocation::MappedMemoryRange(mem, 0, mem->getAllocationSize()), video::IDeviceMemoryAllocation::EMCAF_READ); + auto uint16_t_ptr = static_cast(ptr); + + for (uint32_t i = 0; i < 128; i++) + { + uint16_t value = uint16_t_ptr[i]; + std::printf("%i, ", value); + } + std::printf("\n"); + m_device->unmapMemory(mem); + } // We can also actually latch our Command Pool reset and its return to the pool of free pools! m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx); From bbfcd9cfb88a451e4d11a394fde20549319faa76 Mon Sep 17 00:00:00 2001 From: Eichenherz Date: Sat, 17 Feb 2024 14:51:23 +0200 Subject: [PATCH 5/9] Added app_resources and orgnaized files --- 26_CentralLimitBoxBlur/CMakeLists.txt | 19 ++ .../app_resources/compute.hlsl | 31 ++ .../app_resources/descriptors.hlsl | 45 +++ 26_CentralLimitBoxBlur/main.cpp | 312 ++++++++++++++++++ CMakeLists.txt | 2 +- 5 files changed, 408 insertions(+), 1 deletion(-) create mode 100644 26_CentralLimitBoxBlur/CMakeLists.txt create mode 100644 26_CentralLimitBoxBlur/app_resources/compute.hlsl create mode 100644 26_CentralLimitBoxBlur/app_resources/descriptors.hlsl create mode 100644 26_CentralLimitBoxBlur/main.cpp diff --git a/26_CentralLimitBoxBlur/CMakeLists.txt b/26_CentralLimitBoxBlur/CMakeLists.txt new file mode 100644 index 000000000..bd3146859 --- /dev/null +++ b/26_CentralLimitBoxBlur/CMakeLists.txt @@ -0,0 +1,19 @@ +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" 
"${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/26_CentralLimitBoxBlur/app_resources/compute.hlsl b/26_CentralLimitBoxBlur/app_resources/compute.hlsl new file mode 100644 index 000000000..2f077ecfb --- /dev/null +++ b/26_CentralLimitBoxBlur/app_resources/compute.hlsl @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#pragma shader_stage(compute) + +#include "common.hlsl" +#include "box_blur.hlsl" + +[[vk::push_constant]] +BBoxBlurParams boxBlurParams; + +[numthreads( DefaultWorkgroupSize, 1, 1 )] +void main( uint3 invocationID : SV_DispatchThreadID ) +{ + uint32_t direction = boxBlurParams.getDirection(); + uint32_t wrapMode = boxBlurParams.getWrapMode(); + nbl::hlsl::float32_t4 borderColor = float32_t4(1.f, 0.f, 1.f, 1.f); + if( boxBlurParams.getWrapMode() == WRAP_MODE_CLAMP_TO_BORDER ) + { + borderColor = boxBlurParams.getBorderColor(); + } + + BufferAccessor textureAccessor = BufferAccessorCtor( boxBlurParams.inputDimensions, boxBlurParams.inputStrides, + boxBlurParams.outputStrides ); + + for( uint32_t ch = 0; ch < boxBlurParams.getChannelCount(); ++ch ) + { + BoxBlur( ch, direction, boxBlurParams.radius, wrapMode, borderColor, textureAccessor ); + } +} diff --git a/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl b/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl new file mode 100644 index 000000000..851f4659c --- /dev/null +++ b/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl @@ -0,0 +1,45 @@ +#include "common.hlsl" + +[[vk::binding( 0, 0 )]] Buffer input; +[[vk::binding( 1, 0 )]] RWBuffer output; + + +// TODO: figure the proper way to do templated BufferAccessors +struct BufferAccessor +{ + uint32_t3 dimension; + uint32_t inputStride; + uint32_t outputStride; + //uint32_t channelCount; + + nbl::hlsl::float32_t getPaddedData( const uint32_t3 coordinate, const uint32_t channel ) + { + uint32_t stridedIdx = dot( uint32_t4( coordinate, channel ), inputStride ); + + float data = 0.f; + if( all( coordinate < dimension ) ) + { + data = input[ stridedIdx ]; + } + + return data; + } + + void setData( const uint32_t3 coordinate, const uint32_t channel, const float32_t val ) + { + if( all( coordinate < dimension ) ) + { + uint32_t strided_idx = dot( uint32_t4( coordinate, channel ), outputStride ); + output[ strided_idx ] = val; + } + } +}; + +BufferAccessor BufferAccessorCtor( uint32_t3 dimension, uint32_t inputStride, uint32_t outputStride ) +{ + BufferAccessor ba; + ba.dimension = dimension; + ba.inputStride = inputStride; + ba.outputStride = outputStride; + return ba; +} diff --git a/26_CentralLimitBoxBlur/main.cpp b/26_CentralLimitBoxBlur/main.cpp new file mode 100644 index 000000000..d0bf418eb --- /dev/null +++ b/26_CentralLimitBoxBlur/main.cpp @@ -0,0 +1,312 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + + +// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. 
+#include "../common/MonoSystemMonoLoggerApplication.hpp" +#include "common.hlsl" + +using namespace nbl; +using namespace core; +using namespace system; +using namespace asset; +using namespace video; + + +// this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play "nice" wil all platforms +class HelloComputeApp final : public nbl::examples::MonoSystemMonoLoggerApplication +{ + using base_t = examples::MonoSystemMonoLoggerApplication; +public: + // Generally speaking because certain platforms delay initialization from main object construction you should just forward and not do anything in the ctor + using base_t::base_t; + + // we stuff all our work here because its a "single shot" app + bool onAppInitialized( smart_refctd_ptr&& system ) override + { + // Remember to call the base class initialization! + if( !base_t::onAppInitialized( std::move( system ) ) ) + return false; + // `system` could have been null (see the comments in `MonoSystemMonoLoggerApplication::onAppInitialized` as for why) + // use `MonoSystemMonoLoggerApplication::m_system` throughout the example instead! + + // You should already know Vulkan and come here to save on the boilerplate, if you don't know what instances and instance extensions are, then find out. + smart_refctd_ptr api; + { + // You generally want to default initialize any parameter structs + nbl::video::IAPIConnection::SFeatures apiFeaturesToEnable = {}; + // generally you want to make your life easier during development + apiFeaturesToEnable.validations = true; + apiFeaturesToEnable.synchronizationValidation = true; + // want to make sure we have this so we can name resources for vieweing in RenderDoc captures + apiFeaturesToEnable.debugUtils = true; + // create our Vulkan instance + if( !( api = CVulkanConnection::create( smart_refctd_ptr( m_system ), 0, _NBL_APP_NAME_, smart_refctd_ptr( base_t::m_logger ), apiFeaturesToEnable ) ) ) + return logFail( "Failed to crate an IAPIConnection!" ); + } + + // We won't go deep into performing physical device selection in this example, we'll take any device with a compute queue. + // Nabla has its own set of required baseline Vulkan features anyway, it won't report any device that doesn't meet them. + nbl::video::IPhysicalDevice* physDev = nullptr; + ILogicalDevice::SCreationParams params = {}; + // we will only deal with a single queue in this example + params.queueParamsCount = 1; + params.queueParams[ 0 ].count = 1; + for( auto physDevIt = api->getPhysicalDevices().begin(); physDevIt != api->getPhysicalDevices().end(); physDevIt++ ) + { + const auto familyProps = ( *physDevIt )->getQueueFamilyProperties(); + // this is the only "complicated" part, we want to create a queue that supports compute pipelines + for( auto i = 0; i < familyProps.size(); i++ ) + if( familyProps[ i ].queueFlags.hasFlags( IQueue::FAMILY_FLAGS::COMPUTE_BIT ) ) + { + physDev = *physDevIt; + params.queueParams[ 0 ].familyIndex = i; + break; + } + } + if( !physDev ) + return logFail( "Failed to find any Physical Devices with Compute capable Queue Families!" ); + + // logical devices need to be created form physical devices which will actually let us create vulkan objects and use the physical device + smart_refctd_ptr device = physDev->createLogicalDevice( std::move( params ) ); + if( !device ) + return logFail( "Failed to create a Logical Device!" 
);
+
+		constexpr uint32_t WorkgroupSize = 256;
+		constexpr uint32_t WorkgroupCount = 2048;
+		// A word about `nbl::asset::IAsset`s, whenever you see an `nbl::asset::ICPUSomething` you can be sure an `nbl::video::IGPUSomething` exists, and they both inherit from `nbl::asset::ISomething`.
+		// The convention is that an `ICPU` object represents a potentially Mutable (and in the past, Serializable) recipe for creating an `IGPU` object, and later examples will show automated systems for doing that.
+		// The Assets always form a Directed Acyclic Graph and our type system enforces that property at compile time (i.e. an `IBuffer` cannot reference an `IImageView` even indirectly).
+		// Another reason for the 1:1 pairing of types is that one can use a CPU-to-GPU associative cache (asset manager has a default one) and use the pointers to the CPU objects as UUIDs.
+		// The ICPUShader is just a mutable container for source code (can be high level like HLSL needing compilation to SPIR-V or SPIR-V itself) held in an `nbl::asset::ICPUBuffer`.
+		// They can be created: from buffers of code, by compilation from some other source code, or loaded from files (next example will do that).
+		smart_refctd_ptr<ICPUShader> cpuShader;
+		{
+			// Normally we'd use the ISystem and the IAssetManager to load shaders flexibly from (virtual) files for ease of development (syntax highlighting and Intellisense),
+			// but I want to show the full process of assembling a shader from raw source code at least once.
+			smart_refctd_ptr<CHLSLCompiler> compiler = make_smart_refctd_ptr<CHLSLCompiler>( smart_refctd_ptr( m_system ) );
+
+			// A simple shader that writes out the Global Invocation Index to the position it corresponds to in the buffer
+			// Note the injection of a define from C++ to keep the workgroup size in sync.
+			// P.S. We don't have an entry point name compiler option because we expect that future compilers should support multiple entry points, so for now there must be a single entry point called "main".
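			// Sidenote: the `#pragma wave shader_stage(compute)` inside the source below is what lets us leave `options.stage` as `ESS_UNKNOWN` further down and still get a compute shader out of the compiler.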
			constexpr const char* source = R"===(
				#pragma wave shader_stage(compute)

				[[vk::binding(0,0)]] RWStructuredBuffer<uint32_t> buff;

				[numthreads(WORKGROUP_SIZE,1,1)]
				void main(uint32_t3 ID : SV_DispatchThreadID)
				{
					buff[ID.x] = ID.x;
				}
			)===";

			// Yes we know workgroup sizes can come from specialization constants, however DXC has a problem with that https://github.com/microsoft/DirectXShaderCompiler/issues/3092
			const string WorkgroupSizeAsStr = std::to_string( WorkgroupSize );
			const IShaderCompiler::SPreprocessorOptions::SMacroDefinition WorkgroupSizeDefine = { "WORKGROUP_SIZE",WorkgroupSizeAsStr };

			CHLSLCompiler::SOptions options = {};
			// really we should set it to `ESS_COMPUTE` since we know, but we'll test the `#pragma` handling fur teh lulz
			options.stage = asset::IShader::E_SHADER_STAGE::ESS_UNKNOWN;
			// want as much debug as possible
			options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
			// this lets you source-level debug/step shaders in renderdoc
			if( physDev->getLimits().shaderNonSemanticInfo )
				options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_NON_SEMANTIC_BIT;
			// if you don't set the logger and source identifier you'll have no meaningful errors
			options.preprocessorOptions.sourceIdentifier = "embedded.comp.hlsl";
			options.preprocessorOptions.logger = m_logger.get();
			options.preprocessorOptions.extraDefines = { &WorkgroupSizeDefine,&WorkgroupSizeDefine + 1 };
			if( !( cpuShader = compiler->compileToSPIRV( source, options ) ) )
				return logFail( "Failed to compile following HLSL Shader:\n%s\n", source );
		}

		// Note how each ILogicalDevice method takes a smart-pointer r-value, so that the GPU objects refcount their dependencies
		smart_refctd_ptr<IGPUShader> shader = device->createShader( cpuShader.get() );
		if( !shader )
			return logFail( "Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n" );

		// the simplest example would have used push constants and BDA, but RenderDoc's debugging of that sucks, so I'll demonstrate "classical" binding of buffers with descriptors
		nbl::video::IGPUDescriptorSetLayout::SBinding bindings[ 1 ] = {
			{
				.binding = 0,
				.type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, // now is not the time for descriptor indexing
				.stageFlags = IGPUShader::ESS_COMPUTE,
				.count = 1,
				.samplers = nullptr // irrelevant for a buffer
			}
		};
		smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout = device->createDescriptorSetLayout( bindings );
		if( !dsLayout )
			return logFail( "Failed to create a Descriptor Layout!\n" );

		// Nabla actually has facilities for SPIR-V Reflection and "guessing" pipeline layouts for a given SPIR-V which we'll cover in a different example
		smart_refctd_ptr<IGPUPipelineLayout> pplnLayout = device->createPipelineLayout( {}, smart_refctd_ptr( dsLayout ) );
		if( !pplnLayout )
			return logFail( "Failed to create a Pipeline Layout!\n" );

		// We use strong typing on the pipelines (Compute, Graphics, Mesh, RT), since there's no reason to polymorphically switch between different pipelines
		smart_refctd_ptr<IGPUComputePipeline> pipeline;
		{
			IGPUComputePipeline::SCreationParams params = {};
			params.layout = pplnLayout.get();
			// Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main")
			params.shader.entryPoint = "main";
			params.shader.shader = shader.get();
+            // we'll cover the specialization constant API in another example
+            if( !device->createComputePipelines( nullptr, { &params,1 }, &pipeline ) )
+                return logFail( "Failed to create pipelines (compile & link shaders)!\n" );
+        }
+
+        // Our Descriptor Sets track (refcount) resources written into them, so you can pretty much drop and forget whatever you write into them.
+        // A later Descriptor Indexing example will test that this tracking is also correct for Update-After-Bind Descriptor Set bindings too.
+        smart_refctd_ptr<IGPUDescriptorSet> ds;
+
+        // A `nbl::video::DeviceMemoryAllocator` is an interface to implement anything that can dish out free memory ranges to back an `nbl::video::IGPUBuffer` or an `nbl::video::IGPUImage`.
+        // The Logical Device itself implements the interface and behaves as the simplest allocator, it will create a new `nbl::video::IDeviceMemoryAllocation` every single time.
+        // We will cover allocators and suballocation in a later example.
+        nbl::video::IDeviceMemoryAllocator::SAllocation allocation = {};
+        {
+            constexpr size_t BufferSize = sizeof( uint32_t ) * WorkgroupSize * WorkgroupCount;
+
+            // Always default the creation parameters, there's a lot of extra stuff for DirectX/CUDA interop and slotting into external engines you don't usually care about.
+            nbl::video::IGPUBuffer::SCreationParams params = {};
+            params.size = BufferSize;
+            // While the usages on `ICPUBuffers` are mere hints to our automated CPU-to-GPU conversion systems which need to be patched up anyway,
+            // the usages on an `IGPUBuffer` are crucial to specify correctly.
+            params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+            smart_refctd_ptr<IGPUBuffer> outputBuff = device->createBuffer( std::move( params ) );
+            if( !outputBuff )
+                return logFail( "Failed to create a GPU Buffer of size %d!\n", params.size );
+
+            // Naming objects is cool because not only errors (such as Vulkan Validation Layers) will show their names, but RenderDoc captures too.
+            outputBuff->setObjectDebugName( "My Output Buffer" );
+
+            // We don't want to bother explaining best staging buffer practices just yet, so we will create the buffer over
+            // a memory type that's Host Visible (can be mapped to give the CPU a direct pointer to read from)
+            nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs();
+            // you can simply constrain the memory requirements by AND-ing the type bits of the host visible memory types
+            reqs.memoryTypeBits &= physDev->getHostVisibleMemoryTypeBits();
+
+            // There are actually two `allocate` overloads, one which allocates memory if you already know the type you want,
+            // and this one, a utility which tries to allocate from every type that matches your requirements in some order of preference.
+            // The order of preference (iteration over compatible types) can be controlled by the method's template parameter,
+            // the default is from lowest index to highest, but skipping over incompatible types.
+            allocation = device->allocate( reqs, outputBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE );
+            if( !allocation.isValid() )
+                return logFail( "Failed to allocate Device Memory compatible with our GPU Buffer!\n" );
+
+            // Note that we performed a Dedicated Allocation above, so there's no need to bind the memory anymore (since the allocator knows the dedication, it can already bind).
+            // This is a carryover from having an OpenGL backend, where you couldn't have a memory allocation separate from the resource, so all allocations had to be "dedicated".
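+            // (Aside, a hedged variation on the host-visible masking above: for a GPU-only buffer you
+            // would mask with the device-local types instead - `getDeviceLocalMemoryTypeBits` appears
+            // later in this series - i.e.
+            //     reqs.memoryTypeBits &= physDev->getDeviceLocalMemoryTypeBits();
+            // the `allocate` call would stay exactly the same, you just couldn't map the result for reading.)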
+            // In Vulkan dedicated allocations are the most performant and still make sense as long as you won't blow the 4096 allocation limit on Windows.
+            // You should always use dedicated allocations for images used for swapchains, framebuffer attachments (esp transient), as well as objects used in CUDA/DirectX interop.
+            assert( outputBuff->getBoundMemory().memory == allocation.memory.get() );
+
+            // This is a cool utility you can use instead of counting up how much of each descriptor type you need in order to allocate N_i descriptor sets with layout L_i from a single pool
+            smart_refctd_ptr<IDescriptorPool> pool = device->createDescriptorPoolForDSLayouts( IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 } );
+
+            // note how the pool will go out of scope, but thanks to backreferences from each object to its parent/dependency it will be kept alive for as long as all the Sets it allocated
+            ds = pool->createDescriptorSet( std::move( dsLayout ) );
+            // we still use Vulkan 1.0 descriptor update style, could move to Update Templates but Descriptor Buffer ubiquity seems just around the corner
+            {
+                IGPUDescriptorSet::SDescriptorInfo info[ 1 ];
+                info[ 0 ].desc = smart_refctd_ptr( outputBuff ); // bad API, too late to change, should just take raw-pointers since not consumed
+                info[ 0 ].info.buffer = { .offset = 0,.size = BufferSize };
+                IGPUDescriptorSet::SWriteDescriptorSet writes[ 1 ] = {
+                    {.dstSet = ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info}
+                };
+                device->updateDescriptorSets( writes, {} );
+            }
+        }
+
+        // To be able to read the contents of the buffer we need to map its memory
+        // P.S. Nabla mandates Persistent Memory Mappings on all backends (but not coherent memory types)
+        auto ptr = allocation.memory->map( { 0ull,allocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ );
+        if( !ptr )
+            return logFail( "Failed to map the Device Memory!\n" );
+
+        // Our commandbuffers are cool because they refcount the resources used by each command you record into them, so you can rely on a commandbuffer to keep them alive.
+        smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+        {
+            smart_refctd_ptr<IGPUCommandPool> cmdpool = device->createCommandPool( params.queueParams[ 0 ].familyIndex, IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT );
+            if( !cmdpool->createCommandBuffers( IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf ) )
+                return logFail( "Failed to create Command Buffers!\n" );
+        }
+
+        cmdbuf->begin( IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT );
+        // If you enable the `debugUtils` API Connection feature on a supported backend as we've done, you'll get these pretty debug sections in RenderDoc
+        cmdbuf->beginDebugMarker( "My Compute Dispatch", core::vectorSIMDf( 0, 1, 0, 1 ) );
+        // you want to bind the pipeline first to avoid an accidental unbind of descriptor sets due to compatibility matching
+        cmdbuf->bindComputePipeline( pipeline.get() );
+        cmdbuf->bindDescriptorSets( nbl::asset::EPBP_COMPUTE, pplnLayout.get(), 0, 1, &ds.get() );
+        cmdbuf->dispatch( WorkgroupCount, 1, 1 );
+        cmdbuf->endDebugMarker();
+        // Normally you'd want to perform a memory barrier when using the output of a compute shader or renderpass,
+        // however waiting on a timeline semaphore (or fence) on the Host makes all Device writes visible.
+        cmdbuf->end();
+
+        // Only Timeline Semaphores are supported in Nabla, there are no fences or binary semaphores.
+        // Swapchains run on adaptors with empty submits that make them look like they work with Timeline Semaphores,
+        // which has important side-effects we'll cover in another example.
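+        // (A hedged sketch of the timeline idiom before we use it: a single counter-based semaphore
+        // replaces both fences and binary semaphores. Every submit signals a strictly greater value and
+        // any waiter, host or queue, names the value it cares about - the values below are hypothetical:
+        //     submit A signals value 1, submit B signals value 2;
+        //     const ISemaphore::SWaitInfo halfway[] = { {.semaphore = timeline.get(),.value = 1} };
+        //     device->blockForSemaphores( halfway ); // returns once A completes, even if B is still in flight
+        // )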
+        constexpr auto StartedValue = 0;
+        constexpr auto FinishedValue = 45;
+        static_assert( FinishedValue > StartedValue );
+        smart_refctd_ptr<ISemaphore> progress = device->createSemaphore( StartedValue );
+        {
+            // queues are inherent parts of the device, ergo not refcounted (you refcount the device instead)
+            IQueue* queue = device->getQueue( params.queueParams[ 0 ].familyIndex, 0 );
+
+            // Default, we have no semaphores to wait on before we can start our workload
+            IQueue::SSubmitInfo submitInfos[ 1 ] = {};
+            // The IGPUCommandBuffer is the only object whose usage does not get automagically tracked internally, you're responsible for holding onto it as long as the GPU needs it.
+            // This is why our commandbuffer, even though it's transient, lives in a scope equal to or above the place where we wait for the submission to be signalled as complete.
+            const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = cmdbuf.get()} };
+            submitInfos[ 0 ].commandBuffers = cmdbufs;
+            // But we do need to signal completion by incrementing the Timeline Semaphore counter as soon as the compute shader is done
+            const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = progress.get(),.value = FinishedValue,.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} };
+            submitInfos[ 0 ].signalSemaphores = signals;
+
+            // We have a cool integration with RenderDoc that allows you to start and end captures programmatically.
+            // This is super useful for debugging multi-queue workloads, and by default RenderDoc delimits captures only by Swapchain presents.
+            queue->startCapture();
+            queue->submit( submitInfos );
+            queue->endCapture();
+        }
+        // As the name implies this function will not progress until the semaphore signals or repeated waiting returns an error.
+        const ISemaphore::SWaitInfo waitInfos[] = { {
+            .semaphore = progress.get(),
+            .value = FinishedValue
+        } };
+        device->blockForSemaphores( waitInfos );
+
+        // You don't need to do this, but putting it here to demonstrate that it's safe to drop a commandbuffer after the GPU is done (try moving it above and see if you BSOD or just get a validation error).
+        cmdbuf = nullptr;
+
+        // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches
+        const ILogicalDevice::MappedMemoryRange memoryRange( allocation.memory.get(), 0ull, allocation.memory->getAllocationSize() );
+        if( !allocation.memory->getMemoryPropertyFlags().hasFlags( IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT ) )
+            device->invalidateMappedMemoryRanges( 1, &memoryRange );
+
+        // a simple test to check we got the right thing back
+        auto buffData = reinterpret_cast< const uint32_t* >( ptr );
+        for( auto i = 0; i < WorkgroupSize * WorkgroupCount; i++ )
+            if( buffData[ i ] != i )
+                return logFail( "DWORD at position %d doesn't match!\n", i );
+        // This allocation would unmap itself in the dtor anyway, but let's showcase the API usage
+        allocation.memory->unmap();
+
+        return true;
+    }
+
+    // Platforms like WASM expect the main entry point to periodically return control, hence if you want a cross-platform app, you have to let the framework deal with your "game loop"
+    void workLoopBody() override {}
+
+    // Whether to keep invoking the above. In this example, because it's headless GPU compute, we do all the work in the app initialization.
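+    // (A hedged sketch of how the framework presumably drives the two hooks above and below - the real
+    // dispatch lives inside `nbl::system::IApplicationFramework`, so treat this as an assumption:
+    //     while (app->keepRunning())
+    //         app->workLoopBody();
+    // returning `false` right away therefore turns this headless sample into a single-shot run.)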
+    bool keepRunning() override { return false; }
+
+};
+
+
+NBL_MAIN_FUNC( PropertyPoolsApp )
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 09a73bfe0..5b104b06a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,7 @@ if(NBL_BUILD_EXAMPLES)
 	endif()
 	add_subdirectory(23_ArithmeticUnitTest EXCLUDE_FROM_ALL)
 	# add_subdirectory(23_Autoexposure EXCLUDE_FROM_ALL)
-	# add_subdirectory(25_Blur EXCLUDE_FROM_ALL)
+	add_subdirectory(26_CentralLimitBoxBlur EXCLUDE_FROM_ALL)
 	add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL)
 	# add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL)

From ae2957922f450a105761a222ddf37b03939ece21 Mon Sep 17 00:00:00 2001
From: Eichenherz
Date: Wed, 21 Feb 2024 18:54:19 +0200
Subject: [PATCH 6/9] Renamed compute main

---
 .../app_resources/compute.hlsl     |  31 ----
 .../app_resources/descriptors.hlsl |   4 +-
 .../app_resources/main.comp.hlsl   |  79 ++++++++++
 26_CentralLimitBoxBlur/main.cpp    | 149 ++++++------------
 4 files changed, 127 insertions(+), 136 deletions(-)
 delete mode 100644 26_CentralLimitBoxBlur/app_resources/compute.hlsl
 create mode 100644 26_CentralLimitBoxBlur/app_resources/main.comp.hlsl

diff --git a/26_CentralLimitBoxBlur/app_resources/compute.hlsl b/26_CentralLimitBoxBlur/app_resources/compute.hlsl
deleted file mode 100644
index 2f077ecfb..000000000
--- a/26_CentralLimitBoxBlur/app_resources/compute.hlsl
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#pragma shader_stage(compute)
-
-#include "common.hlsl"
-#include "box_blur.hlsl"
-
-[[vk::push_constant]]
-BBoxBlurParams boxBlurParams;
-
-[numthreads( DefaultWorkgroupSize, 1, 1 )]
-void main( uint3 invocationID : SV_DispatchThreadID )
-{
-    uint32_t direction = boxBlurParams.getDirection();
-    uint32_t wrapMode = boxBlurParams.getWrapMode();
-    nbl::hlsl::float32_t4 borderColor = float32_t4(1.f, 0.f, 1.f, 1.f);
-    if( boxBlurParams.getWrapMode() == WRAP_MODE_CLAMP_TO_BORDER )
-    {
-        borderColor = boxBlurParams.getBorderColor();
-    }
-
-    BufferAccessor textureAccessor = BufferAccessorCtor( boxBlurParams.inputDimensions, boxBlurParams.inputStrides,
-        boxBlurParams.outputStrides );
-
-    for( uint32_t ch = 0; ch < boxBlurParams.getChannelCount(); ++ch )
-    {
-        BoxBlur( ch, direction, boxBlurParams.radius, wrapMode, borderColor, textureAccessor );
-    }
-}
diff --git a/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl b/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl
index 851f4659c..8ac3649b0 100644
--- a/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl
+++ b/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl
@@ -1,4 +1,4 @@
-#include "common.hlsl"
+#include "nbl/builtin/hlsl/blur/common.hlsl"

 [[vk::binding( 0, 0 )]] Buffer input;
 [[vk::binding( 1, 0 )]] RWBuffer output;
@@ -21,7 +21,7 @@ struct BufferAccessor
 	{
 		data = input[ stridedIdx ];
 	}
-
+
 	return data;
 }

diff --git a/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl b/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl
new file mode 100644
index 000000000..2a01d7749
--- /dev/null
+++ b/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl
@@ -0,0 +1,79 @@
+// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h + +#pragma shader_stage(compute) + +#include "nbl/builtin/hlsl/blur/common.hlsl" + +//#include "descriptors" +//////////////////////////// +[[vk::binding( 0, 0 )]] Buffer input; +[[vk::binding( 1, 0 )]] RWBuffer output; + + +// TODO: figure the proper way to do templated BufferAccessors +struct BufferAccessor +{ + uint32_t3 dimension; + uint32_t inputStride; + uint32_t outputStride; + //uint32_t channelCount; + + nbl::hlsl::float32_t getPaddedData( const uint32_t3 coordinate, const uint32_t channel ) + { + uint32_t stridedIdx = dot( uint32_t4( coordinate, channel ), inputStride ); + + float data = 0.f; + if( all( coordinate < dimension ) ) + { + data = input[ stridedIdx ]; + } + + return data; + } + + void setData( const uint32_t3 coordinate, const uint32_t channel, const float32_t val ) + { + if( all( coordinate < dimension ) ) + { + uint32_t strided_idx = dot( uint32_t4( coordinate, channel ), outputStride ); + output[ strided_idx ] = val; + } + } +}; + +BufferAccessor BufferAccessorCtor( uint32_t3 dimension, uint32_t inputStride, uint32_t outputStride ) +{ + BufferAccessor ba; + ba.dimension = dimension; + ba.inputStride = inputStride; + ba.outputStride = outputStride; + return ba; +} +//////////////////////////// + +#include "nbl/builtin/hlsl/blur/box_blur.hlsl" + +[[vk::push_constant]] +BoxBlurParams boxBlurParams; + +[numthreads( WORKGROUP_SIZE, 1, 1 )] +void main( uint3 invocationID : SV_DispatchThreadID ) +{ + uint32_t direction = boxBlurParams.getDirection(); + uint32_t wrapMode = boxBlurParams.getWrapMode(); + nbl::hlsl::float32_t4 borderColor = float32_t4(1.f, 0.f, 1.f, 1.f); + if( boxBlurParams.getWrapMode() == WRAP_MODE_CLAMP_TO_BORDER ) + { + borderColor = boxBlurParams.getBorderColor(); + } + + BufferAccessor textureAccessor = BufferAccessorCtor( + boxBlurParams.inputDimensions.xyz, boxBlurParams.inputStrides, boxBlurParams.outputStrides ); + + for( uint32_t ch = 0; ch < boxBlurParams.getChannelCount(); ++ch ) + { + BoxBlur( ch, direction, boxBlurParams.radius, wrapMode, borderColor, textureAccessor ); + } +} diff --git a/26_CentralLimitBoxBlur/main.cpp b/26_CentralLimitBoxBlur/main.cpp index d0bf418eb..e25d8cf53 100644 --- a/26_CentralLimitBoxBlur/main.cpp +++ b/26_CentralLimitBoxBlur/main.cpp @@ -4,8 +4,10 @@ // I've moved out a tiny part of this example into a shared header for reuse, please open and read it. 
-#include "../common/MonoSystemMonoLoggerApplication.hpp" -#include "common.hlsl" +#include "../common/MonoDeviceApplication.hpp" +#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +#include "CArchive.h" using namespace nbl; using namespace core; @@ -13,121 +15,62 @@ using namespace system; using namespace asset; using namespace video; +#define _NBL_PLATFORM_WINDOWS_ -// this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play "nice" wil all platforms -class HelloComputeApp final : public nbl::examples::MonoSystemMonoLoggerApplication +class BoxBlurDemo final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication { - using base_t = examples::MonoSystemMonoLoggerApplication; -public: - // Generally speaking because certain platforms delay initialization from main object construction you should just forward and not do anything in the ctor - using base_t::base_t; + using device_base_t = examples::MonoDeviceApplication; + using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; - // we stuff all our work here because its a "single shot" app +public: + BoxBlurDemo( const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD + ) : system::IApplicationFramework( _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD ) {} + bool onAppInitialized( smart_refctd_ptr&& system ) override { // Remember to call the base class initialization! - if( !base_t::onAppInitialized( std::move( system ) ) ) - return false; - // `system` could have been null (see the comments in `MonoSystemMonoLoggerApplication::onAppInitialized` as for why) - // use `MonoSystemMonoLoggerApplication::m_system` throughout the example instead! - - // You should already know Vulkan and come here to save on the boilerplate, if you don't know what instances and instance extensions are, then find out. - smart_refctd_ptr api; + if( !device_base_t::onAppInitialized( std::move( system ) ) ) { - // You generally want to default initialize any parameter structs - nbl::video::IAPIConnection::SFeatures apiFeaturesToEnable = {}; - // generally you want to make your life easier during development - apiFeaturesToEnable.validations = true; - apiFeaturesToEnable.synchronizationValidation = true; - // want to make sure we have this so we can name resources for vieweing in RenderDoc captures - apiFeaturesToEnable.debugUtils = true; - // create our Vulkan instance - if( !( api = CVulkanConnection::create( smart_refctd_ptr( m_system ), 0, _NBL_APP_NAME_, smart_refctd_ptr( base_t::m_logger ), apiFeaturesToEnable ) ) ) - return logFail( "Failed to crate an IAPIConnection!" ); + return false; } - - // We won't go deep into performing physical device selection in this example, we'll take any device with a compute queue. - // Nabla has its own set of required baseline Vulkan features anyway, it won't report any device that doesn't meet them. 
- nbl::video::IPhysicalDevice* physDev = nullptr; - ILogicalDevice::SCreationParams params = {}; - // we will only deal with a single queue in this example - params.queueParamsCount = 1; - params.queueParams[ 0 ].count = 1; - for( auto physDevIt = api->getPhysicalDevices().begin(); physDevIt != api->getPhysicalDevices().end(); physDevIt++ ) + if( !asset_base_t::onAppInitialized( std::move( system ) ) ) { - const auto familyProps = ( *physDevIt )->getQueueFamilyProperties(); - // this is the only "complicated" part, we want to create a queue that supports compute pipelines - for( auto i = 0; i < familyProps.size(); i++ ) - if( familyProps[ i ].queueFlags.hasFlags( IQueue::FAMILY_FLAGS::COMPUTE_BIT ) ) - { - physDev = *physDevIt; - params.queueParams[ 0 ].familyIndex = i; - break; - } + return false; } - if( !physDev ) - return logFail( "Failed to find any Physical Devices with Compute capable Queue Families!" ); - - // logical devices need to be created form physical devices which will actually let us create vulkan objects and use the physical device - smart_refctd_ptr device = physDev->createLogicalDevice( std::move( params ) ); - if( !device ) - return logFail( "Failed to create a Logical Device!" ); constexpr uint32_t WorkgroupSize = 256; constexpr uint32_t WorkgroupCount = 2048; - // A word about `nbl::asset::IAsset`s, whenever you see an `nbl::asset::ICPUSomething` you can be sure an `nbl::video::IGPUSomething exists, and they both inherit from `nbl::asset::ISomething`. - // The convention is that an `ICPU` object represents a potentially Mutable (and in the past, Serializable) recipe for creating an `IGPU` object, and later examples will show automated systems for doing that. - // The Assets always form a Directed Acyclic Graph and our type system enforces that property at compile time (i.e. an `IBuffer` cannot reference an `IImageView` even indirectly). - // Another reason for the 1:1 pairing of types is that one can use a CPU-to-GPU associative cache (asset manager has a default one) and use the pointers to the CPU objects as UUIDs. - // The ICPUShader is just a mutable container for source code (can be high level like HLSL needing compilation to SPIR-V or SPIR-V itself) held in an `nbl::asset::ICPUBuffer`. - // They can be created: from buffers of code, by compilation from some other source code, or loaded from files (next example will do that). - smart_refctd_ptr cpuShader; - { - // Normally we'd use the ISystem and the IAssetManager to load shaders flexibly from (virtual) files for ease of development (syntax highlighting and Intellisense), - // but I want to show the full process of assembling a shader from raw source code at least once. - smart_refctd_ptr compiler = make_smart_refctd_ptr( smart_refctd_ptr( m_system ) ); - - // A simple shader that writes out the Global Invocation Index to the position it corresponds to in the buffer - // Note the injection of a define from C++ to keep the workgroup size in sync. - // P.S. We don't have an entry point name compiler option because we expect that future compilers should support multiple entry points, so for now there must be a single entry point called "main". 
-            constexpr const char* source = R"===(
-                #pragma wave shader_stage(compute)
-
-                [[vk::binding(0,0)]] RWStructuredBuffer<uint32_t> buff;
-
-                [numthreads(WORKGROUP_SIZE,1,1)]
-                void main(uint32_t3 ID : SV_DispatchThreadID)
-                {
-                    buff[ID.x] = ID.x;
-                }
-            )===";
-
-            // Yes we know workgroup sizes can come from specialization constants, however DXC has a problem with that https://github.com/microsoft/DirectXShaderCompiler/issues/3092
-            const string WorkgroupSizeAsStr = std::to_string( WorkgroupSize );
-            const IShaderCompiler::SPreprocessorOptions::SMacroDefinition WorkgroupSizeDefine = { "WORKGROUP_SIZE",WorkgroupSizeAsStr };
-
-            CHLSLCompiler::SOptions options = {};
-            // really we should set it to `ESS_COMPUTE` since we know, but we'll test the `#pragma` handling fur teh lulz
-            options.stage = asset::IShader::E_SHADER_STAGE::ESS_UNKNOWN;
-            // want as much debug as possible
-            options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
-            // this lets you source-level debug/step shaders in renderdoc
-            if( physDev->getLimits().shaderNonSemanticInfo )
-                options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_NON_SEMANTIC_BIT;
-            // if you don't set the logger and source identifier you'll have no meaningful errors
-            options.preprocessorOptions.sourceIdentifier = "embedded.comp.hlsl";
-            options.preprocessorOptions.logger = m_logger.get();
-            options.preprocessorOptions.extraDefines = { &WorkgroupSizeDefine,&WorkgroupSizeDefine + 1 };
-            if( !( cpuShader = compiler->compileToSPIRV( source, options ) ) )
-                return logFail( "Failed to compile following HLSL Shader:\n%s\n", source );
-        }
-        // Note how each ILogicalDevice method takes a smart-pointer r-value, so that the GPU objects refcount their dependencies
-        smart_refctd_ptr<IGPUShader> shader = device->createShader( cpuShader.get() );
+
+        // load shader source from file
+        auto getShaderSource = [ & ]( const char* filePath ) -> auto
+        {
+            IAssetLoader::SAssetLoadParams lparams = {};
+            lparams.logger = m_logger.get();
+            lparams.workingDirectory = "";
+            auto bundle = m_assetMgr->getAsset( filePath, lparams );
+            if( bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER )
+            {
+                m_logger->log( "Shader %s not found!", ILogger::ELL_ERROR, filePath );
+                exit( -1 );
+            }
+            auto firstAssetInBundle = bundle.getContents()[ 0 ];
+            return smart_refctd_ptr_static_cast< ICPUShader >( firstAssetInBundle );
+        };
+        auto computeMain = getShaderSource( "app_resources/main.comp.hlsl" );
+
+        smart_refctd_ptr<ICPUShader> overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
+            computeMain.get(),
+            "#define WORKGROUP_SIZE %s\n#define PASSES_PER_AXIS %d\n#define AXIS_DIM %d\n",
+            std::to_string( WorkgroupSize ).c_str(), 3, 4
+        );
+        smart_refctd_ptr<IGPUShader> shader = m_device->createShader( overridenUnspecialized.get() );
         if( !shader )
-            return logFail( "Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n" );
+        {
+            return logFail( "Creation of a GPU Shader from CPU Shader source failed!"
); + } - // the simplest example would have used push constants and BDA, but RenderDoc's debugging of that sucks, so I'll demonstrate "classical" binding of buffers with descriptors + /*// the simplest example would have used push constants and BDA, but RenderDoc's debugging of that sucks, so I'll demonstrate "classical" binding of buffers with descriptors nbl::video::IGPUDescriptorSetLayout::SBinding bindings[ 1 ] = { { .binding = 0, @@ -297,7 +240,7 @@ class HelloComputeApp final : public nbl::examples::MonoSystemMonoLoggerApplicat // This allocation would unmap itself in the dtor anyway, but lets showcase the API usage allocation.memory->unmap(); - return true; + return true;*/ } // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" @@ -309,4 +252,4 @@ class HelloComputeApp final : public nbl::examples::MonoSystemMonoLoggerApplicat }; -NBL_MAIN_FUNC( HelloComputeApp ) \ No newline at end of file +NBL_MAIN_FUNC( BoxBlurDemo ) \ No newline at end of file From dc83f3b02a02e3b080c8d214be89b738bc67ac45 Mon Sep 17 00:00:00 2001 From: Eichenherz Date: Tue, 27 Feb 2024 10:17:27 +0200 Subject: [PATCH 7/9] Corrections --- 26_CentralLimitBoxBlur/app_resources/main.comp.hlsl | 10 +++++----- common/MonoDeviceApplication.hpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl b/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl index 2a01d7749..3f7bd4dfb 100644 --- a/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl +++ b/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl @@ -15,9 +15,9 @@ // TODO: figure the proper way to do templated BufferAccessors struct BufferAccessor { + uint32_t4 inputStride; + uint32_t4 outputStride; uint32_t3 dimension; - uint32_t inputStride; - uint32_t outputStride; //uint32_t channelCount; nbl::hlsl::float32_t getPaddedData( const uint32_t3 coordinate, const uint32_t channel ) @@ -33,7 +33,7 @@ struct BufferAccessor return data; } - void setData( const uint32_t3 coordinate, const uint32_t channel, const float32_t val ) + void setData( const uint32_t3 coordinate, const uint32_t channel, NBL_CONST_REF_ARG(float32_t) val ) { if( all( coordinate < dimension ) ) { @@ -43,7 +43,7 @@ struct BufferAccessor } }; -BufferAccessor BufferAccessorCtor( uint32_t3 dimension, uint32_t inputStride, uint32_t outputStride ) +BufferAccessor BufferAccessorCtor( uint32_t4 inputStride, uint32_t4 outputStride, uint32_t3 dimension ) { BufferAccessor ba; ba.dimension = dimension; @@ -70,7 +70,7 @@ void main( uint3 invocationID : SV_DispatchThreadID ) } BufferAccessor textureAccessor = BufferAccessorCtor( - boxBlurParams.inputDimensions.xyz, boxBlurParams.inputStrides, boxBlurParams.outputStrides ); + boxBlurParams.inputStrides, boxBlurParams.outputStrides, boxBlurParams.inputDimensions.xyz ); for( uint32_t ch = 0; ch < boxBlurParams.getChannelCount(); ++ch ) { diff --git a/common/MonoDeviceApplication.hpp b/common/MonoDeviceApplication.hpp index b77e3442c..6a4911da7 100644 --- a/common/MonoDeviceApplication.hpp +++ b/common/MonoDeviceApplication.hpp @@ -18,7 +18,7 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication public: using base_t::base_t; - protected: + public: // need this one for skipping passing all args into ApplicationFramework MonoDeviceApplication() = default; From 02269fe2bed31c7d3cbe995889a9e5176f75e79d Mon Sep 17 00:00:00 2001 From: Eichenherz Date: Thu, 29 Feb 
2024 19:46:47 +0200 Subject: [PATCH 8/9] Separate descriptors file. Example runs successfully, wrong "output" --- .../app_resources/descriptors.hlsl | 24 +- .../app_resources/main.comp.hlsl | 48 +-- 26_CentralLimitBoxBlur/main.cpp | 356 +++++++++++------- 3 files changed, 233 insertions(+), 195 deletions(-) diff --git a/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl b/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl index 8ac3649b0..a2ed80aba 100644 --- a/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl +++ b/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl @@ -1,41 +1,41 @@ #include "nbl/builtin/hlsl/blur/common.hlsl" -[[vk::binding( 0, 0 )]] Buffer input; -[[vk::binding( 1, 0 )]] RWBuffer output; +[[vk::binding( 0, 0 )]] Texture2D input; +[[vk::binding( 1, 0 )]] RWTexture2D output; // TODO: figure the proper way to do templated BufferAccessors struct BufferAccessor { + uint32_t4 inputStride; + uint32_t4 outputStride; uint32_t3 dimension; - uint32_t inputStride; - uint32_t outputStride; //uint32_t channelCount; - + // mod image width x div image width y nbl::hlsl::float32_t getPaddedData( const uint32_t3 coordinate, const uint32_t channel ) { - uint32_t stridedIdx = dot( uint32_t4( coordinate, channel ), inputStride ); - float data = 0.f; if( all( coordinate < dimension ) ) { - data = input[ stridedIdx ]; + uint32_t stridedIdx = dot( uint32_t4( coordinate, channel ), inputStride );// NOT CORRECT + //uint32_t2 idx = stridedIdx % + //data = input[ stridedIdx ]; } return data; } - void setData( const uint32_t3 coordinate, const uint32_t channel, const float32_t val ) + void setData( const uint32_t3 coordinate, const uint32_t channel, NBL_CONST_REF_ARG( float32_t ) val ) { if( all( coordinate < dimension ) ) { - uint32_t strided_idx = dot( uint32_t4( coordinate, channel ), outputStride ); - output[ strided_idx ] = val; + uint32_t stridedIdx = dot( uint32_t4( coordinate, channel ), outputStride ); // NOT CORRECT + //output[ stridedIdx ] = val; } } }; -BufferAccessor BufferAccessorCtor( uint32_t3 dimension, uint32_t inputStride, uint32_t outputStride ) +BufferAccessor BufferAccessorCtor( uint32_t4 inputStride, uint32_t4 outputStride, uint32_t3 dimension ) { BufferAccessor ba; ba.dimension = dimension; diff --git a/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl b/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl index 3f7bd4dfb..f86cbf5d8 100644 --- a/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl +++ b/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl @@ -5,53 +5,7 @@ #pragma shader_stage(compute) #include "nbl/builtin/hlsl/blur/common.hlsl" - -//#include "descriptors" -//////////////////////////// -[[vk::binding( 0, 0 )]] Buffer input; -[[vk::binding( 1, 0 )]] RWBuffer output; - - -// TODO: figure the proper way to do templated BufferAccessors -struct BufferAccessor -{ - uint32_t4 inputStride; - uint32_t4 outputStride; - uint32_t3 dimension; - //uint32_t channelCount; - - nbl::hlsl::float32_t getPaddedData( const uint32_t3 coordinate, const uint32_t channel ) - { - uint32_t stridedIdx = dot( uint32_t4( coordinate, channel ), inputStride ); - - float data = 0.f; - if( all( coordinate < dimension ) ) - { - data = input[ stridedIdx ]; - } - - return data; - } - - void setData( const uint32_t3 coordinate, const uint32_t channel, NBL_CONST_REF_ARG(float32_t) val ) - { - if( all( coordinate < dimension ) ) - { - uint32_t strided_idx = dot( uint32_t4( coordinate, channel ), outputStride ); - output[ strided_idx ] = val; - } - } -}; - -BufferAccessor 
BufferAccessorCtor( uint32_t4 inputStride, uint32_t4 outputStride, uint32_t3 dimension )
-{
-    BufferAccessor ba;
-    ba.dimension = dimension;
-    ba.inputStride = inputStride;
-    ba.outputStride = outputStride;
-    return ba;
-}
-////////////////////////////
+#include "descriptors.hlsl"

 #include "nbl/builtin/hlsl/blur/box_blur.hlsl"

diff --git a/26_CentralLimitBoxBlur/main.cpp b/26_CentralLimitBoxBlur/main.cpp
index e25d8cf53..f89fd09b8 100644
--- a/26_CentralLimitBoxBlur/main.cpp
+++ b/26_CentralLimitBoxBlur/main.cpp
@@ -7,6 +7,8 @@
 #include "../common/MonoDeviceApplication.hpp"
 #include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"

+#include
+
 #include "CArchive.h"

 using namespace nbl;
@@ -23,8 +25,13 @@ class BoxBlurDemo final : public examples::MonoDeviceApplication, public example
     using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;

 public:
-    BoxBlurDemo( const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD
-    ) : system::IApplicationFramework( _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD ) {}
+    BoxBlurDemo(
+        const path& _localInputCWD,
+        const path& _localOutputCWD,
+        const path& _sharedInputCWD,
+        const path& _sharedOutputCWD
+    ) : system::IApplicationFramework( _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD )
+    {}

     bool onAppInitialized( smart_refctd_ptr<ISystem>&& system ) override
     {
@@ -39,30 +46,105 @@ class BoxBlurDemo final : public examples::MonoDeviceApplication, public example
         }

         constexpr uint32_t WorkgroupSize = 256;
-        constexpr uint32_t WorkgroupCount = 2048;
+        constexpr uint32_t AxisDimension = 3;
+        constexpr uint32_t PassesPerAxis = 4;
+        constexpr uint32_t WorkgroupCount = 2048;

-        // load shader source from file
-        auto getShaderSource = [ & ]( const char* filePath ) -> auto
+        IAssetLoader::SAssetLoadParams lparams = {};
+        lparams.logger = m_logger.get();
+        lparams.workingDirectory = "";
+        auto checkedLoad = [ & ]<typename T>( const char* filePath ) -> smart_refctd_ptr<T>
         {
-            IAssetLoader::SAssetLoadParams lparams = {};
-            lparams.logger = m_logger.get();
-            lparams.workingDirectory = "";
-            auto bundle = m_assetMgr->getAsset( filePath, lparams );
-            if( bundle.getContents().empty() || bundle.getAssetType() != IAsset::ET_SHADER )
+            // The `IAssetManager::getAsset` function is very complex, in essence it:
+            // 1. takes a cache key or an IFile, if you gave it an `IFile` skip to step 3
+            // 2. it consults the loader override about how to get an `IFile` from your cache key
+            // 3. handles any failure in opening an `IFile` (which is why it takes a supposed filename), it allows the override to give a different file
+            // 4. tries to derive a working directory if you haven't provided one
+            // 5. looks for the assets in the cache if you haven't disabled that in the loader parameters
+            // 5a. lets the override choose relevant assets from the ones found under the cache key
+            // 5b. if nothing was found it lets the override intervene one last time
+            // 6. if there's no file to load from, return no assets
+            // 7. try all loaders associated with a file extension
+            // 8. then try all loaders by opening the file and checking if it will load
+            // 9. insert loaded assets into cache if required
+            // 10. restore assets from dummy state if needed (more on that in other examples)
+            // Take the docs with a grain of salt, the `getAsset` will be rewritten to deal with restores better in the near future.
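+            // (Usage note: because the lambda above takes an explicit template parameter, call sites
+            // have to name the asset type up front via the `.operator()<T>` spelling, e.g.
+            //     checkedLoad.operator()< nbl::asset::ICPUImage >( "app_resources/tex.jpg" )
+            // exactly as done further down for the texture and the compute shader.)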
+            nbl::asset::SAssetBundle bundle = m_assetMgr->getAsset( filePath, lparams );
+            if( bundle.getContents().empty() )
             {
-                m_logger->log( "Shader %s not found!", ILogger::ELL_ERROR, filePath );
-                exit( -1 );
+                m_logger->log( "Asset %s failed to load! Are you sure it exists?", ILogger::ELL_ERROR, filePath );
+                return nullptr;
             }
-            auto firstAssetInBundle = bundle.getContents()[ 0 ];
-            return smart_refctd_ptr_static_cast< ICPUShader >( firstAssetInBundle );
+            // All assets derive from `nbl::asset::IAsset`, and can be casted down if the type matches
+            static_assert( std::is_base_of_v< nbl::asset::IAsset, T > );
+            // The type of the root assets in the bundle is not known until runtime, so this is kinda like a `dynamic_cast` which will return nullptr on type mismatch
+            auto typedAsset = IAsset::castDown< T >( bundle.getContents()[ 0 ] ); // just grab the first asset in the bundle
+            if( !typedAsset )
+            {
+                m_logger->log( "Asset type mismatch, want %d got %d!", ILogger::ELL_ERROR, T::AssetType, bundle.getAssetType() );
+
+            }
+            return typedAsset;
+        };
+
+        auto textureToBlur = checkedLoad.operator()< nbl::asset::ICPUImage >( "app_resources/tex.jpg" );
+        const auto& inCpuTexInfo = textureToBlur->getCreationParameters();
+
+        auto createGPUImages = [ & ](
+            core::bitflag< IGPUImage::E_USAGE_FLAGS > usageFlags,
+            std::string_view name,
+            smart_refctd_ptr<IGPUImage>&& imgOut,
+            smart_refctd_ptr<IGPUImageView>&& imgViewOut
+        ) {
+            video::IGPUImage::SCreationParams gpuImageCreateInfo;
+            gpuImageCreateInfo.flags = inCpuTexInfo.flags;
+            gpuImageCreateInfo.type = inCpuTexInfo.type;
+            gpuImageCreateInfo.extent = inCpuTexInfo.extent;
+            gpuImageCreateInfo.mipLevels = inCpuTexInfo.mipLevels;
+            gpuImageCreateInfo.arrayLayers = inCpuTexInfo.arrayLayers;
+            gpuImageCreateInfo.samples = inCpuTexInfo.samples;
+            gpuImageCreateInfo.tiling = video::IGPUImage::TILING::OPTIMAL;
+            gpuImageCreateInfo.usage = usageFlags | asset::IImage::EUF_TRANSFER_DST_BIT;
+            gpuImageCreateInfo.queueFamilyIndexCount = 0u;
+            gpuImageCreateInfo.queueFamilyIndices = nullptr;
+
+            gpuImageCreateInfo.format = m_physicalDevice->promoteImageFormat(
+                { inCpuTexInfo.format, gpuImageCreateInfo.usage }, gpuImageCreateInfo.tiling
+            );
+            auto gpuImage = m_device->createImage( std::move( gpuImageCreateInfo ) );
+
+            auto gpuImageMemReqs = gpuImage->getMemoryReqs();
+            gpuImageMemReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
+            m_device->allocate( gpuImageMemReqs, gpuImage.get(), video::IDeviceMemoryAllocation::EMAF_NONE );
+
+            auto imgView = m_device->createImageView( {
+                .flags = IGPUImageView::ECF_NONE,
+                .subUsages = usageFlags,
+                .image = gpuImage,
+                .viewType = IGPUImageView::ET_2D,
+                .format = gpuImageCreateInfo.format
+            } );
+            gpuImage->setObjectDebugName( name.data() );
+            imgView->setObjectDebugName( ( std::string{ name } + "view" ).c_str() );
+            imgOut = gpuImage;
+            imgViewOut = imgView;
+        };
+
+        smart_refctd_ptr<IGPUImage> inputGpuImg;
+        smart_refctd_ptr<IGPUImage> outputGpuImg;
+        smart_refctd_ptr<IGPUImageView> inputGpuImgView;
+        smart_refctd_ptr<IGPUImageView> outputGpuImgView;
+        createGPUImages( IGPUImage::EUF_SAMPLED_BIT, "InputImg", std::move(inputGpuImg), std::move(inputGpuImgView));
+        createGPUImages( IGPUImage::EUF_STORAGE_BIT, "OutputImg", std::move(outputGpuImg), std::move(outputGpuImgView));
+
+
+        auto computeMain = checkedLoad.operator()< nbl::asset::ICPUShader >( "app_resources/main.comp.hlsl" );

         smart_refctd_ptr<ICPUShader> overridenUnspecialized = CHLSLCompiler::createOverridenCopy(
             computeMain.get(),
             "#define WORKGROUP_SIZE %s\n#define PASSES_PER_AXIS %d\n#define AXIS_DIM %d\n",
-            std::to_string( WorkgroupSize ).c_str(), 3, 4
+            std::to_string( WorkgroupSize ).c_str(), AxisDimension, PassesPerAxis
         );
         smart_refctd_ptr<IGPUShader> shader = m_device->createShader( overridenUnspecialized.get() );
         if( !shader )
@@ -70,146 +152,166 @@ class BoxBlurDemo final : public examples::MonoDeviceApplication, public example
             return logFail( "Creation of a GPU Shader from CPU Shader source failed!" );
         }

-        /*// the simplest example would have used push constants and BDA, but RenderDoc's debugging of that sucks, so I'll demonstrate "classical" binding of buffers with descriptors
-        nbl::video::IGPUDescriptorSetLayout::SBinding bindings[ 1 ] = {
+
+        // TODO: move to shared cpp/hlsl descriptors file
+        NBL_CONSTEXPR_STATIC nbl::video::IGPUDescriptorSetLayout::SBinding bindings[] = {
             {
                 .binding = 0,
-                .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
-                .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, // not is not the time for descriptor indexing
-                .stageFlags = IGPUShader::ESS_COMPUTE,
+                .type = nbl::asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER,
+                .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+                .stageFlags = IShader::ESS_COMPUTE,
+                .count = 1,
+                .samplers = nullptr
+            },
+            {
+                .binding = 1,
+                .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+                .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+                .stageFlags = IShader::ESS_COMPUTE,
                 .count = 1,
-                .samplers = nullptr // irrelevant for a buffer
+                .samplers = nullptr
             }
         };
-        smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout = device->createDescriptorSetLayout( bindings );
+        smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout = m_device->createDescriptorSetLayout( bindings );
         if( !dsLayout )
+        {
             return logFail( "Failed to create a Descriptor Layout!\n" );
-
-        // Nabla actually has facilities for SPIR-V Reflection and "guessing" pipeline layouts for a given SPIR-V which we'll cover in a different example
-        smart_refctd_ptr<IGPUPipelineLayout> pplnLayout = device->createPipelineLayout( {}, smart_refctd_ptr( dsLayout ) );
+        }
+        const asset::SPushConstantRange pushConst[] = { {.stageFlags = IShader::ESS_COMPUTE, .offset = 0, .size = sizeof( BoxBlurParams )} };
+        smart_refctd_ptr<IGPUPipelineLayout> pplnLayout = m_device->createPipelineLayout( pushConst, smart_refctd_ptr(dsLayout));
         if( !pplnLayout )
+        {
             return logFail( "Failed to create a Pipeline Layout!\n" );
+        }

-        // We use strong typing on the pipelines (Compute, Graphics, Mesh, RT), since there's no reason to polymorphically switch between different pipelines
         smart_refctd_ptr<IGPUComputePipeline> pipeline;
         {
             IGPUComputePipeline::SCreationParams params = {};
             params.layout = pplnLayout.get();
-            // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main")
             params.shader.entryPoint = "main";
             params.shader.shader = shader.get();
             // we'll cover the specialization constant API in another example
-            if( !device->createComputePipelines( nullptr, { &params,1 }, &pipeline ) )
+            if( !m_device->createComputePipelines( nullptr, { &params, 1 }, &pipeline ) )
+            {
                 return logFail( "Failed to create pipelines (compile & link shaders)!\n" );
+            }
         }
-
-        // Our Descriptor Sets track (refcount) resources written into them, so you can pretty much drop and forget whatever you write into them.
-        // A later Descriptor Indexing example will test that this tracking is also correct for Update-After-Bind Descriptor Set bindings too.
+ smart_refctd_ptr sampler = m_device->createSampler( { .TextureWrapU = ISampler::ETC_CLAMP_TO_EDGE } ); smart_refctd_ptr ds; - - // A `nbl::video::DeviceMemoryAllocator` is an interface to implement anything that can dish out free memory range to bind to back a `nbl::video::IGPUBuffer` or a `nbl::video::IGPUImage` - // The Logical Device itself implements the interface and behaves as the most simple allocator, it will create a new `nbl::video::IDeviceMemoryAllocation` every single time. - // We will cover allocators and suballocation in a later example. - nbl::video::IDeviceMemoryAllocator::SAllocation allocation = {}; + smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts( + IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 } ); + ds = pool->createDescriptorSet( std::move( dsLayout ) ); { - constexpr size_t BufferSize = sizeof( uint32_t ) * WorkgroupSize * WorkgroupCount; - - // Always default the creation parameters, there's a lot of extra stuff for DirectX/CUDA interop and slotting into external engines you don't usually care about. - nbl::video::IGPUBuffer::SCreationParams params = {}; - params.size = BufferSize; - // While the usages on `ICPUBuffers` are mere hints to our automated CPU-to-GPU conversion systems which need to be patched up anyway, - // the usages on an `IGPUBuffer` are crucial to specify correctly. - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - smart_refctd_ptr outputBuff = device->createBuffer( std::move( params ) ); - if( !outputBuff ) - return logFail( "Failed to create a GPU Buffer of size %d!\n", params.size ); - - // Naming objects is cool because not only errors (such as Vulkan Validation Layers) will show their names, but RenderDoc captures too. - outputBuff->setObjectDebugName( "My Output Buffer" ); - - // We don't want to bother explaining best staging buffer practices just yet, so we will create a buffer over - // a memory type thats Host Visible (can be mapped and give the CPU a direct pointer to read from) - nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); - // you can simply constrain the memory requirements by AND-ing the type bits of the host visible memory types - reqs.memoryTypeBits &= physDev->getHostVisibleMemoryTypeBits(); - - // There are actually two `allocate` overloads, one which allocates memory if you already know the type you want. - // And this one which is a utility which tries to allocate from every type that matches your requirements in some order of preference. - // The other of preference (iteration over compatible types) can be controlled by the method's template parameter, - // the default is from lowest index to highest, but skipping over incompatible types. - allocation = device->allocate( reqs, outputBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE ); - if( !allocation.isValid() ) - return logFail( "Failed to allocate Device Memory compatible with our GPU Buffer!\n" ); - - // Note that we performed a Dedicated Allocation above, so there's no need to bind the memory anymore (since the allocator knows the dedication, it can already bind). - // This is a carryover from having an OpenGL backend, where you couldn't have a memory allocation separate from the resource, so all allocations had to be "dedicated". - // In Vulkan dedicated allocations are the most performant and still make sense as long as you won't blow the 4096 allocation limit on windows. 
- // You should always use dedicated allocations for images used for swapchains, framebuffer attachments (esp transient), as well as objects used in CUDA/DirectX interop. - assert( outputBuff->getBoundMemory().memory == allocation.memory.get() ); - - // This is a cool utility you can use instead of counting up how much of each descriptor type you need to N_i allocate descriptor sets with layout L_i from a single pool - smart_refctd_ptr pool = device->createDescriptorPoolForDSLayouts( IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 } ); - - // note how the pool will go out of scope but thanks for backreferences in each object to its parent/dependency it will be kept alive for as long as all the Sets it allocated - ds = pool->createDescriptorSet( std::move( dsLayout ) ); - // we still use Vulkan 1.0 descriptor update style, could move to Update Templates but Descriptor Buffer ubiquity seems just around the corner - { - IGPUDescriptorSet::SDescriptorInfo info[ 1 ]; - info[ 0 ].desc = smart_refctd_ptr( outputBuff ); // bad API, too late to change, should just take raw-pointers since not consumed - info[ 0 ].info.buffer = { .offset = 0,.size = BufferSize }; - IGPUDescriptorSet::SWriteDescriptorSet writes[ 1 ] = { - {.dstSet = ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} - }; - device->updateDescriptorSets( writes, {} ); - } + IGPUDescriptorSet::SDescriptorInfo info[ 2 ]; + info[ 0 ].desc = inputGpuImgView; + info[ 0 ].info.image = { .sampler = sampler, .imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL }; + info[ 1 ].desc = outputGpuImgView; + info[ 1 ].info.image = { .sampler = nullptr, .imageLayout = IImage::LAYOUT::GENERAL }; + + IGPUDescriptorSet::SWriteDescriptorSet writes[] = { + { .dstSet = ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &info[ 0 ] }, + { .dstSet = ds.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &info[ 1 ] }, + }; + m_device->updateDescriptorSets( writes, {} ); } - // To be able to read the contents of the buffer we need to map its memory - // P.S. Nabla mandates Persistent Memory Mappings on all backends (but not coherent memory types) - auto ptr = allocation.memory->map( { 0ull,allocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ ); - if( !ptr ) - return logFail( "Failed to map the Device Memory!\n" ); + uint32_t computeQueueIndex = getComputeQueue()->getFamilyIndex(); + IQueue* queue = m_device->getQueue( computeQueueIndex, 0 ); - // Our commandbuffers are cool because they refcount the resources used by each command you record into them, so you can rely a commandbuffer on keeping them alive. 
smart_refctd_ptr cmdbuf; + smart_refctd_ptr cmdpool = m_device->createCommandPool( + computeQueueIndex, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT ); + if( !cmdpool->createCommandBuffers( IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf ) ) + { + return logFail( "Failed to create Command Buffers!\n" ); + } + + constexpr size_t StartedValue = 0; + constexpr size_t FinishedValue = 45; + static_assert( FinishedValue > StartedValue ); + smart_refctd_ptr progress = m_device->createSemaphore( StartedValue ); + + IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = cmdbuf.get()} }; + + nbl::video::SIntendedSubmitInfo::SFrontHalf frontHalf = { .queue = queue, .commandBuffers = cmdbufs }; + smart_refctd_ptr assetStagingMngr = + make_smart_refctd_ptr( smart_refctd_ptr( m_device ), smart_refctd_ptr( m_logger ) ); + + cmdbuf->begin( IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT ); + + queue->startCapture(); + bool uploaded = assetStagingMngr->updateImageViaStagingBufferAutoSubmit( + frontHalf, textureToBlur->getBuffer(), inCpuTexInfo.format, + inputGpuImg.get(), IImage::LAYOUT::UNDEFINED, textureToBlur->getRegions() + ); + queue->endCapture(); + if( !uploaded ) { - smart_refctd_ptr cmdpool = device->createCommandPool( params.queueParams[ 0 ].familyIndex, IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT ); - if( !cmdpool->createCommandBuffers( IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf ) ) - return logFail( "Failed to create Command Buffers!\n" ); + return logFail( "Failed to upload cpu tex!\n" ); } + cmdbuf->reset( IGPUCommandBuffer::RESET_FLAGS::NONE ); + + BoxBlurParams pushConstData = {}; + + cmdbuf->begin( IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT ); - // If you enable the `debugUtils` API Connection feature on a supported backend as we've done, you'll get these pretty debug sections in RenderDoc cmdbuf->beginDebugMarker( "My Compute Dispatch", core::vectorSIMDf( 0, 1, 0, 1 ) ); - // you want to bind the pipeline first to avoid accidental unbind of descriptor sets due to compatibility matching + nbl::video::IGPUCommandBuffer::SImageResolve regions[] = { + { + .srcSubresource = { .layerCount = 1 }, + .srcOffset = {}, + .dstSubresource = { .layerCount = 1 }, + .dstOffset = {}, + .extent = inputGpuImg->getCreationParameters().extent + } + }; + cmdbuf->resolveImage( + inputGpuImg.get(), IImage::LAYOUT::UNDEFINED, + inputGpuImg.get(), IImage::LAYOUT::GENERAL, + std::size( regions ), regions ); + nbl::video::IGPUCommandBuffer::SImageResolve regionsOut[] = { + { + .srcSubresource = {.layerCount = 1 }, + .srcOffset = {}, + .dstSubresource = {.layerCount = 1 }, + .dstOffset = {}, + .extent = outputGpuImg->getCreationParameters().extent + } + }; + cmdbuf->resolveImage( + outputGpuImg.get(), IImage::LAYOUT::UNDEFINED, + outputGpuImg.get(), IImage::LAYOUT::GENERAL, + std::size( regionsOut ), regionsOut ); cmdbuf->bindComputePipeline( pipeline.get() ); cmdbuf->bindDescriptorSets( nbl::asset::EPBP_COMPUTE, pplnLayout.get(), 0, 1, &ds.get() ); + cmdbuf->pushConstants( pplnLayout.get(), IShader::ESS_COMPUTE, 0, sizeof( BoxBlurParams ), &pushConstData ); + cmdbuf->dispatch( WorkgroupCount, 1, 1 ); + + const nbl::asset::SMemoryBarrier barriers[] = { + { + .srcStageMask = nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask= nbl::asset::ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask= nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask= nbl::asset::ACCESS_FLAGS::SHADER_READ_BITS, + } + }; + cmdbuf->pipelineBarrier( nbl::asset::EDF_NONE, { 
.memBarriers = barriers } ); + cmdbuf->dispatch( WorkgroupCount, 1, 1 ); cmdbuf->endDebugMarker(); // Normally you'd want to perform a memory barrier when using the output of a compute shader or renderpass, // however waiting on a timeline semaphore (or fence) on the Host makes all Device writes visible. cmdbuf->end(); - - // Only Timeline Semaphores are supported in Nabla, there's no fences or binary semaphores. - // Swapchains run on adaptors with empty submits that make them look like they work with Timeline Semaphores, - // which has important side-effects we'll cover in another example. - constexpr auto StartedValue = 0; - constexpr auto FinishedValue = 45; - static_assert( FinishedValue > StartedValue ); - smart_refctd_ptr progress = device->createSemaphore( StartedValue ); + { - // queues are inherent parts of the device, ergo not refcounted (you refcount the device instead) - IQueue* queue = device->getQueue( params.queueParams[ 0 ].familyIndex, 0 ); - - // Default, we have no semaphores to wait on before we can start our workload - IQueue::SSubmitInfo submitInfos[ 1 ] = {}; // The IGPUCommandBuffer is the only object whose usage does not get automagically tracked internally, you're responsible for holding onto it as long as the GPU needs it. // So this is why our commandbuffer, even though its transient lives in the scope equal or above the place where we wait for the submission to be signalled as complete. const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = cmdbuf.get()} }; - submitInfos[ 0 ].commandBuffers = cmdbufs; // But we do need to signal completion by incrementing the Timeline Semaphore counter as soon as the compute shader is done const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = progress.get(),.value = FinishedValue,.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; - submitInfos[ 0 ].signalSemaphores = signals; + // Default, we have no semaphores to wait on before we can start our workload + IQueue::SSubmitInfo submitInfos[] = { { .commandBuffers = cmdbufs, .signalSemaphores = signals } }; // We have a cool integration with RenderDoc that allows you to start and end captures programmatically. // This is super useful for debugging multi-queue workloads and by default RenderDoc delimits captures only by Swapchain presents. @@ -218,29 +320,11 @@ class BoxBlurDemo final : public examples::MonoDeviceApplication, public example queue->endCapture(); } // As the name implies this function will not progress until the fence signals or repeated waiting returns an error. - const ISemaphore::SWaitInfo waitInfos[] = { { - .semaphore = progress.get(), - .value = FinishedValue - } }; - device->blockForSemaphores( waitInfos ); - - // You don't need to do this, but putting it here to demonstrate that its safe to drop a commandbuffer after GPU is done (try moving it above and see if you BSOD or just get a validation error). 
- cmdbuf = nullptr; - - // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches - const ILogicalDevice::MappedMemoryRange memoryRange( allocation.memory.get(), 0ull, allocation.memory->getAllocationSize() ); - if( !allocation.memory->getMemoryPropertyFlags().hasFlags( IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT ) ) - device->invalidateMappedMemoryRanges( 1, &memoryRange ); - - // a simple test to check we got the right thing back - auto buffData = reinterpret_cast< const uint32_t* >( ptr ); - for( auto i = 0; i < WorkgroupSize * WorkgroupCount; i++ ) - if( buffData[ i ] != i ) - return logFail( "DWORD at position %d doesn't match!\n", i ); - // This allocation would unmap itself in the dtor anyway, but lets showcase the API usage - allocation.memory->unmap(); - - return true;*/ + const ISemaphore::SWaitInfo waitInfos[] = { { .semaphore = progress.get(), .value = FinishedValue } }; + m_device->blockForSemaphores( waitInfos ); + + + return true; } // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" From 9e3f721b83766344cee3248f7a8c19919c7ee46d Mon Sep 17 00:00:00 2001 From: Eichenherz Date: Mon, 4 Mar 2024 11:12:39 +0200 Subject: [PATCH 9/9] Linear to 2d tex index --- .../app_resources/descriptors.hlsl | 47 ++++++++++--------- .../app_resources/main.comp.hlsl | 5 +- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl b/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl index a2ed80aba..a2226fa45 100644 --- a/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl +++ b/26_CentralLimitBoxBlur/app_resources/descriptors.hlsl @@ -1,45 +1,50 @@ #include "nbl/builtin/hlsl/blur/common.hlsl" -[[vk::binding( 0, 0 )]] Texture2D input; -[[vk::binding( 1, 0 )]] RWTexture2D output; +[[vk::binding( 0, 0 )]] Texture2D input; +[[vk::binding( 1, 0 )]] RWTexture2D output; // TODO: figure the proper way to do templated BufferAccessors struct BufferAccessor { - uint32_t4 inputStride; - uint32_t4 outputStride; - uint32_t3 dimension; - //uint32_t channelCount; - // mod image width x div image width y - nbl::hlsl::float32_t getPaddedData( const uint32_t3 coordinate, const uint32_t channel ) + uint32_t2 chosenAxis; + + nbl::hlsl::float32_t get( const uint32_t linearIndex, const uint32_t channel ) { - float data = 0.f; - if( all( coordinate < dimension ) ) + uint32_t3 texSize; + input.GetDimensions( 0, texSize.x, texSize.y, texSize.z ); + + uint32_t axisSize = dot( texSize.xy, chosenAxis ); + + uint32_t2 coordinate = { linearIndex % axisSize, linearIndex / axisSize }; + float32_t data = 0.f; + if( all( coordinate < texSize.xy ) ) { - uint32_t stridedIdx = dot( uint32_t4( coordinate, channel ), inputStride );// NOT CORRECT - //uint32_t2 idx = stridedIdx % - //data = input[ stridedIdx ]; + float32_t4 pixel = input[ coordinate.xy ]; + data = pixel[ channel ]; } return data; } - void setData( const uint32_t3 coordinate, const uint32_t channel, NBL_CONST_REF_ARG( float32_t ) val ) + void set( const uint32_t linearIndex, const uint32_t channel, NBL_CONST_REF_ARG( float32_t ) val ) { - if( all( coordinate < dimension ) ) + uint32_t2 texSize; + output.GetDimensions( texSize.x, texSize.y ); + + uint32_t axisSize = dot( texSize, chosenAxis ); + + uint32_t2 coordinate = { linearIndex % axisSize, linearIndex / axisSize }; + if( all( coordinate < texSize ) ) { - uint32_t stridedIdx 
= dot( uint32_t4( coordinate, channel ), outputStride ); // NOT CORRECT - //output[ stridedIdx ] = val; + output[ coordinate.xy ][ channel ] = val; } } }; -BufferAccessor BufferAccessorCtor( uint32_t4 inputStride, uint32_t4 outputStride, uint32_t3 dimension ) +BufferAccessor BufferAccessorCtor( uint32_t2 chosenAxis ) { BufferAccessor ba; - ba.dimension = dimension; - ba.inputStride = inputStride; - ba.outputStride = outputStride; + ba.chosenAxis = chosenAxis; return ba; } diff --git a/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl b/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl index f86cbf5d8..dbcef350e 100644 --- a/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl +++ b/26_CentralLimitBoxBlur/app_resources/main.comp.hlsl @@ -23,11 +23,10 @@ void main( uint3 invocationID : SV_DispatchThreadID ) borderColor = boxBlurParams.getBorderColor(); } - BufferAccessor textureAccessor = BufferAccessorCtor( - boxBlurParams.inputStrides, boxBlurParams.outputStrides, boxBlurParams.inputDimensions.xyz ); + BufferAccessor textureAccessor = BufferAccessorCtor( boxBlurParams.chosenAxis ); for( uint32_t ch = 0; ch < boxBlurParams.getChannelCount(); ++ch ) { - BoxBlur( ch, direction, boxBlurParams.radius, wrapMode, borderColor, textureAccessor ); + BoxBlur( ch, boxBlurParams.radius, wrapMode, borderColor, textureAccessor ); } }
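+// (Aside, a worked example of the linear-index mapping introduced in `descriptors.hlsl` above, assuming
+// a hypothetical 4x3 texture and chosenAxis = uint32_t2(1,0), i.e. axisSize = 4:
+//     linearIndex 0..3  -> coordinate (0..3, 0)  - first row
+//     linearIndex 4..7  -> coordinate (0..3, 1)  - second row
+//     linearIndex 11    -> coordinate (3, 2)     - last texel
+// so x = linearIndex % axisSize walks the chosen axis fastest, while y = linearIndex / axisSize steps rows.)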