From dddf5755d6a7bef8d78aba37949b8badd85594fa Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Tue, 16 Jan 2024 21:34:53 -0300 Subject: [PATCH 01/13] Work on property pools example --- 66_PropertyPools/CMakeLists.txt | 24 ++ 66_PropertyPools/app_resources/common.hlsl | 22 ++ .../app_resources/shader.comp.hlsl | 33 ++ 66_PropertyPools/config.json.template | 28 ++ 66_PropertyPools/main.cpp | 292 ++++++++++++++++++ 66_PropertyPools/pipeline.groovy | 50 +++ CMakeLists.txt | 1 + 7 files changed, 450 insertions(+) create mode 100644 66_PropertyPools/CMakeLists.txt create mode 100644 66_PropertyPools/app_resources/common.hlsl create mode 100644 66_PropertyPools/app_resources/shader.comp.hlsl create mode 100644 66_PropertyPools/config.json.template create mode 100644 66_PropertyPools/main.cpp create mode 100644 66_PropertyPools/pipeline.groovy diff --git a/66_PropertyPools/CMakeLists.txt b/66_PropertyPools/CMakeLists.txt new file mode 100644 index 000000000..bc1624875 --- /dev/null +++ b/66_PropertyPools/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/66_PropertyPools/app_resources/common.hlsl b/66_PropertyPools/app_resources/common.hlsl new file mode 100644 index 000000000..6f339aa13 --- /dev/null +++ b/66_PropertyPools/app_resources/common.hlsl @@ -0,0 +1,22 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +// Unfortunately not every piece of C++14 metaprogramming syntax is available in HLSL 202x +// https://github.com/microsoft/DirectXShaderCompiler/issues/5751#issuecomment-1800847954 +typedef nbl::hlsl::float32_t3 input_t; +typedef nbl::hlsl::float32_t output_t; + +NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxPossibleElementCount = 1 << 20; + +struct PushConstantData +{ + uint64_t inputAddress; + uint64_t outputAddress; + uint32_t dataElementCount; +}; + +NBL_CONSTEXPR uint32_t WorkgroupSize = 256; + +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" + +// Yes we do have our own re-creation of C++'s STL in HLSL2021 ! 
+#include "nbl/builtin/hlsl/limits.hlsl" \ No newline at end of file diff --git a/66_PropertyPools/app_resources/shader.comp.hlsl b/66_PropertyPools/app_resources/shader.comp.hlsl new file mode 100644 index 000000000..4aeef0e0f --- /dev/null +++ b/66_PropertyPools/app_resources/shader.comp.hlsl @@ -0,0 +1,33 @@ +#include "common.hlsl" + +// just a small test +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +[[vk::push_constant]] PushConstantData pushConstants; + +// does absolutely nothing, a later example will show how it gets used +template +void dummyTraitTest() {} + +[numthreads(WorkgroupSize,1,1)] +void main(uint32_t3 ID : SV_DispatchThreadID) +{ + dummyTraitTest(); + if (ID.x>=pushConstants.dataElementCount) + return; + + const input_t self = vk::RawBufferLoad(pushConstants.inputAddress+sizeof(input_t)*ID.x); + + nbl::hlsl::Xoroshiro64StarStar rng = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(pushConstants.dataElementCount,ID.x)^0xdeadbeefu); + + float32_t acc = nbl::hlsl::numeric_limits::max; + const static uint32_t OthersToTest = 15; + [[unroll(OthersToTest)]] + for (uint32_t i=0; i(pushConstants.inputAddress+sizeof(input_t)*offset); + acc = min(length(other-self),acc); + } + vk::RawBufferStore(pushConstants.outputAddress+sizeof(float32_t)*ID.x,acc); +} \ No newline at end of file diff --git a/66_PropertyPools/config.json.template b/66_PropertyPools/config.json.template new file mode 100644 index 000000000..717d05d53 --- /dev/null +++ b/66_PropertyPools/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp new file mode 100644 index 000000000..155ece55b --- /dev/null +++ b/66_PropertyPools/main.cpp @@ -0,0 +1,292 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + + +// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. +#include "../common/MonoDeviceApplication.hpp" +#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +using namespace nbl; +using namespace core; +using namespace system; +using namespace asset; +using namespace video; + + +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" + + +// In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants +class PropertyPoolsApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::MonoDeviceApplication; + using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; + + // This is the first example that submits multiple workloads in-flight. + // What the shader does is it computes the minimum distance of each point against K other random input points. 
+		// Having the GPU randomly access parts of the buffer requires it to be DEVICE_LOCAL for performance.
+		// Then the CPU downloads the results and finds the median minimum distance via quick-select.
+		// This bizarre synthetic workload was specifically chosen for its unfriendliness towards simple buffer usage.
+		// The fact that we have variable sized workloads and run them in a loop means we either have to dynamically
+		// suballocate from a single buffer or have K worst-case sized buffers we round robin for K workloads in flight.
+		// Creating and destroying buffers at runtime is not an option as those are very expensive operations.
+		// Also since the CPU needs to heapify the outputs, we need to have the GPU write them into RAM not VRAM.
+		smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
+
+		// The Utility class has lots of methods to handle staging without relying on ReBAR or EXT_host_image_copy as well as more complex methods we'll cover later.
+		// Until EXT_host_image_copy becomes ubiquitous across all Nabla Core Profile devices, you need to stage image copies from an IGPUBuffer to an IGPUImage.
+		// Why use Staging for buffers in the age of ReBAR? While GPU workloads overlap the CPU, individual GPU workloads' execution might not overlap each other
+		// but their data might. In this case you want to "precisely" time the data update on the GPU timeline between the end and start of a workload.
+		// For very small updates you could use the commandbuffer updateBuffer method, but it has a size limit and the data enqueued takes up space in the commandpool.
+		// Sometimes it might be unfeasible to either have multiple copies or update references to those copies without a cascade update.
+		// One example is the transformation graph of nodes in a scene, where a copy-on-write of a node would require updating the offset/pointer held by
+		// any other node that refers to it. This quickly turns into a cascade that would force you to basically create a full copy of the entire data structure
+		// after most updates. Whereas with staging you'd "queue up" the much smaller set of updates to apply between each computation step which uses the graph.
+		// Another example is UBO and SSBO bindings, where once you run out of dynamic bindings, you can no longer easily change offsets without introducing extra indirection in shaders.
+		// Actually staging can help you re-use a commandbuffer because you don't need to re-record it if you don't need to change the offsets at which you bind!
+		// Finally, ReBAR is a precious resource: my 8GB RTX 3070 only reports a 214MB Heap backing the HOST_VISIBLE and DEVICE_LOCAL memory type.
+		smart_refctd_ptr<IUtilities> m_utils;
+
+		// We call them downstreaming and upstreaming, simply by how we used them so far.
+		// Meaning that upstreaming is uncached and usually ReBAR (DEVICE_LOCAL), for simple memcpy-like sequential writes.
+		// While the downstreaming is CACHED and not DEVICE_LOCAL for fast random access by the CPU.
+		// However there are cases when you'd want to use a buffer with flags identical to the default downstreaming buffer for uploads,
+		// one such case is when the CPU needs to build a data-structure in-place (due to memory constraints) before the GPU accesses it,
+		// one example being Host Acceleration Structure builds (BVH building requires lots of repeated memory accesses).
+		// When choosing the memory properties of a mapped buffer consider which processor (CPU or GPU) needs faster access in the event of a cache-miss.
+		nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer;
+		StreamingTransientDataBufferMT<>* m_downStreamingBuffer;
+		// These are Buffer Device Addresses
+		uint64_t m_upStreamingBufferAddress;
+		uint64_t m_downStreamingBufferAddress;
+
+		// You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!)
+		uint32_t m_alignment;
+
+		// The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished.
+		// Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools.
+		smart_refctd_ptr m_poolCache;
+
+		// We'll run the iterations in reverse, easier to write "keep running"
+		uint32_t m_iteration = 200;
+
+	public:
+		// Yay thanks to multiple inheritance we cannot forward ctors anymore
+		PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+			system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {}
+
+		// we stuff all our work here because it's a "single shot" app
+		bool onAppInitialized(smart_refctd_ptr&& system) override
+		{
+			// Remember to call the base class initialization!
+			if (!device_base_t::onAppInitialized(std::move(system)))
+				return false;
+			if (!asset_base_t::onAppInitialized(std::move(system)))
+				return false;
+
+			// this time we load a shader directly from a file
+			smart_refctd_ptr shader;
+			{
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = ""; // virtual root
+				auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+					return logFail("Could not load shader!");
+
+				// lets go straight from ICPUSpecializedShader to IGPUSpecializedShader
+				auto source = IAsset::castDown(assets[0]);
+				// The down-cast should not fail!
+				assert(source);
+
+				IGPUObjectFromAssetConverter::SParams conversionParams = {};
+				conversionParams.device = m_device.get();
+				conversionParams.assetManager = m_assetMgr.get();
+				created_gpu_object_array convertedGPUObjects = std::make_unique()->getGPUObjectsFromAssets(&source,&source+1,conversionParams);
+				if (convertedGPUObjects->empty() || !convertedGPUObjects->front())
+					return logFail("Conversion of a CPU Specialized Shader to GPU failed!");
+
+				shader = convertedGPUObjects->front();
+			}
+
+			// The StreamingTransientDataBuffers are actually composed on top of another useful utility called `CAsyncSingleBufferSubAllocator`
+			// The difference is that the streaming ones are made on top of ranges of `IGPUBuffer`s backed by mappable memory, whereas the
+			// `CAsyncSingleBufferSubAllocator` just allows you to suballocate subranges of any `IGPUBuffer` range with deferred/latched frees.
+			constexpr uint32_t DownstreamBufferSize = sizeof(output_t)<<24;
+			constexpr uint32_t UpstreamBufferSize = sizeof(input_t)<<24;
+			m_utils = make_smart_refctd_ptr<IUtilities>(smart_refctd_ptr(m_device),smart_refctd_ptr(m_logger),DownstreamBufferSize,UpstreamBufferSize);
+			if (!m_utils)
+				return logFail("Failed to create Utilities!");
+			m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
+			m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
+			m_upStreamingBufferAddress = m_device->getBufferDeviceAddress(m_upStreamingBuffer->getBuffer());
+			m_downStreamingBufferAddress = m_device->getBufferDeviceAddress(m_downStreamingBuffer->getBuffer());
+
+			// People love Reflection but I prefer Shader Sources instead!
+			const nbl::asset::SPushConstantRange pcRange = {.stageFlags=IShader::ESS_COMPUTE,.offset=0,.size=sizeof(PushConstantData)};
+
+			// This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size
+			// and using traditional SSBO bindings would force us to update the Descriptor Set every frame.
+			// I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic
+			// only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding.
+			// Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size.
+			m_pipeline = m_device->createComputePipeline(nullptr,m_device->createPipelineLayout(&pcRange,&pcRange+1),std::move(shader));
+
+			const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
+			// The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices
+			// which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets.
+			// Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc.
+			// there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those.
+			// We'll align to max of coherent atom size even if the memory is coherent,
+			// and we also need to take into account BDA shader loads need to be aligned to the type being loaded.
+			m_alignment = core::max(deviceLimits.nonCoherentAtomSize,alignof(float));
+
+			// We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are
+			// the amount of memory in the streaming buffers and the number of commandpools we can use simultaenously.
+ constexpr auto MaxConcurrency = 64; + // Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag + m_poolCache = make_smart_refctd_ptr(m_device.get(),getComputeQueue()->getFamilyIndex(), IGPUCommandPool::ECF_NONE, MaxConcurrency); + + return true; + } + + // Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script) + bool keepRunning() override { return m_iteration; } + + // Finally the first actual work-loop + void workLoopBody() override + { + m_iteration--; + IGPUQueue* const queue = getComputeQueue(); + + // Note that I'm using the sample struct with methods that have identical code which compiles as both C++ and HLSL + auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({m_iteration^0xdeadbeefu,std::hash()(_NBL_APP_NAME_)}); + + // we dynamically choose the number of elements for each iteration + const auto elementCount = rng()%MaxPossibleElementCount; + const uint32_t inputSize = sizeof(input_t)*elementCount; + + // The allocators can do multiple allocations at once for efficiency + const uint32_t AllocationCount = 1; + // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value + // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. + auto inputOffset = m_upStreamingBuffer->invalid_value; + + // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) + // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). + std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); + // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly + m_upStreamingBuffer->multi_allocate(waitTill,AllocationCount,&inputOffset,&inputSize,&m_alignment); + + // Generate our data in-place on the allocated staging buffer + { + auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer())+inputOffset); + for (auto j=0; j::max); + } + // Always remember to flush! 
+ if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) + { + const IDeviceMemoryAllocation::MappedMemoryRange range(m_upStreamingBuffer->getBuffer()->getBoundMemory(),inputOffset,inputSize); + m_device->flushMappedMemoryRanges(1,&range); + } + } + + // Obtain our command pool once one gets recycled + uint32_t poolIx; + do + { + poolIx = m_poolCache->acquirePool(); + } while (poolIx==ICommandPoolCache::invalid_index); + + // finally allocate our output range + const uint32_t outputSize = sizeof(output_t)*elementCount; + auto outputOffset = m_downStreamingBuffer->invalid_value; + m_downStreamingBuffer->multi_allocate(waitTill,AllocationCount,&outputOffset,&outputSize,&m_alignment); + + smart_refctd_ptr cmdbuf; + { + m_device->createCommandBuffers(m_poolCache->getPool(poolIx),IGPUCommandBuffer::EL_PRIMARY,1,&cmdbuf); + // lets record, its still a one time submit because we have to re-record with different push constants each time + cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); + cmdbuf->bindComputePipeline(m_pipeline.get()); + // This is the new fun part, pushing constants + const PushConstantData pc = { + .inputAddress=m_upStreamingBufferAddress+inputOffset, + .outputAddress=m_downStreamingBufferAddress+outputOffset, + .dataElementCount=elementCount + }; + cmdbuf->pushConstants(m_pipeline->getLayout(),IShader::ESS_COMPUTE,0u,sizeof(pc),&pc); + // Good old trick to get rounded up divisions, in case you're not familiar + cmdbuf->dispatch((elementCount-1)/WorkgroupSize+1,1,1); + cmdbuf->end(); + } + + // TODO: redo with a single timeline semaphore + auto fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); + { + IGPUQueue::SSubmitInfo submitInfo = {}; + submitInfo.commandBufferCount = 1; + submitInfo.commandBuffers = &cmdbuf.get(); + + queue->startCapture(); + queue->submit(1u,&submitInfo,fence.get()); + queue->endCapture(); + } + + // We can also actually latch our Command Pool reset and its return to the pool of free pools! + m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx); + + // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled + // You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation. + m_upStreamingBuffer->multi_deallocate(AllocationCount,&inputOffset,&inputSize,smart_refctd_ptr(fence)); + + // Because C++17 and C++20 can't make their mind up about what to do with `this` in event of a [=] capture, lets triple ensure the m_iteration is captured by value. + const auto savedIterNum = m_iteration; + + // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. + // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. + // Its nice because it will also remember to invalidate our memory mapping if its not coherent. + auto latchedConsumer = make_smart_refctd_ptr( + IDeviceMemoryAllocation::MemoryRange(outputOffset,outputSize), + // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals + [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void + { + // The unused variable is used for letting the consumer know the subsection of the output we've managed to download + // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. 
+ assert(dstOffset==0 && size==outputSize); + + // I can const cast, we know the mapping is just a pointer + output_t* const data = reinterpret_cast(const_cast(bufSrc)); + auto median = data+elementCount/2; + std::nth_element(data,median,data+elementCount); + + m_logger->log("Iteration %d Median of Minimum Distances is %f",ILogger::ELL_PERFORMANCE,savedIterNum,*median); + }, + // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it + // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. + // It could also be latched in the upstreaming deallocate, because its the same fence. + std::move(cmdbuf),m_downStreamingBuffer + ); + // We put a function we want to execute + m_downStreamingBuffer->multi_deallocate(AllocationCount,&outputOffset,&outputSize,std::move(fence),&latchedConsumer.get()); + } + + bool onAppTerminated() override + { + // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` + // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) + while (m_downStreamingBuffer->cull_frees()) {} + + return device_base_t::onAppTerminated(); + } +}; + + +NBL_MAIN_FUNC(PropertyPoolsApp) \ No newline at end of file diff --git a/66_PropertyPools/pipeline.groovy b/66_PropertyPools/pipeline.groovy new file mode 100644 index 000000000..1a7b043a4 --- /dev/null +++ b/66_PropertyPools/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CStreamingAndBufferDeviceAddressBuilder extends IBuilder +{ + public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a20a33a9..09a73bfe0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,5 +65,6 @@ if(NBL_BUILD_EXAMPLES) #add_subdirectory(61_UI EXCLUDE_FROM_ALL) add_subdirectory(62_CAD EXCLUDE_FROM_ALL) add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL) + add_subdirectory(66_PropertyPools EXCLUDE_FROM_ALL) add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42 endif() \ No newline at end of file From 43d95c8cca36441dfdd754ba66f24b88ae18426b Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sun, 21 Jan 2024 10:58:54 -0300 Subject: [PATCH 02/13] Add creation of property pool handler to example --- 66_PropertyPools/main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff 
--git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index 155ece55b..941536751 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -62,6 +62,8 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex uint64_t m_upStreamingBufferAddress; uint64_t m_downStreamingBufferAddress; + smart_refctd_ptr m_propertyPoolHandler; + // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) uint32_t m_alignment; @@ -86,12 +88,15 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex if (!asset_base_t::onAppInitialized(std::move(system))) return false; + m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); + // this time we load a shader directly from a file smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); lp.workingDirectory = ""; // virtual root + auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp); const auto assets = assetBundle.getContents(); if (assets.empty()) From 66e93fbb23c374e445ab3af66848a836b34052c1 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 24 Jan 2024 21:43:22 -0300 Subject: [PATCH 03/13] Work on doing transferProperties on example --- 66_PropertyPools/main.cpp | 129 +++++++++++++------------------------- 1 file changed, 45 insertions(+), 84 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index 941536751..e59f6385a 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -63,6 +63,11 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex uint64_t m_downStreamingBufferAddress; smart_refctd_ptr m_propertyPoolHandler; + smart_refctd_ptr m_scratchBuffer; + smart_refctd_ptr m_addressBuffer; + smart_refctd_ptr m_transferSrcBuffer; + smart_refctd_ptr m_transferDstBuffer; + std::vector m_data; // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) 
uint32_t m_alignment; @@ -74,6 +79,9 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex // We'll run the iterations in reverse, easier to write "keep running" uint32_t m_iteration = 200; + static constexpr uint64_t TransfersAmount = 1024; + static constexpr uint64_t MaxValuesPerTransfer = 512; + public: // Yay thanks to multiple inheritance we cannot forward ctors anymore PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : @@ -90,6 +98,27 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); + auto createBuffer = [&](uint64_t size) + { + video::IGPUBuffer::SCreationParams creationParams; + creationParams.size = size; + creationParams.usage = core::bitflag(asset::IBuffer::EUF_STORAGE_BUFFER_BIT) | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; + + auto buffer = m_device->createBuffer(std::move(creationParams)); + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); + m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + + return buffer; + }; + + m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount); + m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer); + m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); + m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); + + for (uint16_t i = 0; i < uint16_t((uint32_t(1) << 16) - 1); i++) + m_data.push_back(i); + // this time we load a shader directly from a file smart_refctd_ptr shader; { @@ -167,42 +196,6 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex m_iteration--; IGPUQueue* const queue = getComputeQueue(); - // Note that I'm using the sample struct with methods that have identical code which compiles as both C++ and HLSL - auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({m_iteration^0xdeadbeefu,std::hash()(_NBL_APP_NAME_)}); - - // we dynamically choose the number of elements for each iteration - const auto elementCount = rng()%MaxPossibleElementCount; - const uint32_t inputSize = sizeof(input_t)*elementCount; - - // The allocators can do multiple allocations at once for efficiency - const uint32_t AllocationCount = 1; - // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value - // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. - auto inputOffset = m_upStreamingBuffer->invalid_value; - - // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) - // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). 
- std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); - // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly - m_upStreamingBuffer->multi_allocate(waitTill,AllocationCount,&inputOffset,&inputSize,&m_alignment); - - // Generate our data in-place on the allocated staging buffer - { - auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer())+inputOffset); - for (auto j=0; j::max); - } - // Always remember to flush! - if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) - { - const IDeviceMemoryAllocation::MappedMemoryRange range(m_upStreamingBuffer->getBuffer()->getBoundMemory(),inputOffset,inputSize); - m_device->flushMappedMemoryRanges(1,&range); - } - } - // Obtain our command pool once one gets recycled uint32_t poolIx; do @@ -210,26 +203,28 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex poolIx = m_poolCache->acquirePool(); } while (poolIx==ICommandPoolCache::invalid_index); - // finally allocate our output range - const uint32_t outputSize = sizeof(output_t)*elementCount; - auto outputOffset = m_downStreamingBuffer->invalid_value; - m_downStreamingBuffer->multi_allocate(waitTill,AllocationCount,&outputOffset,&outputSize,&m_alignment); - smart_refctd_ptr cmdbuf; { m_device->createCommandBuffers(m_poolCache->getPool(poolIx),IGPUCommandBuffer::EL_PRIMARY,1,&cmdbuf); // lets record, its still a one time submit because we have to re-record with different push constants each time cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); cmdbuf->bindComputePipeline(m_pipeline.get()); - // This is the new fun part, pushing constants - const PushConstantData pc = { - .inputAddress=m_upStreamingBufferAddress+inputOffset, - .outputAddress=m_downStreamingBufferAddress+outputOffset, - .dataElementCount=elementCount - }; - cmdbuf->pushConstants(m_pipeline->getLayout(),IShader::ESS_COMPUTE,0u,sizeof(pc),&pc); - // Good old trick to get rounded up divisions, in case you're not familiar - cmdbuf->dispatch((elementCount-1)/WorkgroupSize+1,1,1); + + // COMMAND RECORDING + cmdbuf->updateBuffer(m_transferSrcBuffer.get(), 0, sizeof(uint16_t) * m_data.size(), &m_data[0]); + CPropertyPoolHandler::TransferRequest transferRequest; + transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; + transferRequest.elementSize = m_data.size(); + transferRequest.elementCount = 1; + transferRequest.buffer = asset::SBufferBinding { 0, core::smart_refctd_ptr(m_transferDstBuffer) }; + + m_propertyPoolHandler->transferProperties(cmdbuf.get(), nullptr, + asset::SBufferBinding{0, core::smart_refctd_ptr(m_scratchBuffer)}, + asset::SBufferBinding{0, core::smart_refctd_ptr(m_addressBuffer)}, + &transferRequest, &transferRequest + 1, + m_logger.get(), 0, MaxValuesPerTransfer + ); + cmdbuf->end(); } @@ -247,40 +242,6 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex // We can also actually latch our Command Pool reset and its return to the pool of free pools! m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx); - - // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled - // You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation. 
- m_upStreamingBuffer->multi_deallocate(AllocationCount,&inputOffset,&inputSize,smart_refctd_ptr(fence)); - - // Because C++17 and C++20 can't make their mind up about what to do with `this` in event of a [=] capture, lets triple ensure the m_iteration is captured by value. - const auto savedIterNum = m_iteration; - - // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. - // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. - // Its nice because it will also remember to invalidate our memory mapping if its not coherent. - auto latchedConsumer = make_smart_refctd_ptr( - IDeviceMemoryAllocation::MemoryRange(outputOffset,outputSize), - // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals - [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void - { - // The unused variable is used for letting the consumer know the subsection of the output we've managed to download - // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. - assert(dstOffset==0 && size==outputSize); - - // I can const cast, we know the mapping is just a pointer - output_t* const data = reinterpret_cast(const_cast(bufSrc)); - auto median = data+elementCount/2; - std::nth_element(data,median,data+elementCount); - - m_logger->log("Iteration %d Median of Minimum Distances is %f",ILogger::ELL_PERFORMANCE,savedIterNum,*median); - }, - // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it - // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. - // It could also be latched in the upstreaming deallocate, because its the same fence. - std::move(cmdbuf),m_downStreamingBuffer - ); - // We put a function we want to execute - m_downStreamingBuffer->multi_deallocate(AllocationCount,&outputOffset,&outputSize,std::move(fence),&latchedConsumer.get()); } bool onAppTerminated() override From 56f855debea003f6ef80a55bd2a8ec5b6975226e Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 24 Jan 2024 23:12:21 -0300 Subject: [PATCH 04/13] Work on property pool example --- 66_PropertyPools/main.cpp | 183 +++++++++++++++++++++++++++++++++++--- 1 file changed, 172 insertions(+), 11 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index e59f6385a..e1ab9d7b3 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -3,13 +3,147 @@ // For conditions of distribution and use, see copyright notice in nabla.h -// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. 
-#include "../common/MonoDeviceApplication.hpp" +#include "nbl/video/surface/CSurfaceVulkan.h" + +#include "../common/BasicMultiQueueApplication.hpp" #include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" +namespace nbl::examples +{ + +using namespace nbl; +using namespace core; +using namespace system; +using namespace ui; +using namespace asset; +using namespace video; + +// Virtual Inheritance because apps might end up doing diamond inheritance +class WindowedApplication : public virtual BasicMultiQueueApplication +{ + using base_t = BasicMultiQueueApplication; + + public: + using base_t::base_t; + + virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override + { + auto retval = base_t::getAPIFeaturesToEnable(); + // We only support one swapchain mode, surface, the other one is Display which we have not implemented yet. + retval.swapchainMode = video::E_SWAPCHAIN_MODE::ESM_SURFACE; + return retval; + } + + // New function, we neeed to know about surfaces to create ahead of time + virtual core::vector getSurfaces() const = 0; + + virtual core::set filterDevices(const core::SRange& physicalDevices) const + { + const auto firstFilter = base_t::filterDevices(physicalDevices); + + video::SPhysicalDeviceFilter deviceFilter = {}; + + const auto surfaces = getSurfaces(); + deviceFilter.requiredSurfaceCompatibilities = surfaces.data(); + deviceFilter.requiredSurfaceCompatibilitiesCount = surfaces.size(); + + return deviceFilter(physicalDevices); + } + + virtual bool onAppInitialized(smart_refctd_ptr&& system) + { + // Remember to call the base class initialization! + if (!base_t::onAppInitialized(std::move(system))) + return false; + + #ifdef _NBL_PLATFORM_WINDOWS_ + m_winMgr = nbl::ui::IWindowManagerWin32::create(); + #else + #error "Unimplemented!" + #endif + } + + core::smart_refctd_ptr m_winMgr; +}; + + +// Before we get onto creating a window, we need to discuss how Nabla handles input, clipboards and cursor control +class IWindowClosedCallback : public virtual nbl::ui::IWindow::IEventCallback +{ + public: + IWindowClosedCallback() : m_gotWindowClosedMsg(false) {} + + // unless you create a separate callback per window, both will "trip" this condition + bool windowGotClosed() const {return m_gotWindowClosedMsg;} + + private: + bool onWindowClosed_impl() override + { + m_gotWindowClosedMsg = true; + return true; + } + + bool m_gotWindowClosedMsg; +}; + +// We inherit from an application that tries to find Graphics and Compute queues +// because applications with presentable images often want to perform Graphics family operations +// Virtual Inheritance because apps might end up doing diamond inheritance +class SingleNonResizableWindowApplication : public virtual WindowedApplication +{ + using base_t = WindowedApplication; + + protected: + virtual IWindow::SCreationParams getWindowCreationParams() const + { + IWindow::SCreationParams params = {}; + params.callback = make_smart_refctd_ptr(); + params.width = 640; + params.height = 480; + params.x = 32; + params.y = 32; + params.flags = IWindow::ECF_NONE; + params.windowCaption = "SingleNonResizableWindowApplication"; + return params; + } + + core::smart_refctd_ptr m_window; + core::smart_refctd_ptr m_surface; + + public: + using base_t::base_t; + + virtual bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! 
+ if (!base_t::onAppInitialized(std::move(system))) + return false; + + m_window = m_winMgr->createWindow(getWindowCreationParams()); + m_surface = video::CSurfaceVulkanWin32::create(core::smart_refctd_ptr(m_api),core::smart_refctd_ptr_static_cast(m_window)); + return true; + } + + virtual core::vector getSurfaces() const + { + return {{m_surface.get()/*,EQF_NONE*/}}; + } + + virtual bool keepRunning() override + { + if (!m_window || reinterpret_cast(m_window->getEventCallback())->windowGotClosed()) + return false; + + return true; + } +}; +} + + using namespace nbl; using namespace core; using namespace system; +using namespace ui; using namespace asset; using namespace video; @@ -19,7 +153,7 @@ using namespace video; // In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants -class PropertyPoolsApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = examples::MonoDeviceApplication; using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; @@ -98,23 +232,29 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); - auto createBuffer = [&](uint64_t size) + auto createBuffer = [&](uint64_t size, core::bitflag flags, const char* name, bool hostVisible) { video::IGPUBuffer::SCreationParams creationParams; - creationParams.size = size; - creationParams.usage = core::bitflag(asset::IBuffer::EUF_STORAGE_BUFFER_BIT) | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; + creationParams.size = ((size + 3) / 4) * 4; // Align + creationParams.usage = flags + | asset::IBuffer::EUF_STORAGE_BUFFER_BIT + | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT + | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; auto buffer = m_device->createBuffer(std::move(creationParams)); nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); + if (hostVisible) + reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + buffer->setObjectDebugName(name); return buffer; }; - m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount); - m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer); - m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); - m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer); + m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", false); + m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_addressBuffer", false); + m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_transferSrcBuffer", false); + m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, 
core::bitflag(asset::IBuffer::EUF_NONE), "m_transferDstBuffer", true); for (uint16_t i = 0; i < uint16_t((uint32_t(1) << 16) - 1); i++) m_data.push_back(i); @@ -211,7 +351,12 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex cmdbuf->bindComputePipeline(m_pipeline.get()); // COMMAND RECORDING - cmdbuf->updateBuffer(m_transferSrcBuffer.get(), 0, sizeof(uint16_t) * m_data.size(), &m_data[0]); + uint32_t dataSize = (((sizeof(uint16_t) * m_data.size()) + 3) / 4) * 4; + uint32_t maxUpload = 65536; + for (uint32_t offset = 0; offset < dataSize; offset += maxUpload) + { + cmdbuf->updateBuffer(m_transferSrcBuffer.get(), offset, maxUpload, &m_data[offset / sizeof(uint16_t)]); + } CPropertyPoolHandler::TransferRequest transferRequest; transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; transferRequest.elementSize = m_data.size(); @@ -239,6 +384,22 @@ class PropertyPoolsApp final : public examples::MonoDeviceApplication, public ex queue->submit(1u,&submitInfo,fence.get()); queue->endCapture(); } + + { + // Readback ds + auto mem = m_transferDstBuffer->getBoundMemory(); + assert(mem->isMappable()); + auto ptr = m_device->mapMemory(nbl::video::IDeviceMemoryAllocation::MappedMemoryRange(mem, 0, mem->getAllocationSize()), video::IDeviceMemoryAllocation::EMCAF_READ); + auto uint16_t_ptr = static_cast(ptr); + + for (uint32_t i = 0; i < 128; i++) + { + uint16_t value = uint16_t_ptr[i]; + std::printf("%i, ", value); + } + std::printf("\n"); + m_device->unmapMemory(mem); + } // We can also actually latch our Command Pool reset and its return to the pool of free pools! m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx); From 3adca44133c70815bea718b3c925197c7ff52f63 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sat, 27 Jan 2024 18:38:46 -0300 Subject: [PATCH 05/13] Fix vulkan_1_3 incompatibilities --- 66_PropertyPools/main.cpp | 141 +++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 61 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index e1ab9d7b3..d3d9822cd 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -44,8 +44,7 @@ class WindowedApplication : public virtual BasicMultiQueueApplication video::SPhysicalDeviceFilter deviceFilter = {}; const auto surfaces = getSurfaces(); - deviceFilter.requiredSurfaceCompatibilities = surfaces.data(); - deviceFilter.requiredSurfaceCompatibilitiesCount = surfaces.size(); + deviceFilter.requiredSurfaceCompatibilities = { surfaces.data(), surfaces.size() }; return deviceFilter(physicalDevices); } @@ -210,8 +209,10 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools. smart_refctd_ptr m_poolCache; - // We'll run the iterations in reverse, easier to write "keep running" - uint32_t m_iteration = 200; + // This example really lets the advantages of a timeline semaphore shine through! 
+ smart_refctd_ptr m_timeline; + uint64_t m_iteration = 0; + constexpr static inline uint64_t MaxIterations = 200; static constexpr uint64_t TransfersAmount = 1024; static constexpr uint64_t MaxValuesPerTransfer = 512; @@ -234,21 +235,21 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat auto createBuffer = [&](uint64_t size, core::bitflag flags, const char* name, bool hostVisible) { - video::IGPUBuffer::SCreationParams creationParams; - creationParams.size = ((size + 3) / 4) * 4; // Align - creationParams.usage = flags - | asset::IBuffer::EUF_STORAGE_BUFFER_BIT - | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT - | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; - - auto buffer = m_device->createBuffer(std::move(creationParams)); - nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); - if (hostVisible) - reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); - m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); - buffer->setObjectDebugName(name); - - return buffer; + video::IGPUBuffer::SCreationParams creationParams; + creationParams.size = ((size + 3) / 4) * 4; // Align + creationParams.usage = flags + | asset::IBuffer::EUF_STORAGE_BUFFER_BIT + | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT + | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; + + auto buffer = m_device->createBuffer(std::move(creationParams)); + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); + if (hostVisible) + reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + buffer->setObjectDebugName(name); + + return buffer; }; m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", false); @@ -260,30 +261,25 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat m_data.push_back(i); // this time we load a shader directly from a file - smart_refctd_ptr shader; + smart_refctd_ptr shader; { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = m_logger.get(); lp.workingDirectory = ""; // virtual root - auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp); const auto assets = assetBundle.getContents(); if (assets.empty()) return logFail("Could not load shader!"); // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto source = IAsset::castDown(assets[0]); + auto source = IAsset::castDown(assets[0]); // The down-cast should not fail! 
assert(source); - IGPUObjectFromAssetConverter::SParams conversionParams = {}; - conversionParams.device = m_device.get(); - conversionParams.assetManager = m_assetMgr.get(); - created_gpu_object_array convertedGPUObjects = std::make_unique()->getGPUObjectsFromAssets(&source,&source+1,conversionParams); - if (convertedGPUObjects->empty() || !convertedGPUObjects->front()) - return logFail("Conversion of a CPU Specialized Shader to GPU failed!"); - - shader = convertedGPUObjects->front(); + // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple + shader = m_device->createShader(source.get()); + if (!shader) + return logFail("Creation of a GPU Shader to from CPU Shader source failed!"); } // The StreamingTransientDataBuffers are actually composed on top of another useful utility called `CAsyncSingleBufferSubAllocator` @@ -296,8 +292,8 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat return logFail("Failed to create Utilities!"); m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); - m_upStreamingBufferAddress = m_device->getBufferDeviceAddress(m_upStreamingBuffer->getBuffer()); - m_downStreamingBufferAddress = m_device->getBufferDeviceAddress(m_downStreamingBuffer->getBuffer()); + m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); + m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); // People love Reflection but I prefer Shader Sources instead! const nbl::asset::SPushConstantRange pcRange = {.stageFlags=IShader::ESS_COMPUTE,.offset=0,.size=sizeof(PushConstantData)}; @@ -307,7 +303,14 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic // only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding. // Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size. - m_pipeline = m_device->createComputePipeline(nullptr,m_device->createPipelineLayout(&pcRange,&pcRange+1),std::move(shader)); + { + auto layout = m_device->createPipelineLayout({&pcRange,1}); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = shader.get(); + if (!m_device->createComputePipelines(nullptr,{¶ms,1},&m_pipeline)) + return logFail("Failed to create compute pipeline!\n"); + } const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices @@ -321,9 +324,12 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are // the amount of memory in the streaming buffers and the number of commandpools we can use simultaenously. 
constexpr auto MaxConcurrency = 64; + // Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag - m_poolCache = make_smart_refctd_ptr(m_device.get(),getComputeQueue()->getFamilyIndex(), IGPUCommandPool::ECF_NONE, MaxConcurrency); + m_poolCache = ICommandPoolCache::create(core::smart_refctd_ptr(m_device),getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::NONE,MaxConcurrency); + // In contrast to fences, we just need one semaphore to rule all dispatches + m_timeline = m_device->createSemaphore(m_iteration); return true; } @@ -334,7 +340,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat void workLoopBody() override { m_iteration--; - IGPUQueue* const queue = getComputeQueue(); + IQueue* const queue = getComputeQueue(); // Obtain our command pool once one gets recycled uint32_t poolIx; @@ -345,9 +351,9 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat smart_refctd_ptr cmdbuf; { - m_device->createCommandBuffers(m_poolCache->getPool(poolIx),IGPUCommandBuffer::EL_PRIMARY,1,&cmdbuf); + m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger)); // lets record, its still a one time submit because we have to re-record with different push constants each time - cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); cmdbuf->bindComputePipeline(m_pipeline.get()); // COMMAND RECORDING @@ -355,7 +361,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat uint32_t maxUpload = 65536; for (uint32_t offset = 0; offset < dataSize; offset += maxUpload) { - cmdbuf->updateBuffer(m_transferSrcBuffer.get(), offset, maxUpload, &m_data[offset / sizeof(uint16_t)]); + cmdbuf->updateBuffer({ offset, maxUpload, core::smart_refctd_ptr(m_transferSrcBuffer) }, &m_data[offset / sizeof(uint16_t)]); } CPropertyPoolHandler::TransferRequest transferRequest; transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; @@ -363,7 +369,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat transferRequest.elementCount = 1; transferRequest.buffer = asset::SBufferBinding { 0, core::smart_refctd_ptr(m_transferDstBuffer) }; - m_propertyPoolHandler->transferProperties(cmdbuf.get(), nullptr, + m_propertyPoolHandler->transferProperties(cmdbuf.get(), asset::SBufferBinding{0, core::smart_refctd_ptr(m_scratchBuffer)}, asset::SBufferBinding{0, core::smart_refctd_ptr(m_addressBuffer)}, &transferRequest, &transferRequest + 1, @@ -373,36 +379,49 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat cmdbuf->end(); } - // TODO: redo with a single timeline semaphore - auto fence = m_device->createFence(IGPUFence::ECF_UNSIGNALED); + + const auto savedIterNum = m_iteration++; { - IGPUQueue::SSubmitInfo submitInfo = {}; - submitInfo.commandBufferCount = 1; - submitInfo.commandBuffers = &cmdbuf.get(); + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = + { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = + { + .semaphore = m_timeline.get(), + .value = m_iteration, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + // Generally speaking we don't need to wait on any semaphore because in this example every dispatch gets its own 
clean piece of memory to use + // from the point of view of the GPU. Implicit domain operations between Host and Device happen upon a submit and a semaphore/fence signal operation, + // this ensures we can touch the input and get accurate values from the output memory using the CPU before and after respectively, each submit becoming PENDING. + // If we actually cared about this submit seeing the memory accesses of a previous dispatch we could add a semaphore wait + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {}, + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signalInfo,1} + }; queue->startCapture(); - queue->submit(1u,&submitInfo,fence.get()); + queue->submit({ &submitInfo,1 }); queue->endCapture(); } { - // Readback ds - auto mem = m_transferDstBuffer->getBoundMemory(); - assert(mem->isMappable()); - auto ptr = m_device->mapMemory(nbl::video::IDeviceMemoryAllocation::MappedMemoryRange(mem, 0, mem->getAllocationSize()), video::IDeviceMemoryAllocation::EMCAF_READ); - auto uint16_t_ptr = static_cast(ptr); - - for (uint32_t i = 0; i < 128; i++) - { - uint16_t value = uint16_t_ptr[i]; - std::printf("%i, ", value); - } - std::printf("\n"); - m_device->unmapMemory(mem); + //// Readback ds + //auto mem = m_transferDstBuffer->getBoundMemory(); + //assert(mem->isMappable()); + //auto ptr = m_device->mapMemory(nbl::video::IDeviceMemoryAllocation::MappedMemoryRange(mem, 0, mem->getAllocationSize()), video::IDeviceMemoryAllocation::EMCAF_READ); + //auto uint16_t_ptr = static_cast(ptr); + + //for (uint32_t i = 0; i < 128; i++) + //{ + // uint16_t value = uint16_t_ptr[i]; + // std::printf("%i, ", value); + //} + //std::printf("\n"); + //m_device->unmapMemory(mem); } - - // We can also actually latch our Command Pool reset and its return to the pool of free pools! 
- m_poolCache->releaseSet(m_device.get(),smart_refctd_ptr(fence),poolIx); } bool onAppTerminated() override From e8e512f027614057749fd6ff483c8e98be407a15 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sat, 27 Jan 2024 21:35:00 -0300 Subject: [PATCH 06/13] Update property pool example for vulkan_1_3 --- 66_PropertyPools/main.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index d3d9822cd..ff1e47b77 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -334,12 +334,11 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat } // Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script) - bool keepRunning() override { return m_iteration; } + bool keepRunning() override { return m_iterationgetBoundMemory(); - //assert(mem->isMappable()); - //auto ptr = m_device->mapMemory(nbl::video::IDeviceMemoryAllocation::MappedMemoryRange(mem, 0, mem->getAllocationSize()), video::IDeviceMemoryAllocation::EMCAF_READ); - //auto uint16_t_ptr = static_cast(ptr); - - //for (uint32_t i = 0; i < 128; i++) - //{ - // uint16_t value = uint16_t_ptr[i]; - // std::printf("%i, ", value); - //} - //std::printf("\n"); - //m_device->unmapMemory(mem); + // Readback ds + auto mem = m_transferDstBuffer->getBoundMemory(); + void* ptr = mem.memory->map({ mem.offset, mem.memory->getAllocationSize() }); + + auto uint16_t_ptr = reinterpret_cast(ptr); + + for (uint32_t i = 0; i < 128; i++) + { + uint16_t value = uint16_t_ptr[i]; + std::printf("%i, ", value); + } + std::printf("\n"); + bool success = mem.memory->unmap(); + assert(success); } } From f8340306a6a29089fce5a9a106bca20a5ad336c1 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Mon, 29 Jan 2024 15:12:00 -0300 Subject: [PATCH 07/13] WIP testing --- 66_PropertyPools/main.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index ff1e47b77..5230ae552 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -252,7 +252,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat return buffer; }; - m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", false); + m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", true); m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_addressBuffer", false); m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_transferSrcBuffer", false); m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_transferDstBuffer", true); @@ -408,17 +408,18 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat { // Readback ds - auto mem = m_transferDstBuffer->getBoundMemory(); + auto mem = m_scratchBuffer->getBoundMemory(); void* ptr = mem.memory->map({ mem.offset, mem.memory->getAllocationSize() }); - auto uint16_t_ptr = reinterpret_cast(ptr); - - for (uint32_t i = 0; i 
< 128; i++) + for (uint32_t i = 0; i < sizeof(nbl::hlsl::property_pools::TransferRequest) * 10; i++) { - uint16_t value = uint16_t_ptr[i]; + uint16_t value = reinterpret_cast(ptr)[i]; std::printf("%i, ", value); } std::printf("\n"); + std::printf("should be %I64i: %I64i\n", m_transferSrcBuffer->getDeviceAddress(), reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); + std::printf("should be %I64i: %I64i\n", m_transferDstBuffer->getDeviceAddress(), reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); + std::printf("should be 3: %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 5)[0]); bool success = mem.memory->unmap(); assert(success); } From 9682dee73e84b105b0df8a09504cdbea7532a312 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Mon, 12 Feb 2024 17:44:06 -0300 Subject: [PATCH 08/13] WIP suballocated descriptor set --- 66_PropertyPools/main.cpp | 42 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index 5230ae552..443979b02 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -4,6 +4,7 @@ #include "nbl/video/surface/CSurfaceVulkan.h" +#include "nbl/video/alloc/SubAllocatedDescriptorSet.h" #include "../common/BasicMultiQueueApplication.hpp" #include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" @@ -202,6 +203,8 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat smart_refctd_ptr m_transferDstBuffer; std::vector m_data; + smart_refctd_ptr>> m_subAllocDescriptorSet; + // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) uint32_t m_alignment; @@ -217,6 +220,10 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat static constexpr uint64_t TransfersAmount = 1024; static constexpr uint64_t MaxValuesPerTransfer = 512; + constexpr static inline uint32_t maxDescriptorSetAllocationAlignment = 64u*1024u; // if you need larger alignments then you're not right in the head + constexpr static inline uint32_t minDescriptorSetAllocationSize = 1u; + + public: // Yay thanks to multiple inheritance we cannot forward ctors anymore PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : @@ -225,6 +232,8 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // we stuff all our work here because its a "single shot" app bool onAppInitialized(smart_refctd_ptr&& system) override { + using nbl::video::IGPUDescriptorSetLayout; + // Remember to call the base class initialization! 
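A small portability note on the %I64i diagnostics in the hunk above: that length prefix is MSVC specific. If these debug prints ever need to compile elsewhere, the <cinttypes> macros express the same width; a possible equivalent, where the srcAddr label and hex formatting are only illustrative:

	#include <cinttypes> // PRIx64 / PRIu64

	std::printf("srcAddr 0x%" PRIx64 " (low 3 bits: %" PRIu64 ")\n",
		m_transferSrcBuffer->getDeviceAddress(),
		m_transferSrcBuffer->getDeviceAddress() & 7ull);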
if (!device_base_t::onAppInitialized(std::move(system))) return false; @@ -330,6 +339,35 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // In contrast to fences, we just need one semaphore to rule all dispatches m_timeline = m_device->createSemaphore(m_iteration); + + + // Descriptor set sub allocator + + video::IGPUDescriptorSetLayout::SBinding bindings[1]; + { + bindings[0].binding = 0; + bindings[0].count = 65535u; + bindings[0].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) + | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT + | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT; + bindings[0].type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE; + bindings[0].stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE; + } + + std::span bindingsSpan(bindings); + + // TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1) + auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr>>( + bindings, maxDescriptorSetAllocationAlignment, minDescriptorSetAllocationSize + ); + + uint32_t allocation = -1; + uint32_t size = 10; + uint32_t alignment = 1; + subAllocatedDescriptorSet->multi_allocate(1, &allocation, &size, &alignment); + m_logger->log("Allocation: %d\n", system::ILogger::ELL_ERROR, allocation); + assert(allocation); + return true; } @@ -417,8 +455,8 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat std::printf("%i, ", value); } std::printf("\n"); - std::printf("should be %I64i: %I64i\n", m_transferSrcBuffer->getDeviceAddress(), reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); - std::printf("should be %I64i: %I64i\n", m_transferDstBuffer->getDeviceAddress(), reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); + std::printf("should be %I64i (alignment: %I64i): %I64i\n", m_transferSrcBuffer->getDeviceAddress(), m_transferSrcBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); + std::printf("should be %I64i (alignment: %I64i): %I64i\n", m_transferDstBuffer->getDeviceAddress(), m_transferDstBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); std::printf("should be 3: %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 5)[0]); bool success = mem.memory->unmap(); assert(success); From 48be8e8350826ef75142eb92d07c17c472337fa4 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Tue, 13 Feb 2024 10:23:49 -0300 Subject: [PATCH 09/13] Testing sub allocator descriptor set allocations --- 66_PropertyPools/main.cpp | 48 +++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index 443979b02..b6b82f754 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -346,7 +346,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat video::IGPUDescriptorSetLayout::SBinding bindings[1]; { bindings[0].binding = 0; - bindings[0].count = 65535u; + bindings[0].count = 65536u; bindings[0].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT; @@ -361,12 +361,46 @@ class PropertyPoolsApp final : public 
examples::SingleNonResizableWindowApplicat bindings, maxDescriptorSetAllocationAlignment, minDescriptorSetAllocationSize ); - uint32_t allocation = -1; - uint32_t size = 10; - uint32_t alignment = 1; - subAllocatedDescriptorSet->multi_allocate(1, &allocation, &size, &alignment); - m_logger->log("Allocation: %d\n", system::ILogger::ELL_ERROR, allocation); - assert(allocation); + std::vector allocation, size; + { + for (uint32_t i = 0; i < 512; i++) + { + allocation.push_back(core::GeneralpurposeAddressAllocator::invalid_address); + size.push_back(4); + } + subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]); + for (uint32_t i = 0; i < allocation.size(); i++) + { + m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]); + assert(allocation[i] != core::GeneralpurposeAddressAllocator::invalid_address); + } + } + { + std::vector addr, freeSize; + for (uint32_t i = 0; i < 512; i+=2) + { + addr.push_back(allocation[i]); + freeSize.push_back(4); + } + subAllocatedDescriptorSet->multi_deallocate(addr.size(), &addr[0], &freeSize[0]); + } + + m_logger->log("Freed some allocations", system::ILogger::ELL_INFO); + allocation.clear(); + size.clear(); + { + for (uint32_t i = 0; i < 512; i++) + { + allocation.push_back(core::GeneralpurposeAddressAllocator::invalid_address); + size.push_back(2); + } + subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]); + for (uint32_t i = 0; i < allocation.size(); i++) + { + m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]); + assert(allocation[i] != core::GeneralpurposeAddressAllocator::invalid_address); + } + } return true; } From 7bc9f35bf054711e92f0fef1ed4d4df5f62bcb31 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 14 Feb 2024 16:31:40 -0300 Subject: [PATCH 10/13] Work on property pool example fixes --- 66_PropertyPools/main.cpp | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index b6b82f754..dc16dfeae 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -436,9 +436,11 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat } CPropertyPoolHandler::TransferRequest transferRequest; transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; - transferRequest.elementSize = m_data.size(); - transferRequest.elementCount = 1; + transferRequest.elementSize = 1; + transferRequest.elementCount = m_data.size(); transferRequest.buffer = asset::SBufferBinding { 0, core::smart_refctd_ptr(m_transferDstBuffer) }; + transferRequest.srcAddressesOffset = IPropertyPool::invalid; + transferRequest.dstAddressesOffset = IPropertyPool::invalid; m_propertyPoolHandler->transferProperties(cmdbuf.get(), asset::SBufferBinding{0, core::smart_refctd_ptr(m_scratchBuffer)}, @@ -447,7 +449,8 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat m_logger.get(), 0, MaxValuesPerTransfer ); - cmdbuf->end(); + auto result = cmdbuf->end(); + assert(result); } @@ -474,13 +477,18 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat }; queue->startCapture(); - queue->submit({ &submitInfo,1 }); + auto statusCode = queue->submit({ &submitInfo,1 }); queue->endCapture(); + assert(statusCode == IQueue::RESULT::SUCCESS); } { + ISemaphore::SWaitInfo infos[1] = 
{{.semaphore=m_timeline.get(),.value=m_iteration}}; + m_device->blockForSemaphores(infos); // Readback ds - auto mem = m_scratchBuffer->getBoundMemory(); + // TODO: This should readback the m_transferDstBuffer instead + // (we'll read back the destination buffer and check that copy went through as expected) + auto mem = m_transferDstBuffer->getBoundMemory(); // Scratch buffer has the transfer requests void* ptr = mem.memory->map({ mem.offset, mem.memory->getAllocationSize() }); for (uint32_t i = 0; i < sizeof(nbl::hlsl::property_pools::TransferRequest) * 10; i++) @@ -489,9 +497,15 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat std::printf("%i, ", value); } std::printf("\n"); - std::printf("should be %I64i (alignment: %I64i): %I64i\n", m_transferSrcBuffer->getDeviceAddress(), m_transferSrcBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); - std::printf("should be %I64i (alignment: %I64i): %I64i\n", m_transferDstBuffer->getDeviceAddress(), m_transferDstBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); - std::printf("should be 3: %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 5)[0]); + //std::printf("srcAddr %I64i (alignment: %I64i): %I64i\n", m_transferSrcBuffer->getDeviceAddress(), m_transferSrcBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); + //std::printf("dstAddr %I64i (alignment: %I64i): %I64i\n", m_transferDstBuffer->getDeviceAddress(), m_transferDstBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); + //std::printf("srcIndexAddr %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 5)[0]); + //std::printf("dstIndexAddr %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 6)[0]); + //std::printf("elementCount %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 7)[0]); + //std::printf("propertySize %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 8)[0]); + //std::printf("fill %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 9)[0]); + //std::printf("srcIndexSizeLog2 %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 10)[0]); + //std::printf("dstIndexSizeLog2 %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 11)[0]); bool success = mem.memory->unmap(); assert(success); } From 102aa472c52581ce1297c234df7ac3f73c74c7cf Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 14 Feb 2024 22:26:00 -0300 Subject: [PATCH 11/13] WIP example --- 66_PropertyPools/main.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index dc16dfeae..f17d7cf58 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -446,7 +446,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat asset::SBufferBinding{0, core::smart_refctd_ptr(m_scratchBuffer)}, asset::SBufferBinding{0, core::smart_refctd_ptr(m_addressBuffer)}, &transferRequest, &transferRequest + 1, - m_logger.get(), 0, MaxValuesPerTransfer + m_logger.get(), 0, m_data.size() ); auto result = cmdbuf->end(); @@ -485,27 +485,21 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat { ISemaphore::SWaitInfo infos[1] = {{.semaphore=m_timeline.get(),.value=m_iteration}}; m_device->blockForSemaphores(infos); + // Readback ds - // TODO: This should readback the m_transferDstBuffer instead // (we'll read back the destination buffer and check that copy went through as expected) auto mem = 
m_transferDstBuffer->getBoundMemory(); // Scratch buffer has the transfer requests void* ptr = mem.memory->map({ mem.offset, mem.memory->getAllocationSize() }); - for (uint32_t i = 0; i < sizeof(nbl::hlsl::property_pools::TransferRequest) * 10; i++) + for (uint32_t i = 0; i < 1024; /*m_data.size();*/ i++) { - uint16_t value = reinterpret_cast(ptr)[i]; - std::printf("%i, ", value); + uint16_t expected = reinterpret_cast(ptr)[i]; + uint16_t actual = m_data[i]; + std::printf("%i, ", expected); + //assert(expected == actual); } std::printf("\n"); - //std::printf("srcAddr %I64i (alignment: %I64i): %I64i\n", m_transferSrcBuffer->getDeviceAddress(), m_transferSrcBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 3)[0]); - //std::printf("dstAddr %I64i (alignment: %I64i): %I64i\n", m_transferDstBuffer->getDeviceAddress(), m_transferDstBuffer->getDeviceAddress() & 7, reinterpret_cast(reinterpret_cast(ptr) + 40 * 4)[0]); - //std::printf("srcIndexAddr %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 5)[0]); - //std::printf("dstIndexAddr %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 6)[0]); - //std::printf("elementCount %I64i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 7)[0]); - //std::printf("propertySize %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 8)[0]); - //std::printf("fill %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 9)[0]); - //std::printf("srcIndexSizeLog2 %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 10)[0]); - //std::printf("dstIndexSizeLog2 %i\n", reinterpret_cast(reinterpret_cast(ptr) + 40 * 11)[0]); + _NBL_DEBUG_BREAK_IF(true); bool success = mem.memory->unmap(); assert(success); } From ac178253475bf4a7fd172d6cb54d39894c847822 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Fri, 16 Feb 2024 15:46:20 -0300 Subject: [PATCH 12/13] Remove unused things from example 05 --- 66_PropertyPools/app_resources/common.hlsl | 2 - 66_PropertyPools/main.cpp | 187 +-------------------- 2 files changed, 3 insertions(+), 186 deletions(-) diff --git a/66_PropertyPools/app_resources/common.hlsl b/66_PropertyPools/app_resources/common.hlsl index 6f339aa13..456dc6740 100644 --- a/66_PropertyPools/app_resources/common.hlsl +++ b/66_PropertyPools/app_resources/common.hlsl @@ -16,7 +16,5 @@ struct PushConstantData NBL_CONSTEXPR uint32_t WorkgroupSize = 256; -#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" - // Yes we do have our own re-creation of C++'s STL in HLSL2021 ! #include "nbl/builtin/hlsl/limits.hlsl" \ No newline at end of file diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index f17d7cf58..c69a6abef 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -158,55 +158,12 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat using device_base_t = examples::MonoDeviceApplication; using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; - // This is the first example that submits multiple workloads in-flight. - // What the shader does is it computes the minimum distance of each point against K other random input points. - // Having the GPU randomly access parts of the buffer requires it to be DEVICE_LOCAL for performance. - // Then the CPU downloads the results and finds the median minimum distance via quick-select. - // This bizzare synthetic workload was specifically chosen for its unfriendliness towards simple buffer usage. 
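A naming nit on the verification loop introduced a few hunks up: by convention, expected would be the CPU side source data and actual the value read back from the GPU, which is the reverse of how the patch names them. A clearer form of the same check could be the following, where the pointer type in the cast is an assumption mirroring the uint16_t element data:

	const uint16_t expected = m_data[i];                                  // what the CPU uploaded
	const uint16_t actual   = reinterpret_cast<const uint16_t*>(ptr)[i]; // what the GPU transfer wrote
	assert(actual == expected);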
- // The fact we have variable sized workloads and run them in a loop means we either have to dynamically - // suballocate from a single buffer or have K worst-case sized buffers we round robin for K-workloads in flight. - // Creating and destroying buffers at runtime is not an option as those are very expensive operations. - // Also since CPU needs to heapify the outputs, we need to have the GPU write them into RAM not VRAM. - smart_refctd_ptr m_pipeline; - - // The Utility class has lots of methods to handle staging without relying on ReBAR or EXT_host_image_copy as well as more complex methods we'll cover later. - // Until EXT_host_image_copy becomes ubiquitous across all Nabla Core Profile devices, you need to stage image copies from an IGPUBuffer to an IGPUImage. - // Why use Staging for buffers in the age of ReBAR? While GPU workloads overlap the CPU, individual GPU workloads's execution might not overlap each other - // but their data might. In this case you want to "precisely" time the data update on the GPU timeline between the end and start of a workload. - // For very small updates you could use the commandbuffer updateBuffer method, but it has a size limit and the data enqueued takes up space in the commandpool. - // Sometimes it might be unfeasible to either have multiple copies or update references to those copies without a cascade update. - // One example is the transformation graph of nodes in a scene, where a copy-on-write of a node would require the update the offset/pointer held by - // any other node that refers to it. This quickly turns into a cascade that would force you to basically create a full copy of the entire data structure - // after most updates. Whereas with staging you'd "queue up" the much smaller set of updates to apply between each computation step which uses the graph. - // Another example are UBO and SSBO bindings, where once you run out of dynamic bindings, you can no longer easily change offsets without introducting extra indirection in shaders. - // Actually staging can help you re-use a commandbuffer because you don't need to re-record it if you don't need to change the offsets at which you bind! - // Finally ReBAR is a precious resource, my 8GB RTX 3070 only reports a 214MB Heap backing HOST_VISIBLE and DEVICE_LOCAL device local memory type. - smart_refctd_ptr m_utils; - - // We call them downstreaming and upstreaming, simply by how we used them so far. - // Meaning that upstreaming is uncached and usually ReBAR (DEVICE_LOCAL), for simple memcpy like sequential writes. - // While the downstreaming is CACHED and not DEVICE_LOCAL for fast random acess by the CPU. - // However there are cases when you'd want to use a buffer with flags identical to the default downstreaming buffer for uploads, - // such cases is when a CPU needs to build a data-structure in-place (due to memory constraints) before GPU accesses it, - // one example are Host Acceleration Structure builds (BVH building requires lots of repeated memory accesses). - // When choosing the memory properties of a mapped buffer consider which processor (CPU or GPU) needs faster access in event of a cache-miss. 
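In plain Vulkan terms, the upstreaming versus downstreaming distinction drawn above mostly comes down to which memory property flags the backing allocation requests; a rough mapping using the standard flags rather than the Nabla wrappers (assumes the usual vulkan_core.h header):

	#include <vulkan/vulkan_core.h>

	// Upload (upstreaming): CPU writes sequentially, GPU reads; coherent, ideally ReBAR device-local.
	const VkMemoryPropertyFlags uploadFlags   = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
	                                          | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
	                                          | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
	// Readback (downstreaming): GPU writes, CPU reads back at random, so cached host pages matter most.
	const VkMemoryPropertyFlags readbackFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
	                                          | VK_MEMORY_PROPERTY_HOST_CACHED_BIT;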
- nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer; - StreamingTransientDataBufferMT<>* m_downStreamingBuffer; - // These are Buffer Device Addresses - uint64_t m_upStreamingBufferAddress; - uint64_t m_downStreamingBufferAddress; - smart_refctd_ptr m_propertyPoolHandler; smart_refctd_ptr m_scratchBuffer; smart_refctd_ptr m_addressBuffer; smart_refctd_ptr m_transferSrcBuffer; smart_refctd_ptr m_transferDstBuffer; std::vector m_data; - - smart_refctd_ptr>> m_subAllocDescriptorSet; - - // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) - uint32_t m_alignment; // The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished. // Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools. @@ -220,9 +177,6 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat static constexpr uint64_t TransfersAmount = 1024; static constexpr uint64_t MaxValuesPerTransfer = 512; - constexpr static inline uint32_t maxDescriptorSetAllocationAlignment = 64u*1024u; // if you need larger alignments then you're not right in the head - constexpr static inline uint32_t minDescriptorSetAllocationSize = 1u; - public: // Yay thanks to multiple inheritance we cannot forward ctors anymore @@ -269,67 +223,6 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat for (uint16_t i = 0; i < uint16_t((uint32_t(1) << 16) - 1); i++) m_data.push_back(i); - // this time we load a shader directly from a file - smart_refctd_ptr shader; - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = ""; // virtual root - auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl",lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - return logFail("Could not load shader!"); - - // lets go straight from ICPUSpecializedShader to IGPUSpecializedShader - auto source = IAsset::castDown(assets[0]); - // The down-cast should not fail! - assert(source); - - // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple - shader = m_device->createShader(source.get()); - if (!shader) - return logFail("Creation of a GPU Shader to from CPU Shader source failed!"); - } - - // The StreamingTransientDataBuffers are actually composed on top of another useful utility called `CAsyncSingleBufferSubAllocator` - // The difference is that the streaming ones are made on top of ranges of `IGPUBuffer`s backed by mappable memory, whereas the - // `CAsyncSingleBufferSubAllocator` just allows you suballocate subranges of any `IGPUBuffer` range with deferred/latched frees. 
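The deferred or latched frees mentioned above are easy to picture with a toy wrapper: a retired range only goes back to the address allocator once the timeline semaphore has passed the value that was still pending when the range was given up. This is an illustration of the idea only, not the CAsyncSingleBufferSubAllocator API, and freeBlock is a hypothetical stand-in for whatever recycles the range:

	#include <cstdint>
	#include <deque>

	void freeBlock(uint32_t offset, uint32_t size); // hypothetical: returns the range to the address allocator

	struct DeferredFree { uint64_t timelineValue; uint32_t offset; uint32_t size; };
	std::deque<DeferredFree> retired;

	// Call with the counter value the timeline semaphore has actually reached.
	void collectRetired(const uint64_t reachedValue)
	{
		while (!retired.empty() && retired.front().timelineValue <= reachedValue)
		{
			freeBlock(retired.front().offset, retired.front().size);
			retired.pop_front();
		}
	}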
- constexpr uint32_t DownstreamBufferSize = sizeof(output_t)<<24; - constexpr uint32_t UpstreamBufferSize = sizeof(input_t)<<24; - m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device),smart_refctd_ptr(m_logger),DownstreamBufferSize,UpstreamBufferSize); - if (!m_utils) - return logFail("Failed to create Utilities!"); - m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); - m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); - m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); - m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); - - // People love Reflection but I prefer Shader Sources instead! - const nbl::asset::SPushConstantRange pcRange = {.stageFlags=IShader::ESS_COMPUTE,.offset=0,.size=sizeof(PushConstantData)}; - - // This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size - // and using traditional SSBO bindings would force us to update the Descriptor Set every frame. - // I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic - // only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding. - // Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size. - { - auto layout = m_device->createPipelineLayout({&pcRange,1}); - IGPUComputePipeline::SCreationParams params = {}; - params.layout = layout.get(); - params.shader.shader = shader.get(); - if (!m_device->createComputePipelines(nullptr,{¶ms,1},&m_pipeline)) - return logFail("Failed to create compute pipeline!\n"); - } - - const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); - // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices - // which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets. - // Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc. - // there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those. - // We'll align to max of coherent atom size even if the memory is coherent, - // and we also need to take into account BDA shader loads need to be aligned to the type being loaded. - m_alignment = core::max(deviceLimits.nonCoherentAtomSize,alignof(float)); - // We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are // the amount of memory in the streaming buffers and the number of commandpools we can use simultaenously. 
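One way to read the MaxConcurrency constant that follows: before recording a new iteration, the CPU only needs to be sure the iteration that used the same pool MaxConcurrency submits ago has retired, and a single timeline semaphore can express that back pressure with the same blockForSemaphores call the readback hunks above use. A sketch, with the exact arithmetic between m_iteration and the signalled value treated as an assumption:

	if (m_iteration > MaxConcurrency)
	{
		const ISemaphore::SWaitInfo backpressure[] = {{
			.semaphore = m_timeline.get(),
			.value = m_iteration - MaxConcurrency // roughly the value signalled MaxConcurrency submits ago
		}};
		m_device->blockForSemaphores(backpressure);
	}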
constexpr auto MaxConcurrency = 64; @@ -339,69 +232,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat // In contrast to fences, we just need one semaphore to rule all dispatches m_timeline = m_device->createSemaphore(m_iteration); - - - // Descriptor set sub allocator - - video::IGPUDescriptorSetLayout::SBinding bindings[1]; - { - bindings[0].binding = 0; - bindings[0].count = 65536u; - bindings[0].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT) - | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT - | IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT; - bindings[0].type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE; - bindings[0].stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE; - } - - std::span bindingsSpan(bindings); - - // TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1) - auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr>>( - bindings, maxDescriptorSetAllocationAlignment, minDescriptorSetAllocationSize - ); - - std::vector allocation, size; - { - for (uint32_t i = 0; i < 512; i++) - { - allocation.push_back(core::GeneralpurposeAddressAllocator::invalid_address); - size.push_back(4); - } - subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]); - for (uint32_t i = 0; i < allocation.size(); i++) - { - m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]); - assert(allocation[i] != core::GeneralpurposeAddressAllocator::invalid_address); - } - } - { - std::vector addr, freeSize; - for (uint32_t i = 0; i < 512; i+=2) - { - addr.push_back(allocation[i]); - freeSize.push_back(4); - } - subAllocatedDescriptorSet->multi_deallocate(addr.size(), &addr[0], &freeSize[0]); - } - - m_logger->log("Freed some allocations", system::ILogger::ELL_INFO); - allocation.clear(); - size.clear(); - { - for (uint32_t i = 0; i < 512; i++) - { - allocation.push_back(core::GeneralpurposeAddressAllocator::invalid_address); - size.push_back(2); - } - subAllocatedDescriptorSet->multi_allocate(allocation.size(), &allocation[0], &size[0]); - for (uint32_t i = 0; i < allocation.size(); i++) - { - m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, allocation[i]); - assert(allocation[i] != core::GeneralpurposeAddressAllocator::invalid_address); - } - } - + return true; } @@ -425,7 +256,6 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger)); // lets record, its still a one time submit because we have to re-record with different push constants each time cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdbuf->bindComputePipeline(m_pipeline.get()); // COMMAND RECORDING uint32_t dataSize = (((sizeof(uint16_t) * m_data.size()) + 3) / 4) * 4; @@ -437,7 +267,7 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat CPropertyPoolHandler::TransferRequest transferRequest; transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; transferRequest.elementSize = 1; - transferRequest.elementCount = m_data.size(); + transferRequest.elementCount = (m_data.size() * sizeof(uint16_t)) / sizeof(uint32_t); transferRequest.buffer = asset::SBufferBinding 
{ 0, core::smart_refctd_ptr(m_transferDstBuffer) }; transferRequest.srcAddressesOffset = IPropertyPool::invalid; transferRequest.dstAddressesOffset = IPropertyPool::invalid; @@ -496,24 +326,13 @@ class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplicat uint16_t expected = reinterpret_cast(ptr)[i]; uint16_t actual = m_data[i]; std::printf("%i, ", expected); - //assert(expected == actual); + assert(expected == actual); } std::printf("\n"); - _NBL_DEBUG_BREAK_IF(true); bool success = mem.memory->unmap(); assert(success); } } - - bool onAppTerminated() override - { - // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` - // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) - while (m_downStreamingBuffer->cull_frees()) {} - - return device_base_t::onAppTerminated(); - } }; - NBL_MAIN_FUNC(PropertyPoolsApp) \ No newline at end of file From e7b1f9bc5236f457eaa27c2771d875f4564c95f7 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Fri, 16 Feb 2024 15:48:18 -0300 Subject: [PATCH 13/13] Remove window app stuff --- 66_PropertyPools/main.cpp | 135 +------------------------------------- 1 file changed, 1 insertion(+), 134 deletions(-) diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp index c69a6abef..2e28ca527 100644 --- a/66_PropertyPools/main.cpp +++ b/66_PropertyPools/main.cpp @@ -9,137 +9,6 @@ #include "../common/BasicMultiQueueApplication.hpp" #include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" -namespace nbl::examples -{ - -using namespace nbl; -using namespace core; -using namespace system; -using namespace ui; -using namespace asset; -using namespace video; - -// Virtual Inheritance because apps might end up doing diamond inheritance -class WindowedApplication : public virtual BasicMultiQueueApplication -{ - using base_t = BasicMultiQueueApplication; - - public: - using base_t::base_t; - - virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override - { - auto retval = base_t::getAPIFeaturesToEnable(); - // We only support one swapchain mode, surface, the other one is Display which we have not implemented yet. - retval.swapchainMode = video::E_SWAPCHAIN_MODE::ESM_SURFACE; - return retval; - } - - // New function, we neeed to know about surfaces to create ahead of time - virtual core::vector getSurfaces() const = 0; - - virtual core::set filterDevices(const core::SRange& physicalDevices) const - { - const auto firstFilter = base_t::filterDevices(physicalDevices); - - video::SPhysicalDeviceFilter deviceFilter = {}; - - const auto surfaces = getSurfaces(); - deviceFilter.requiredSurfaceCompatibilities = { surfaces.data(), surfaces.size() }; - - return deviceFilter(physicalDevices); - } - - virtual bool onAppInitialized(smart_refctd_ptr&& system) - { - // Remember to call the base class initialization! - if (!base_t::onAppInitialized(std::move(system))) - return false; - - #ifdef _NBL_PLATFORM_WINDOWS_ - m_winMgr = nbl::ui::IWindowManagerWin32::create(); - #else - #error "Unimplemented!" 
- #endif - } - - core::smart_refctd_ptr m_winMgr; -}; - - -// Before we get onto creating a window, we need to discuss how Nabla handles input, clipboards and cursor control -class IWindowClosedCallback : public virtual nbl::ui::IWindow::IEventCallback -{ - public: - IWindowClosedCallback() : m_gotWindowClosedMsg(false) {} - - // unless you create a separate callback per window, both will "trip" this condition - bool windowGotClosed() const {return m_gotWindowClosedMsg;} - - private: - bool onWindowClosed_impl() override - { - m_gotWindowClosedMsg = true; - return true; - } - - bool m_gotWindowClosedMsg; -}; - -// We inherit from an application that tries to find Graphics and Compute queues -// because applications with presentable images often want to perform Graphics family operations -// Virtual Inheritance because apps might end up doing diamond inheritance -class SingleNonResizableWindowApplication : public virtual WindowedApplication -{ - using base_t = WindowedApplication; - - protected: - virtual IWindow::SCreationParams getWindowCreationParams() const - { - IWindow::SCreationParams params = {}; - params.callback = make_smart_refctd_ptr(); - params.width = 640; - params.height = 480; - params.x = 32; - params.y = 32; - params.flags = IWindow::ECF_NONE; - params.windowCaption = "SingleNonResizableWindowApplication"; - return params; - } - - core::smart_refctd_ptr m_window; - core::smart_refctd_ptr m_surface; - - public: - using base_t::base_t; - - virtual bool onAppInitialized(smart_refctd_ptr&& system) override - { - // Remember to call the base class initialization! - if (!base_t::onAppInitialized(std::move(system))) - return false; - - m_window = m_winMgr->createWindow(getWindowCreationParams()); - m_surface = video::CSurfaceVulkanWin32::create(core::smart_refctd_ptr(m_api),core::smart_refctd_ptr_static_cast(m_window)); - return true; - } - - virtual core::vector getSurfaces() const - { - return {{m_surface.get()/*,EQF_NONE*/}}; - } - - virtual bool keepRunning() override - { - if (!m_window || reinterpret_cast(m_window->getEventCallback())->windowGotClosed()) - return false; - - return true; - } -}; -} - - using namespace nbl; using namespace core; using namespace system; @@ -147,13 +16,11 @@ using namespace ui; using namespace asset; using namespace video; - #include "app_resources/common.hlsl" #include "nbl/builtin/hlsl/bit.hlsl" - // In this application we'll cover buffer streaming, Buffer Device Address (BDA) and push constants -class PropertyPoolsApp final : public examples::SingleNonResizableWindowApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +class PropertyPoolsApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication { using device_base_t = examples::MonoDeviceApplication; using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
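With the windowed keepRunning override removed, loop termination rests on the iteration bound that patch 06 switched to; its shape is just a comparison against an upper limit, where MaxIterations is a hypothetical name standing in for the example's real constant:

	// MaxIterations is a placeholder name, not necessarily what the example calls its bound.
	bool keepRunning() override { return m_iteration < MaxIterations; }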