From f2ea51d0b3e3388c0f9bae03602ec3b1f658c124 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Sun, 23 Mar 2025 19:29:49 -0300 Subject: [PATCH 1/9] Morton code tests --- CMakeLists.txt | 3 +- XX_Mortons/CMakeLists.txt | 24 ++++++++++ XX_Mortons/app_resources/shader.hlsl | 7 +++ XX_Mortons/config.json.template | 28 +++++++++++ XX_Mortons/main.cpp | 69 ++++++++++++++++++++++++++++ XX_Mortons/pipeline.groovy | 50 ++++++++++++++++++++ 6 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 XX_Mortons/CMakeLists.txt create mode 100644 XX_Mortons/app_resources/shader.hlsl create mode 100644 XX_Mortons/config.json.template create mode 100644 XX_Mortons/main.cpp create mode 100644 XX_Mortons/pipeline.groovy diff --git a/CMakeLists.txt b/CMakeLists.txt index fb03f95a4..7fcddfc18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,8 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(67_RayQueryGeometry EXCLUDE_FROM_ALL) add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL) - add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) + add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) + add_subdirectory(XX_Mortons EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif() diff --git a/XX_Mortons/CMakeLists.txt b/XX_Mortons/CMakeLists.txt new file mode 100644 index 000000000..a434ff32a --- /dev/null +++ b/XX_Mortons/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/XX_Mortons/app_resources/shader.hlsl b/XX_Mortons/app_resources/shader.hlsl new file mode 100644 index 000000000..a24a78191 --- /dev/null +++ b/XX_Mortons/app_resources/shader.hlsl @@ -0,0 +1,7 @@ +#include "nbl/builtin/hlsl/math/morton.hlsl" + +[numthreads(512, 1, 1)] +void main(uint32_t3 ID : SV_DispatchThreadID) +{ + printf("%d %d", nbl::hlsl::morton::impl::decode_masks_array::Masks[0], nbl::hlsl::morton::impl::decode_masks_array::Masks[1]); +} \ No newline at end of file diff --git a/XX_Mortons/config.json.template b/XX_Mortons/config.json.template new file mode 100644 index 000000000..717d05d53 --- /dev/null +++ b/XX_Mortons/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", 
"Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/XX_Mortons/main.cpp b/XX_Mortons/main.cpp new file mode 100644 index 000000000..881c84417 --- /dev/null +++ b/XX_Mortons/main.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + + +// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. +#include "nbl/application_templates/MonoDeviceApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +#include "nbl/builtin/hlsl/math/morton.hlsl" +#include + +using namespace nbl; +using namespace core; +using namespace system; +using namespace asset; +using namespace video; + + +// this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play "nice" wil all platforms +class MortonTestApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + + inline core::smart_refctd_ptr createShader( + const char* includeMainName) + { + std::string prelude = "#include \""; + auto CPUShader = core::make_smart_refctd_ptr((prelude + includeMainName + "\"\n").c_str(), IShader::E_SHADER_STAGE::ESS_COMPUTE, IShader::E_CONTENT_TYPE::ECT_HLSL, 
includeMainName); + assert(CPUShader); + return m_device->createShader(CPUShader.get()); + } + public: + MortonTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + // we stuff all our work here because its a "single shot" app + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + createShader("app_resources/shader.hlsl"); + + const auto masksArray = hlsl::morton::impl::decode_masks_array::Masks; + for (auto i = 0u; i < 3; i++) + { + std::cout << std::bitset<32>(masksArray[i]) << std::endl; + } + + return true; + } + + // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" + void workLoopBody() override {} + + // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. 
+ bool keepRunning() override {return false;} + + private: + smart_refctd_ptr m_api; +}; + + +NBL_MAIN_FUNC(MortonTestApp) \ No newline at end of file diff --git a/XX_Mortons/pipeline.groovy b/XX_Mortons/pipeline.groovy new file mode 100644 index 000000000..1a7b043a4 --- /dev/null +++ b/XX_Mortons/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CStreamingAndBufferDeviceAddressBuilder extends IBuilder +{ + public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info) +} + +return this \ No newline at end of file From 8f4e4529ca6f31ace6498cf9ac4284c14dbdf652 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 25 Mar 2025 10:44:31 -0300 Subject: [PATCH 2/9] Morton codes creating properly --- XX_Mortons/app_resources/common.hlsl | 10 ++ XX_Mortons/app_resources/shader.hlsl | 15 +- XX_Mortons/main.cpp | 241 ++++++++++++++++++++++++++- 3 files changed, 259 insertions(+), 7 deletions(-) create mode 100644 XX_Mortons/app_resources/common.hlsl diff --git 
a/XX_Mortons/app_resources/common.hlsl b/XX_Mortons/app_resources/common.hlsl new file mode 100644 index 000000000..3a9fca3fa --- /dev/null +++ b/XX_Mortons/app_resources/common.hlsl @@ -0,0 +1,10 @@ +#include "nbl/builtin/hlsl/math/morton.hlsl" + +NBL_CONSTEXPR uint32_t bufferSize = 256; +using scalar_t = int32_t; +using unsigned_scalar_t = nbl::hlsl::make_unsigned_t; + +struct PushConstantData +{ + uint64_t deviceBufferAddress; +}; \ No newline at end of file diff --git a/XX_Mortons/app_resources/shader.hlsl b/XX_Mortons/app_resources/shader.hlsl index a24a78191..d1f7c967e 100644 --- a/XX_Mortons/app_resources/shader.hlsl +++ b/XX_Mortons/app_resources/shader.hlsl @@ -1,7 +1,16 @@ -#include "nbl/builtin/hlsl/math/morton.hlsl" +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl" -[numthreads(512, 1, 1)] +[[vk::push_constant]] PushConstantData pushConstants; + +using namespace nbl::hlsl; + +[numthreads(bufferSize, 1, 1)] void main(uint32_t3 ID : SV_DispatchThreadID) { - printf("%d %d", nbl::hlsl::morton::impl::decode_masks_array::Masks[0], nbl::hlsl::morton::impl::decode_masks_array::Masks[1]); + LegacyBdaAccessor accessor = LegacyBdaAccessor::create(pushConstants.deviceBufferAddress); + + morton::code foo = morton::code::create(vector(-32768, -1)); + + accessor.set(0, foo.value); } \ No newline at end of file diff --git a/XX_Mortons/main.cpp b/XX_Mortons/main.cpp index 881c84417..860b581d2 100644 --- a/XX_Mortons/main.cpp +++ b/XX_Mortons/main.cpp @@ -7,7 +7,7 @@ #include "nbl/application_templates/MonoDeviceApplication.hpp" #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "nbl/builtin/hlsl/math/morton.hlsl" +#include "app_resources/common.hlsl" #include using namespace nbl; @@ -16,7 +16,6 @@ using namespace system; using namespace asset; using namespace video; - // this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play 
"nice" wil all platforms class MortonTestApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication { @@ -44,14 +43,221 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, if (!asset_base_t::onAppInitialized(std::move(system))) return false; - createShader("app_resources/shader.hlsl"); + auto shader = createShader("app_resources/shader.hlsl"); + + // Create massive upload/download buffers + constexpr uint32_t DownstreamBufferSize = sizeof(unsigned_scalar_t) << 23; + constexpr uint32_t UpstreamBufferSize = sizeof(unsigned_scalar_t) << 23; + + m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize); + if (!m_utils) + return logFail("Failed to create Utilities!"); + m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); + m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); + m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); + m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); + + // Create device-local buffer + { + IGPUBuffer::SCreationParams deviceLocalBufferParams = {}; + + IQueue* const queue = getComputeQueue(); + uint32_t queueFamilyIndex = queue->getFamilyIndex(); + + deviceLocalBufferParams.queueFamilyIndexCount = 1; + deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex; + deviceLocalBufferParams.size = sizeof(unsigned_scalar_t) * bufferSize; + deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + + m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams)); + auto mreqs = m_deviceLocalBuffer->getMemoryReqs(); + mreqs.memoryTypeBits &= 
m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + + m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress(); + } + + const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(PushConstantData) }; + + { + auto layout = m_device->createPipelineLayout({ &pcRange,1 }); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = shader.get(); + params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); + params.shader.requireFullSubgroups = true; + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) + return logFail("Failed to create compute pipeline!\n"); + } + + const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); + // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices + // which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets. + // Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc. + // there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those. + // We'll align to max of coherent atom size even if the memory is coherent, + // and we also need to take into account BDA shader loads need to be aligned to the type being loaded. 
+ m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float)); + + // Semaphor used here to know the FFT is done before download + m_timeline = m_device->createSemaphore(semaphorValue); + + IQueue* const queue = getComputeQueue(); + + const uint32_t inputSize = sizeof(unsigned_scalar_t) * bufferSize; + + // Just need a single suballocation in this example + const uint32_t AllocationCount = 1; + + // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value + // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. + auto inputOffset = m_upStreamingBuffer->invalid_value; + + // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) + // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). + std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); + // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly + m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment); + + // Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example! + { + auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer()) + inputOffset); + for (auto j = 0; j < bufferSize; j++) + { + unsigned_scalar_t x = j > 0 ? 0.f : 2.f; + unsigned_scalar_t y = 0; + + /* + unsigned_scalar_t x = 1.f; + unsigned_scalar_t y = 0.f; + */ + + inputPtr[2 * j] = x; + inputPtr[2 * j + 1] = y; + } + // Always remember to flush! 
+ if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) + { + const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory(); + const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize); + m_device->flushMappedMemoryRanges(1, &range); + } + } + + // finally allocate our output range + const uint32_t outputSize = inputSize; + auto outputOffset = m_downStreamingBuffer->invalid_value; + m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment); + + smart_refctd_ptr cmdbuf; + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) { + return logFail("Failed to create Command Buffers!\n"); + } + cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger)); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->bindComputePipeline(m_pipeline.get()); + // This is the new fun part, pushing constants + const PushConstantData pc = { .deviceBufferAddress = m_deviceLocalBufferAddress }; + IGPUCommandBuffer::SBufferCopy copyInfo = {}; + copyInfo.srcOffset = 0; + copyInfo.dstOffset = 0; + copyInfo.size = m_deviceLocalBuffer->getSize(); + cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, ©Info); + cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + // Remember we do a single workgroup per 1D array in these parts + cmdbuf->dispatch(1, 1, 1); + + // Pipeline barrier: wait for FFT shader to be done before copying to downstream buffer + IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {}; + + decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {}; + pipelineBarrierInfo.bufBarriers = { &barrier, 1u }; + + barrier.range.buffer = 
m_deviceLocalBuffer; + + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; + + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo); + cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, ©Info); + cmdbuf->end(); + } + + semaphorValue++; + { + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = + { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = + { + .semaphore = m_timeline.get(), + .value = semaphorValue, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {}, + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signalInfo,1} + }; + + m_api->startCapture(); + queue->submit({ &submitInfo,1 }); + m_api->endCapture(); + } + + // We let all latches know what semaphore and counter value has to be passed for the functors to execute + const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue }; + + // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled + // You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation. + m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait); + + // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. + // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. + // Its nice because it will also remember to invalidate our memory mapping if its not coherent. 
+ auto latchedConsumer = make_smart_refctd_ptr( + IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize), + // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals + [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void + { + // The unused variable is used for letting the consumer know the subsection of the output we've managed to download + // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. + assert(dstOffset == 0 && size == outputSize); + + std::cout << "Begin array GPU\n"; + unsigned_scalar_t* const data = reinterpret_cast(const_cast(bufSrc)); + std::cout << std::bitset<32>(data[0]) << "\n"; + /* + for (auto i = 0u; i < bufferSize; i++) { + std::cout << std::bitset<32>(data[i]) << "\n"; + } + */ + std::cout << "\nEnd array GPU\n"; + }, + // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it + // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. + // It could also be latched in the upstreaming deallocate, because its the same fence. 
+ std::move(cmdbuf), m_downStreamingBuffer + ); + // We put a function we want to execute + m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get()); + + // ------------------------------------------- CPP ------------------------------------------------------------------------------------------------------ const auto masksArray = hlsl::morton::impl::decode_masks_array::Masks; for (auto i = 0u; i < 3; i++) { std::cout << std::bitset<32>(masksArray[i]) << std::endl; } + const auto someCode = hlsl::morton::code::create(hlsl::vector(1, 1, 1, 1)); + return true; } @@ -61,8 +267,35 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. bool keepRunning() override {return false;} + // Cleanup + bool onAppTerminated() override + { + // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` + // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) + while (m_downStreamingBuffer->cull_frees()) {} + return device_base_t::onAppTerminated(); + } + private: - smart_refctd_ptr m_api; + smart_refctd_ptr m_pipeline; + + smart_refctd_ptr m_utils; + + nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer; + StreamingTransientDataBufferMT<>* m_downStreamingBuffer; + smart_refctd_ptr m_deviceLocalBuffer; + + // These are Buffer Device Addresses + uint64_t m_upStreamingBufferAddress; + uint64_t m_downStreamingBufferAddress; + uint64_t m_deviceLocalBufferAddress; + + // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) 
+ uint32_t m_alignment; + + // This example really lets the advantages of a timeline semaphore shine through! + smart_refctd_ptr m_timeline; + uint64_t semaphorValue = 0; }; From 0aedfd929a505657ef761c84be15cfaf8d4ddb7b Mon Sep 17 00:00:00 2001 From: Fletterio Date: Fri, 28 Mar 2025 20:16:45 -0300 Subject: [PATCH 3/9] All tests passing, HLSL compiles fine! --- XX_Mortons/main.cpp | 235 +++++++++++++++++++++++++++++++++----------- 1 file changed, 177 insertions(+), 58 deletions(-) diff --git a/XX_Mortons/main.cpp b/XX_Mortons/main.cpp index 860b581d2..b20662904 100644 --- a/XX_Mortons/main.cpp +++ b/XX_Mortons/main.cpp @@ -10,6 +10,9 @@ #include "app_resources/common.hlsl" #include +// Right now the test only checks that HLSL compiles the file +constexpr bool TestHLSL = true; + using namespace nbl; using namespace core; using namespace system; @@ -22,6 +25,12 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, using device_base_t = application_templates::MonoDeviceApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + using morton_t = nbl::hlsl::morton::code; + using vector_t = nbl::hlsl::vector; + using unsigned_morton_t = nbl::hlsl::morton::code; + using unsigned_vector_t = nbl::hlsl::vector; + using bool_vector_t = nbl::hlsl::vector; + inline core::smart_refctd_ptr createShader( const char* includeMainName) { @@ -43,18 +52,173 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, if (!asset_base_t::onAppInitialized(std::move(system))) return false; + // ----------------------------------------------- CPP TESTS ---------------------------------------------------------------------- + + // Coordinate extraction and whole vector decode tests + { + morton_t morton(vector_t(-1011, 765, 248)); + unsigned_morton_t unsignedMorton(unsigned_vector_t(154, 789, 1011)); + + assert(morton.getCoordinate(0) == -1011 && morton.getCoordinate(1) == 765 && 
morton.getCoordinate(2) == 248); + assert(unsignedMorton.getCoordinate(0) == 154u && unsignedMorton.getCoordinate(1) == 789u && unsignedMorton.getCoordinate(2) == 1011u); + + assert(static_cast(morton) == vector_t(-1011, 765, 248) && static_cast(unsignedMorton) == unsigned_vector_t(154, 789, 1011)); + } + + // *********************************************************************************************************************************** + // ************************************************* Arithmetic operator tests ******************************************************* + // *********************************************************************************************************************************** + + // ---------------------------------------------------------------------------------------------------- + // --------------------------------------- ADDITION --------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // ---------------------------------------- Signed ----------------------------------------------------- + + // No overflow + assert(static_cast(morton_t(vector_t(-1011, 765, 248)) + morton_t(vector_t(1000, -985, 200))) == vector_t(-11, -220, 448)); + + // Type 1 overflow: Addition of representable coordinates goes out of range + assert(static_cast(morton_t(vector_t(-900, 70, 500)) + morton_t(vector_t(-578, -50, 20))) == vector_t(570, 20, -504)); + + // Type 2 overflow: Addition of irrepresentable range gives correct result + assert(static_cast(morton_t(vector_t(54, 900, -475)) + morton_t(vector_t(46, -1437, 699))) == vector_t(100, -537, 224)); + + // ---------------------------------------- Unsigned ----------------------------------------------------- + + // No overflow + assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(1563, 754, 220))) == unsigned_vector_t(1945, 1664, 763)); + + 
// Type 1 overflow: Addition of representable coordinates goes out of range + assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == unsigned_vector_t(334, 862, 519)); + + // Type 2 overflow: Addition of irrepresentable range gives correct result + assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(-143, -345, -233))) == unsigned_vector_t(239, 565, 310)); + + // ---------------------------------------------------------------------------------------------------- + // -------------------------------------- SUBTRACTION ------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // ---------------------------------------- Signed ----------------------------------------------------- + + // No overflow + assert(static_cast(morton_t(vector_t(1000, 764, -365)) - morton_t(vector_t(834, -243, 100))) == vector_t(166, 1007, -465)); + + // Type 1 overflow: Subtraction of representable coordinates goes out of range + assert(static_cast(morton_t(vector_t(-900, 70, 500)) - morton_t(vector_t(578, -50, -20))) == vector_t(570, 120, -504)); + + // Type 2 overflow: Subtraction of irrepresentable range gives correct result + assert(static_cast(morton_t(vector_t(54, 900, -475)) - morton_t(vector_t(-46, 1437, -699))) == vector_t(100, -537, 224)); + + // ---------------------------------------- Unsigned ----------------------------------------------------- + + // No overflow + assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(322, 564, 299))) == unsigned_vector_t(60, 346, 244)); + + // Type 1 overflow: Subtraction of representable coordinates goes out of range + assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == 
unsigned_vector_t(430, 958, 567)); + + // Type 2 overflow: Subtraction of irrepresentable range gives correct result + assert(static_cast(unsigned_morton_t(unsigned_vector_t(54, 900, 475)) - unsigned_morton_t(unsigned_vector_t(-865, -100, -10))) == unsigned_vector_t(919, 1000, 485)); + + + // ---------------------------------------------------------------------------------------------------- + // -------------------------------------- UNARY NEGATION ---------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // Only makes sense for signed + assert(static_cast(- morton_t(vector_t(-1024, 543, -475))) == vector_t(-1024, -543, 475)); + + // *********************************************************************************************************************************** + // ************************************************* Comparison operator tests ******************************************************* + // *********************************************************************************************************************************** + + // ---------------------------------------------------------------------------------------------------- + // -------------------------------------- OPERATOR< --------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // Signed + + // Same sign, negative + assert(morton_t(vector_t(-954, -455, -333)) < morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, false, false)); + // Same sign, positive + assert(morton_t(vector_t(954, 455, 333)) < morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, false, true)); + // Differing signs + assert(morton_t(vector_t(954, -32, 0)) < morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false)); + + // Unsigned + assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) < 
unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, false, false)); + + // ---------------------------------------------------------------------------------------------------- + // -------------------------------------- OPERATOR<= -------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // Signed + + // Same sign, negative + assert(morton_t(vector_t(-954, -455, -333)) <= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, true, false)); + // Same sign, positive + assert(morton_t(vector_t(954, 455, 333)) <= morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, true, true)); + // Differing signs + assert(morton_t(vector_t(954, -32, 0)) <= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false)); + + // Unsigned + assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) <= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, true, false)); + + // ---------------------------------------------------------------------------------------------------- + // -------------------------------------- OPERATOR> --------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // Signed + + // Same sign, negative + assert(morton_t(vector_t(-954, -455, -333)) > morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, false, true)); + // Same sign, positive + assert(morton_t(vector_t(954, 455, 333)) > morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, false, false)); + // Differing signs + assert(morton_t(vector_t(954, -32, 0)) > morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true)); + + // Unsigned + assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) > unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, false, true)); + + // 
---------------------------------------------------------------------------------------------------- + // -------------------------------------- OPERATOR>= -------------------------------------------------- + // ---------------------------------------------------------------------------------------------------- + + // Signed + + // Same sign, negative + assert(morton_t(vector_t(-954, -455, -333)) >= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, true, true)); + // Same sign, positive + assert(morton_t(vector_t(954, 455, 333)) >= morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, true, false)); + // Differing signs + assert(morton_t(vector_t(954, -32, 0)) >= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true)); + + // Unsigned + assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) >= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, true, true)); + + + if(!TestHLSL) + return true; + + + + + + + + + + // ----------------------------------------------- HLSL COMPILATION + OPTIONAL TESTS ---------------------------------------------- auto shader = createShader("app_resources/shader.hlsl"); // Create massive upload/download buffers constexpr uint32_t DownstreamBufferSize = sizeof(unsigned_scalar_t) << 23; - constexpr uint32_t UpstreamBufferSize = sizeof(unsigned_scalar_t) << 23; - m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize); + m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize); if (!m_utils) return logFail("Failed to create Utilities!"); - m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); - m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); // Create device-local buffer 
@@ -109,40 +273,9 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // Just need a single suballocation in this example const uint32_t AllocationCount = 1; - // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value - // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. - auto inputOffset = m_upStreamingBuffer->invalid_value; - // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); - // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly - m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment); - - // Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example! - { - auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer()) + inputOffset); - for (auto j = 0; j < bufferSize; j++) - { - unsigned_scalar_t x = j > 0 ? 0.f : 2.f; - unsigned_scalar_t y = 0; - - /* - unsigned_scalar_t x = 1.f; - unsigned_scalar_t y = 0.f; - */ - - inputPtr[2 * j] = x; - inputPtr[2 * j + 1] = y; - } - // Always remember to flush! 
- if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) - { - const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory(); - const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize); - m_device->flushMappedMemoryRanges(1, &range); - } - } // finally allocate our output range const uint32_t outputSize = inputSize; @@ -161,11 +294,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, cmdbuf->bindComputePipeline(m_pipeline.get()); // This is the new fun part, pushing constants const PushConstantData pc = { .deviceBufferAddress = m_deviceLocalBufferAddress }; - IGPUCommandBuffer::SBufferCopy copyInfo = {}; - copyInfo.srcOffset = 0; - copyInfo.dstOffset = 0; - copyInfo.size = m_deviceLocalBuffer->getSize(); - cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, ©Info); cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); // Remember we do a single workgroup per 1D array in these parts cmdbuf->dispatch(1, 1, 1); @@ -184,6 +312,11 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo); + + IGPUCommandBuffer::SBufferCopy copyInfo = {}; + copyInfo.srcOffset = 0; + copyInfo.dstOffset = 0; + copyInfo.size = m_deviceLocalBuffer->getSize(); cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, ©Info); cmdbuf->end(); } @@ -215,10 +348,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // We let all latches know what semaphore and counter value has to be passed for the functors to execute const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue }; - // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled - // You can also 
attach an additional optional IReferenceCounted derived object to hold onto until deallocation. - m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait); - // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. // Its nice because it will also remember to invalidate our memory mapping if its not coherent. @@ -249,15 +378,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // We put a function we want to execute m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get()); - // ------------------------------------------- CPP ------------------------------------------------------------------------------------------------------ - const auto masksArray = hlsl::morton::impl::decode_masks_array::Masks; - for (auto i = 0u; i < 3; i++) - { - std::cout << std::bitset<32>(masksArray[i]) << std::endl; - } - - const auto someCode = hlsl::morton::code::create(hlsl::vector(1, 1, 1, 1)); - return true; } @@ -272,7 +392,10 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, { // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) - while (m_downStreamingBuffer->cull_frees()) {} + if (TestHLSL) + { + while (m_downStreamingBuffer->cull_frees()) {} + } return device_base_t::onAppTerminated(); } @@ -281,19 +404,15 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, smart_refctd_ptr m_utils; - nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer; 
StreamingTransientDataBufferMT<>* m_downStreamingBuffer; smart_refctd_ptr m_deviceLocalBuffer; // These are Buffer Device Addresses - uint64_t m_upStreamingBufferAddress; uint64_t m_downStreamingBufferAddress; uint64_t m_deviceLocalBufferAddress; - // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) uint32_t m_alignment; - // This example really lets the advantages of a timeline semaphore shine through! smart_refctd_ptr m_timeline; uint64_t semaphorValue = 0; }; From ea42d5bf287cbff376809be65f64c71567e0134f Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 1 Apr 2025 15:44:55 -0300 Subject: [PATCH 4/9] Rename example --- {XX_Mortons => 12_Mortons}/CMakeLists.txt | 0 12_Mortons/app_resources/common.hlsl | 13 ++++++++++++ .../app_resources/shader.hlsl | 8 ++++--- .../config.json.template | 0 {XX_Mortons => 12_Mortons}/main.cpp | 21 ++++++++----------- {XX_Mortons => 12_Mortons}/pipeline.groovy | 0 CMakeLists.txt | 2 +- XX_Mortons/app_resources/common.hlsl | 10 --------- 8 files changed, 28 insertions(+), 26 deletions(-) rename {XX_Mortons => 12_Mortons}/CMakeLists.txt (100%) create mode 100644 12_Mortons/app_resources/common.hlsl rename {XX_Mortons => 12_Mortons}/app_resources/shader.hlsl (79%) rename {XX_Mortons => 12_Mortons}/config.json.template (100%) rename {XX_Mortons => 12_Mortons}/main.cpp (97%) rename {XX_Mortons => 12_Mortons}/pipeline.groovy (100%) delete mode 100644 XX_Mortons/app_resources/common.hlsl diff --git a/XX_Mortons/CMakeLists.txt b/12_Mortons/CMakeLists.txt similarity index 100% rename from XX_Mortons/CMakeLists.txt rename to 12_Mortons/CMakeLists.txt diff --git a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl new file mode 100644 index 000000000..bd5184f80 --- /dev/null +++ b/12_Mortons/app_resources/common.hlsl @@ -0,0 +1,13 @@ +//#include "nbl/builtin/hlsl/morton.hlsl" +#include 
"nbl/builtin/hlsl/cpp_compat.hlsl" + +NBL_CONSTEXPR uint32_t bufferSize = 256; + +// Proper coverage would require writing tests for ALL possible sign, dimensions and width configurations +//using morton_t2 = nbl::hlsl::morton::code; // Fits in an int16_t +using vector_t2 = nbl::hlsl::vector; + +struct PushConstantData +{ + uint64_t deviceBufferAddress; +}; \ No newline at end of file diff --git a/XX_Mortons/app_resources/shader.hlsl b/12_Mortons/app_resources/shader.hlsl similarity index 79% rename from XX_Mortons/app_resources/shader.hlsl rename to 12_Mortons/app_resources/shader.hlsl index d1f7c967e..e7f570eee 100644 --- a/XX_Mortons/app_resources/shader.hlsl +++ b/12_Mortons/app_resources/shader.hlsl @@ -3,14 +3,16 @@ [[vk::push_constant]] PushConstantData pushConstants; -using namespace nbl::hlsl; - [numthreads(bufferSize, 1, 1)] void main(uint32_t3 ID : SV_DispatchThreadID) { + /* LegacyBdaAccessor accessor = LegacyBdaAccessor::create(pushConstants.deviceBufferAddress); morton::code foo = morton::code::create(vector(-32768, -1)); - accessor.set(0, foo.value); + //accessor.set(0, foo.value); + */ + uint32_t bar = _static_cast(0xCAFEDEADDEADBEEF); + accessor.set(0, bar); } \ No newline at end of file diff --git a/XX_Mortons/config.json.template b/12_Mortons/config.json.template similarity index 100% rename from XX_Mortons/config.json.template rename to 12_Mortons/config.json.template diff --git a/XX_Mortons/main.cpp b/12_Mortons/main.cpp similarity index 97% rename from XX_Mortons/main.cpp rename to 12_Mortons/main.cpp index b20662904..d1fddba7a 100644 --- a/XX_Mortons/main.cpp +++ b/12_Mortons/main.cpp @@ -25,12 +25,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, using device_base_t = application_templates::MonoDeviceApplication; using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - using morton_t = nbl::hlsl::morton::code; - using vector_t = nbl::hlsl::vector; - using 
unsigned_morton_t = nbl::hlsl::morton::code; - using unsigned_vector_t = nbl::hlsl::vector; - using bool_vector_t = nbl::hlsl::vector; - inline core::smart_refctd_ptr createShader( const char* includeMainName) { @@ -52,6 +46,8 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, if (!asset_base_t::onAppInitialized(std::move(system))) return false; + /* + // ----------------------------------------------- CPP TESTS ---------------------------------------------------------------------- // Coordinate extraction and whole vector decode tests @@ -201,7 +197,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, if(!TestHLSL) return true; - + */ @@ -213,7 +209,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, auto shader = createShader("app_resources/shader.hlsl"); // Create massive upload/download buffers - constexpr uint32_t DownstreamBufferSize = sizeof(unsigned_scalar_t) << 23; + constexpr uint32_t DownstreamBufferSize = sizeof(uint32_t) << 23; m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize); if (!m_utils) @@ -230,7 +226,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, deviceLocalBufferParams.queueFamilyIndexCount = 1; deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex; - deviceLocalBufferParams.size = sizeof(unsigned_scalar_t) * bufferSize; + deviceLocalBufferParams.size = sizeof(uint32_t) * bufferSize; deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams)); @@ -268,7 +264,7 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, IQueue* const queue = getComputeQueue(); - const uint32_t 
inputSize = sizeof(unsigned_scalar_t) * bufferSize; + const uint32_t inputSize = sizeof(uint32_t) * bufferSize; // Just need a single suballocation in this example const uint32_t AllocationCount = 1; @@ -361,8 +357,9 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, assert(dstOffset == 0 && size == outputSize); std::cout << "Begin array GPU\n"; - unsigned_scalar_t* const data = reinterpret_cast(const_cast(bufSrc)); - std::cout << std::bitset<32>(data[0]) << "\n"; + uint32_t* const data = reinterpret_cast(const_cast(bufSrc)); + //std::cout << std::bitset<32>(data[0]) << "\n"; + std::cout << data[0] << "\n"; /* for (auto i = 0u; i < bufferSize; i++) { std::cout << std::bitset<32>(data[i]) << "\n"; diff --git a/XX_Mortons/pipeline.groovy b/12_Mortons/pipeline.groovy similarity index 100% rename from XX_Mortons/pipeline.groovy rename to 12_Mortons/pipeline.groovy diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fcddfc18..5d0c148cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(10_CountingSort EXCLUDE_FROM_ALL) # showcase use of FFT for post-FX Bloom effect add_subdirectory(11_FFT EXCLUDE_FROM_ALL) + add_subdirectory(12_Mortons EXCLUDE_FROM_ALL) # Waiting for a refactor @@ -96,7 +97,6 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(68_JpegLoading EXCLUDE_FROM_ALL) add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) - add_subdirectory(XX_Mortons EXCLUDE_FROM_ALL) NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif() diff --git a/XX_Mortons/app_resources/common.hlsl b/XX_Mortons/app_resources/common.hlsl deleted file mode 100644 index 3a9fca3fa..000000000 --- a/XX_Mortons/app_resources/common.hlsl +++ /dev/null @@ -1,10 +0,0 @@ -#include "nbl/builtin/hlsl/math/morton.hlsl" - -NBL_CONSTEXPR uint32_t bufferSize = 256; -using scalar_t = int32_t; -using unsigned_scalar_t = nbl::hlsl::make_unsigned_t; - -struct PushConstantData -{ - uint64_t deviceBufferAddress; -}; \ No newline at end 
of file From 2ba08a4a39bf15b3c689666012b263794b8371f2 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 1 Apr 2025 17:43:20 -0300 Subject: [PATCH 5/9] Add tests for AddCarry and SUbBorrow intrinsics --- 22_CppCompat/CIntrinsicsTester.h | 13 + 22_CppCompat/app_resources/common.hlsl | 859 +++++++++++++------------ 2 files changed, 451 insertions(+), 421 deletions(-) diff --git a/22_CppCompat/CIntrinsicsTester.h b/22_CppCompat/CIntrinsicsTester.h index 77aa2c1ca..5fe7bc08e 100644 --- a/22_CppCompat/CIntrinsicsTester.h +++ b/22_CppCompat/CIntrinsicsTester.h @@ -85,6 +85,10 @@ class CIntrinsicsTester final : public ITester testInput.smoothStepEdge0 = realDistributionNeg(mt); testInput.smoothStepEdge1 = realDistributionPos(mt); testInput.smoothStepX = realDistribution(mt); + testInput.addCarryA = std::numeric_limits::max() - uintDistribution(mt); + testInput.addCarryB = uintDistribution(mt); + testInput.subBorrowA = uintDistribution(mt); + testInput.subBorrowB = uintDistribution(mt); testInput.bitCountVec = int32_t3(intDistribution(mt), intDistribution(mt), intDistribution(mt)); testInput.clampValVec = float32_t3(realDistribution(mt), realDistribution(mt), realDistribution(mt)); @@ -119,6 +123,10 @@ class CIntrinsicsTester final : public ITester testInput.refractI = float32_t3(realDistribution(mt), realDistribution(mt), realDistribution(mt)); testInput.refractN = glm::normalize(float32_t3(realDistribution(mt), realDistribution(mt), realDistribution(mt))); testInput.refractEta = realDistribution(mt); + testInput.addCarryAVec = uint32_t3(std::numeric_limits::max() - uintDistribution(mt), std::numeric_limits::max() - uintDistribution(mt), std::numeric_limits::max() - uintDistribution(mt)); + testInput.addCarryBVec = uint32_t3(uintDistribution(mt), uintDistribution(mt), uintDistribution(mt)); + testInput.subBorrowAVec = uint32_t3(uintDistribution(mt), uintDistribution(mt), uintDistribution(mt)); + testInput.subBorrowBVec = uint32_t3(uintDistribution(mt), 
uintDistribution(mt), uintDistribution(mt)); // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values IntrinsicsTestValues expected; @@ -188,6 +196,11 @@ class CIntrinsicsTester final : public ITester auto inverseGlm = glm::inverse(reinterpret_cast(testInput.inverse)); expected.inverse = reinterpret_cast(inverseGlm); + expected.addCarry.result = glm::uaddCarry(testInput.addCarryA, testInput.addCarryB, expected.addCarry.carry); + expected.subBorrow.result = glm::usubBorrow(testInput.subBorrowA, testInput.subBorrowB, expected.subBorrow.borrow); + expected.addCarryVec.result = glm::uaddCarry(testInput.addCarryAVec, testInput.addCarryBVec, expected.addCarryVec.carry); + expected.subBorrowVec.result = glm::usubBorrow(testInput.subBorrowAVec, testInput.subBorrowBVec, expected.subBorrowVec.borrow); + performCpuTests(testInput, expected); performGpuTests(testInput, expected); } diff --git a/22_CppCompat/app_resources/common.hlsl b/22_CppCompat/app_resources/common.hlsl index e2303a2fc..dc3ff5fcd 100644 --- a/22_CppCompat/app_resources/common.hlsl +++ b/22_CppCompat/app_resources/common.hlsl @@ -1,74 +1,74 @@ -//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. -//// This file is part of the "Nabla Engine". -//// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_ -#define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_ - -// because DXC doesn't properly support `_Static_assert` -// TODO: add a message, and move to macros.h or cpp_compat -#define STATIC_ASSERT(...) 
{ nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; } - -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include - -#include - - -#include -#include -#include - -#include -#include - -// tgmath.hlsl and intrinsics.hlsl tests - -using namespace nbl::hlsl; -struct TgmathIntputTestValues -{ - float floor; - float isnan; - float isinf; - float powX; - float powY; - float exp; - float exp2; - float log; - float log2; - float absF; - int absI; - float sqrt; - float sin; - float cos; - float acos; - float modf; - float round; - float roundEven; - float trunc; - float ceil; - float fmaX; - float fmaY; - float fmaZ; - float ldexpArg; - int ldexpExp; - float modfStruct; - float frexpStruct; +//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_COMMON_INCLUDED_ + +// because DXC doesn't properly support `_Static_assert` +// TODO: add a message, and move to macros.h or cpp_compat +#define STATIC_ASSERT(...) 
{ nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; } + +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include + +#include + + +#include +#include +#include + +#include +#include + +// tgmath.hlsl and intrinsics.hlsl tests + +using namespace nbl::hlsl; +struct TgmathIntputTestValues +{ + float floor; + float isnan; + float isinf; + float powX; + float powY; + float exp; + float exp2; + float log; + float log2; + float absF; + int absI; + float sqrt; + float sin; + float cos; + float acos; + float modf; + float round; + float roundEven; + float trunc; + float ceil; + float fmaX; + float fmaY; + float fmaZ; + float ldexpArg; + int ldexpExp; + float modfStruct; + float frexpStruct; float tan; float asin; float atan; @@ -78,38 +78,38 @@ struct TgmathIntputTestValues float asinh; float acosh; float atanh; - float atan2X; - float atan2Y; - float erf; - float erfInv; - - float32_t3 floorVec; - float32_t3 isnanVec; - float32_t3 isinfVec; - float32_t3 powXVec; - float32_t3 powYVec; - float32_t3 expVec; - float32_t3 exp2Vec; - float32_t3 logVec; - float32_t3 log2Vec; - float32_t3 absFVec; - int32_t3 absIVec; - float32_t3 sqrtVec; - float32_t3 sinVec; - float32_t3 cosVec; - float32_t3 acosVec; - float32_t3 modfVec; - float32_t3 roundVec; - float32_t3 roundEvenVec; - float32_t3 truncVec; - float32_t3 ceilVec; - float32_t3 fmaXVec; - float32_t3 fmaYVec; - float32_t3 fmaZVec; - float32_t3 ldexpArgVec; - int32_t3 ldexpExpVec; - float32_t3 modfStructVec; - float32_t3 frexpStructVec; + float atan2X; + float atan2Y; + float erf; + float erfInv; + + float32_t3 floorVec; + float32_t3 isnanVec; + float32_t3 isinfVec; + float32_t3 powXVec; + float32_t3 powYVec; + float32_t3 expVec; + float32_t3 exp2Vec; + float32_t3 logVec; + float32_t3 log2Vec; + float32_t3 absFVec; + int32_t3 absIVec; + float32_t3 sqrtVec; + float32_t3 sinVec; + float32_t3 cosVec; + float32_t3 acosVec; + float32_t3 modfVec; + 
float32_t3 roundVec; + float32_t3 roundEvenVec; + float32_t3 truncVec; + float32_t3 ceilVec; + float32_t3 fmaXVec; + float32_t3 fmaYVec; + float32_t3 fmaZVec; + float32_t3 ldexpArgVec; + int32_t3 ldexpExpVec; + float32_t3 modfStructVec; + float32_t3 frexpStructVec; float32_t3 tanVec; float32_t3 asinVec; float32_t3 atanVec; @@ -119,35 +119,35 @@ struct TgmathIntputTestValues float32_t3 asinhVec; float32_t3 acoshVec; float32_t3 atanhVec; - float32_t3 atan2XVec; - float32_t3 atan2YVec; - float32_t3 erfVec; - float32_t3 erfInvVec; -}; - -struct TgmathTestValues -{ - float floor; - int isnan; - int isinf; - float pow; - float exp; - float exp2; - float log; - float log2; - float absF; - int absI; - float sqrt; - float sin; - float cos; - float acos; - float modf; - float round; - float roundEven; - float trunc; - float ceil; - float fma; - float ldexp; + float32_t3 atan2XVec; + float32_t3 atan2YVec; + float32_t3 erfVec; + float32_t3 erfInvVec; +}; + +struct TgmathTestValues +{ + float floor; + int isnan; + int isinf; + float pow; + float exp; + float exp2; + float log; + float log2; + float absF; + int absI; + float sqrt; + float sin; + float cos; + float acos; + float modf; + float round; + float roundEven; + float trunc; + float ceil; + float fma; + float ldexp; float tan; float asin; float atan; @@ -157,40 +157,40 @@ struct TgmathTestValues float asinh; float acosh; float atanh; - float atan2; - float erf; - float erfInv; - - float32_t3 floorVec; - - // we can't fix this because using namespace nbl::hlsl would cause ambiguous math functions below - // and we can't add a nbl::hlsl alias for the builtin hLSL vector type because of https://github.com/microsoft/DirectXShaderCompiler/issues/7035 -#ifndef __HLSL_VERSION - nbl::hlsl::vector isnanVec; - nbl::hlsl::vector isinfVec; -#else - vector isnanVec; - vector isinfVec; -#endif - - float32_t3 powVec; - float32_t3 expVec; - float32_t3 exp2Vec; - float32_t3 logVec; - float32_t3 log2Vec; - float32_t3 absFVec; - int32_t3 
absIVec; - float32_t3 sqrtVec; - float32_t3 cosVec; - float32_t3 sinVec; - float32_t3 acosVec; - float32_t3 modfVec; - float32_t3 roundVec; - float32_t3 roundEvenVec; - float32_t3 truncVec; - float32_t3 ceilVec; - float32_t3 fmaVec; - float32_t3 ldexpVec; + float atan2; + float erf; + float erfInv; + + float32_t3 floorVec; + + // we can't fix this because using namespace nbl::hlsl would cause ambiguous math functions below + // and we can't add a nbl::hlsl alias for the builtin hLSL vector type because of https://github.com/microsoft/DirectXShaderCompiler/issues/7035 +#ifndef __HLSL_VERSION + nbl::hlsl::vector isnanVec; + nbl::hlsl::vector isinfVec; +#else + vector isnanVec; + vector isinfVec; +#endif + + float32_t3 powVec; + float32_t3 expVec; + float32_t3 exp2Vec; + float32_t3 logVec; + float32_t3 log2Vec; + float32_t3 absFVec; + int32_t3 absIVec; + float32_t3 sqrtVec; + float32_t3 cosVec; + float32_t3 sinVec; + float32_t3 acosVec; + float32_t3 modfVec; + float32_t3 roundVec; + float32_t3 roundEvenVec; + float32_t3 truncVec; + float32_t3 ceilVec; + float32_t3 fmaVec; + float32_t3 ldexpVec; float32_t3 tanVec; float32_t3 asinVec; float32_t3 atanVec; @@ -200,258 +200,275 @@ struct TgmathTestValues float32_t3 asinhVec; float32_t3 acoshVec; float32_t3 atanhVec; - float32_t3 atan2Vec; - float32_t3 erfVec; - float32_t3 erfInvVec; - - ModfOutput modfStruct; - ModfOutput modfStructVec; - FrexpOutput frexpStruct; - FrexpOutput frexpStructVec; - - void fillTestValues(NBL_CONST_REF_ARG(TgmathIntputTestValues) input) - { - floor = nbl::hlsl::floor(input.floor); - isnan = nbl::hlsl::isnan(input.isnan); - isinf = nbl::hlsl::isinf(input.isinf); - pow = nbl::hlsl::pow(input.powX, input.powY); - exp = nbl::hlsl::exp(input.exp); - exp2 = nbl::hlsl::exp2(input.exp2); - log = nbl::hlsl::log(input.log); - log2 = nbl::hlsl::log2(input.log2); - absF = nbl::hlsl::abs(input.absF); - absI = nbl::hlsl::abs(input.absI); - sqrt = nbl::hlsl::sqrt(input.sqrt); - sin = nbl::hlsl::sin(input.sin); 
- cos = nbl::hlsl::cos(input.cos); - tan = nbl::hlsl::tan(input.tan); - asin = nbl::hlsl::asin(input.asin); - atan = nbl::hlsl::atan(input.atan); - sinh = nbl::hlsl::sinh(input.sinh); - cosh = nbl::hlsl::cosh(input.cosh); - tanh = nbl::hlsl::tanh(input.tanh); - asinh = nbl::hlsl::asinh(input.asinh); - acosh = nbl::hlsl::acosh(input.acosh); - atanh = nbl::hlsl::atanh(input.atanh); - atan2 = nbl::hlsl::atan2(input.atan2Y, input.atan2X); - erf = nbl::hlsl::erf(input.erf); - erfInv = nbl::hlsl::erfInv(input.erfInv); - acos = nbl::hlsl::acos(input.acos); - modf = nbl::hlsl::modf(input.modf); - round = nbl::hlsl::round(input.round); - roundEven = nbl::hlsl::roundEven(input.roundEven); - trunc = nbl::hlsl::trunc(input.trunc); - ceil = nbl::hlsl::ceil(input.ceil); - fma = nbl::hlsl::fma(input.fmaX, input.fmaY, input.fmaZ); - ldexp = nbl::hlsl::ldexp(input.ldexpArg, input.ldexpExp); - - floorVec = nbl::hlsl::floor(input.floorVec); - isnanVec = nbl::hlsl::isnan(input.isnanVec); - isinfVec = nbl::hlsl::isinf(input.isinfVec); - powVec = nbl::hlsl::pow(input.powXVec, input.powYVec); - expVec = nbl::hlsl::exp(input.expVec); - exp2Vec = nbl::hlsl::exp2(input.exp2Vec); - logVec = nbl::hlsl::log(input.logVec); - log2Vec = nbl::hlsl::log2(input.log2Vec); - absFVec = nbl::hlsl::abs(input.absFVec); - absIVec = nbl::hlsl::abs(input.absIVec); - sqrtVec = nbl::hlsl::sqrt(input.sqrtVec); - sinVec = nbl::hlsl::sin(input.sinVec); - cosVec = nbl::hlsl::cos(input.cosVec); - tanVec = nbl::hlsl::tan(input.tanVec); - asinVec = nbl::hlsl::asin(input.asinVec); - atanVec = nbl::hlsl::atan(input.atanVec); - sinhVec = nbl::hlsl::sinh(input.sinhVec); - coshVec = nbl::hlsl::cosh(input.coshVec); - tanhVec = nbl::hlsl::tanh(input.tanhVec); - asinhVec = nbl::hlsl::asinh(input.asinhVec); - acoshVec = nbl::hlsl::acosh(input.acoshVec); - atanhVec = nbl::hlsl::atanh(input.atanhVec); - atan2Vec = nbl::hlsl::atan2(input.atan2YVec, input.atan2XVec); - acosVec = nbl::hlsl::acos(input.acosVec); - modfVec = 
nbl::hlsl::modf(input.modfVec); - roundVec = nbl::hlsl::round(input.roundVec); - roundEvenVec = nbl::hlsl::roundEven(input.roundEvenVec); - truncVec = nbl::hlsl::trunc(input.truncVec); - ceilVec = nbl::hlsl::ceil(input.ceilVec); - fmaVec = nbl::hlsl::fma(input.fmaXVec, input.fmaYVec, input.fmaZVec); - ldexpVec = nbl::hlsl::ldexp(input.ldexpArgVec, input.ldexpExpVec); - erfVec = nbl::hlsl::erf(input.erfVec); - erfInvVec = nbl::hlsl::erfInv(input.erfInvVec); - - modfStruct = nbl::hlsl::modfStruct(input.modfStruct); - modfStructVec = nbl::hlsl::modfStruct(input.modfStructVec); - frexpStruct = nbl::hlsl::frexpStruct(input.frexpStruct); - frexpStructVec = nbl::hlsl::frexpStruct(input.frexpStructVec); - } -}; - -struct IntrinsicsIntputTestValues -{ - int bitCount; - float32_t3 crossLhs; - float32_t3 crossRhs; - float clampVal; - float clampMin; - float clampMax; - float32_t3 length; - float32_t3 normalize; - float32_t3 dotLhs; - float32_t3 dotRhs; - float32_t3x3 determinant; - uint32_t findMSB; - uint32_t findLSB; - float32_t3x3 inverse; - float32_t3x3 transpose; - float32_t3x3 mulLhs; - float32_t3x3 mulRhs; - float minA; - float minB; - float maxA; - float maxB; - float rsqrt; - uint32_t bitReverse; - float frac; - float mixX; - float mixY; - float mixA; - float sign; - float radians; - float degrees; - float stepEdge; - float stepX; - float smoothStepEdge0; - float smoothStepEdge1; - float smoothStepX; - - int32_t3 bitCountVec; - float32_t3 clampValVec; - float32_t3 clampMinVec; - float32_t3 clampMaxVec; - uint32_t3 findMSBVec; - uint32_t3 findLSBVec; - float32_t3 minAVec; - float32_t3 minBVec; - float32_t3 maxAVec; - float32_t3 maxBVec; - float32_t3 rsqrtVec; - uint32_t3 bitReverseVec; - float32_t3 fracVec; - float32_t3 mixXVec; - float32_t3 mixYVec; - float32_t3 mixAVec; - float32_t3 signVec; - float32_t3 radiansVec; - float32_t3 degreesVec; - float32_t3 stepEdgeVec; - float32_t3 stepXVec; - float32_t3 smoothStepEdge0Vec; - float32_t3 smoothStepEdge1Vec; - float32_t3 
smoothStepXVec; - float32_t3 faceForwardN; - float32_t3 faceForwardI; - float32_t3 faceForwardNref; - float32_t3 reflectI; - float32_t3 reflectN; - float32_t3 refractI; - float32_t3 refractN; - float refractEta; -}; - -struct IntrinsicsTestValues -{ - int bitCount; - float clamp; - float length; - float dot; - float determinant; - int findMSB; - int findLSB; - float min; - float max; - float rsqrt; - float frac; - uint32_t bitReverse; - float mix; - float sign; - float radians; - float degrees; - float step; - float smoothStep; - - float32_t3 normalize; - float32_t3 cross; - int32_t3 bitCountVec; - float32_t3 clampVec; - uint32_t3 findMSBVec; - uint32_t3 findLSBVec; - float32_t3 minVec; - float32_t3 maxVec; - float32_t3 rsqrtVec; - uint32_t3 bitReverseVec; - float32_t3 fracVec; - float32_t3 mixVec; - float32_t3 signVec; - float32_t3 radiansVec; - float32_t3 degreesVec; - float32_t3 stepVec; - float32_t3 smoothStepVec; - float32_t3 faceForward; - float32_t3 reflect; - float32_t3 refract; - - float32_t3x3 mul; - float32_t3x3 transpose; - float32_t3x3 inverse; - - void fillTestValues(NBL_CONST_REF_ARG(IntrinsicsIntputTestValues) input) - { - bitCount = nbl::hlsl::bitCount(input.bitCount); - cross = nbl::hlsl::cross(input.crossLhs, input.crossRhs); - clamp = nbl::hlsl::clamp(input.clampVal, input.clampMin, input.clampMax); - length = nbl::hlsl::length(input.length); - normalize = nbl::hlsl::normalize(input.normalize); - dot = nbl::hlsl::dot(input.dotLhs, input.dotRhs); - determinant = nbl::hlsl::determinant(input.determinant); - findMSB = nbl::hlsl::findMSB(input.findMSB); - findLSB = nbl::hlsl::findLSB(input.findLSB); - inverse = nbl::hlsl::inverse(input.inverse); - transpose = nbl::hlsl::transpose(input.transpose); - mul = nbl::hlsl::mul(input.mulLhs, input.mulRhs); - // TODO: fix min and max - min = nbl::hlsl::min(input.minA, input.minB); - max = nbl::hlsl::max(input.maxA, input.maxB); - rsqrt = nbl::hlsl::rsqrt(input.rsqrt); - bitReverse = 
nbl::hlsl::bitReverse(input.bitReverse); - frac = nbl::hlsl::fract(input.frac); - mix = nbl::hlsl::mix(input.mixX, input.mixY, input.mixA); - sign = nbl::hlsl::sign(input.sign); - radians = nbl::hlsl::radians(input.radians); - degrees = nbl::hlsl::degrees(input.degrees); - step = nbl::hlsl::step(input.stepEdge, input.stepX); - smoothStep = nbl::hlsl::smoothStep(input.smoothStepEdge0, input.smoothStepEdge1, input.smoothStepX); - - bitCountVec = nbl::hlsl::bitCount(input.bitCountVec); - clampVec = nbl::hlsl::clamp(input.clampValVec, input.clampMinVec, input.clampMaxVec); - findMSBVec = nbl::hlsl::findMSB(input.findMSBVec); - findLSBVec = nbl::hlsl::findLSB(input.findLSBVec); - // TODO: fix min and max - minVec = nbl::hlsl::min(input.minAVec, input.minBVec); - maxVec = nbl::hlsl::max(input.maxAVec, input.maxBVec); - rsqrtVec = nbl::hlsl::rsqrt(input.rsqrtVec); - bitReverseVec = nbl::hlsl::bitReverse(input.bitReverseVec); - fracVec = nbl::hlsl::fract(input.fracVec); - mixVec = nbl::hlsl::mix(input.mixXVec, input.mixYVec, input.mixAVec); - - signVec = nbl::hlsl::sign(input.signVec); - radiansVec = nbl::hlsl::radians(input.radiansVec); - degreesVec = nbl::hlsl::degrees(input.degreesVec); - stepVec = nbl::hlsl::step(input.stepEdgeVec, input.stepXVec); - smoothStepVec = nbl::hlsl::smoothStep(input.smoothStepEdge0Vec, input.smoothStepEdge1Vec, input.smoothStepXVec); - faceForward = nbl::hlsl::faceForward(input.faceForwardN, input.faceForwardI, input.faceForwardNref); - reflect = nbl::hlsl::reflect(input.reflectI, input.reflectN); - refract = nbl::hlsl::refract(input.refractI, input.refractN, input.refractEta); - } -}; - -#endif + float32_t3 atan2Vec; + float32_t3 erfVec; + float32_t3 erfInvVec; + + ModfOutput modfStruct; + ModfOutput modfStructVec; + FrexpOutput frexpStruct; + FrexpOutput frexpStructVec; + + void fillTestValues(NBL_CONST_REF_ARG(TgmathIntputTestValues) input) + { + floor = nbl::hlsl::floor(input.floor); + isnan = nbl::hlsl::isnan(input.isnan); + isinf = 
nbl::hlsl::isinf(input.isinf); + pow = nbl::hlsl::pow(input.powX, input.powY); + exp = nbl::hlsl::exp(input.exp); + exp2 = nbl::hlsl::exp2(input.exp2); + log = nbl::hlsl::log(input.log); + log2 = nbl::hlsl::log2(input.log2); + absF = nbl::hlsl::abs(input.absF); + absI = nbl::hlsl::abs(input.absI); + sqrt = nbl::hlsl::sqrt(input.sqrt); + sin = nbl::hlsl::sin(input.sin); + cos = nbl::hlsl::cos(input.cos); + tan = nbl::hlsl::tan(input.tan); + asin = nbl::hlsl::asin(input.asin); + atan = nbl::hlsl::atan(input.atan); + sinh = nbl::hlsl::sinh(input.sinh); + cosh = nbl::hlsl::cosh(input.cosh); + tanh = nbl::hlsl::tanh(input.tanh); + asinh = nbl::hlsl::asinh(input.asinh); + acosh = nbl::hlsl::acosh(input.acosh); + atanh = nbl::hlsl::atanh(input.atanh); + atan2 = nbl::hlsl::atan2(input.atan2Y, input.atan2X); + erf = nbl::hlsl::erf(input.erf); + erfInv = nbl::hlsl::erfInv(input.erfInv); + acos = nbl::hlsl::acos(input.acos); + modf = nbl::hlsl::modf(input.modf); + round = nbl::hlsl::round(input.round); + roundEven = nbl::hlsl::roundEven(input.roundEven); + trunc = nbl::hlsl::trunc(input.trunc); + ceil = nbl::hlsl::ceil(input.ceil); + fma = nbl::hlsl::fma(input.fmaX, input.fmaY, input.fmaZ); + ldexp = nbl::hlsl::ldexp(input.ldexpArg, input.ldexpExp); + + floorVec = nbl::hlsl::floor(input.floorVec); + isnanVec = nbl::hlsl::isnan(input.isnanVec); + isinfVec = nbl::hlsl::isinf(input.isinfVec); + powVec = nbl::hlsl::pow(input.powXVec, input.powYVec); + expVec = nbl::hlsl::exp(input.expVec); + exp2Vec = nbl::hlsl::exp2(input.exp2Vec); + logVec = nbl::hlsl::log(input.logVec); + log2Vec = nbl::hlsl::log2(input.log2Vec); + absFVec = nbl::hlsl::abs(input.absFVec); + absIVec = nbl::hlsl::abs(input.absIVec); + sqrtVec = nbl::hlsl::sqrt(input.sqrtVec); + sinVec = nbl::hlsl::sin(input.sinVec); + cosVec = nbl::hlsl::cos(input.cosVec); + tanVec = nbl::hlsl::tan(input.tanVec); + asinVec = nbl::hlsl::asin(input.asinVec); + atanVec = nbl::hlsl::atan(input.atanVec); + sinhVec = 
nbl::hlsl::sinh(input.sinhVec); + coshVec = nbl::hlsl::cosh(input.coshVec); + tanhVec = nbl::hlsl::tanh(input.tanhVec); + asinhVec = nbl::hlsl::asinh(input.asinhVec); + acoshVec = nbl::hlsl::acosh(input.acoshVec); + atanhVec = nbl::hlsl::atanh(input.atanhVec); + atan2Vec = nbl::hlsl::atan2(input.atan2YVec, input.atan2XVec); + acosVec = nbl::hlsl::acos(input.acosVec); + modfVec = nbl::hlsl::modf(input.modfVec); + roundVec = nbl::hlsl::round(input.roundVec); + roundEvenVec = nbl::hlsl::roundEven(input.roundEvenVec); + truncVec = nbl::hlsl::trunc(input.truncVec); + ceilVec = nbl::hlsl::ceil(input.ceilVec); + fmaVec = nbl::hlsl::fma(input.fmaXVec, input.fmaYVec, input.fmaZVec); + ldexpVec = nbl::hlsl::ldexp(input.ldexpArgVec, input.ldexpExpVec); + erfVec = nbl::hlsl::erf(input.erfVec); + erfInvVec = nbl::hlsl::erfInv(input.erfInvVec); + + modfStruct = nbl::hlsl::modfStruct(input.modfStruct); + modfStructVec = nbl::hlsl::modfStruct(input.modfStructVec); + frexpStruct = nbl::hlsl::frexpStruct(input.frexpStruct); + frexpStructVec = nbl::hlsl::frexpStruct(input.frexpStructVec); + } +}; + +struct IntrinsicsIntputTestValues +{ + int bitCount; + float32_t3 crossLhs; + float32_t3 crossRhs; + float clampVal; + float clampMin; + float clampMax; + float32_t3 length; + float32_t3 normalize; + float32_t3 dotLhs; + float32_t3 dotRhs; + float32_t3x3 determinant; + uint32_t findMSB; + uint32_t findLSB; + float32_t3x3 inverse; + float32_t3x3 transpose; + float32_t3x3 mulLhs; + float32_t3x3 mulRhs; + float minA; + float minB; + float maxA; + float maxB; + float rsqrt; + uint32_t bitReverse; + float frac; + float mixX; + float mixY; + float mixA; + float sign; + float radians; + float degrees; + float stepEdge; + float stepX; + float smoothStepEdge0; + float smoothStepEdge1; + float smoothStepX; + uint32_t addCarryA; + uint32_t addCarryB; + uint32_t subBorrowA; + uint32_t subBorrowB; + + int32_t3 bitCountVec; + float32_t3 clampValVec; + float32_t3 clampMinVec; + float32_t3 clampMaxVec; + 
uint32_t3 findMSBVec; + uint32_t3 findLSBVec; + float32_t3 minAVec; + float32_t3 minBVec; + float32_t3 maxAVec; + float32_t3 maxBVec; + float32_t3 rsqrtVec; + uint32_t3 bitReverseVec; + float32_t3 fracVec; + float32_t3 mixXVec; + float32_t3 mixYVec; + float32_t3 mixAVec; + float32_t3 signVec; + float32_t3 radiansVec; + float32_t3 degreesVec; + float32_t3 stepEdgeVec; + float32_t3 stepXVec; + float32_t3 smoothStepEdge0Vec; + float32_t3 smoothStepEdge1Vec; + float32_t3 smoothStepXVec; + float32_t3 faceForwardN; + float32_t3 faceForwardI; + float32_t3 faceForwardNref; + float32_t3 reflectI; + float32_t3 reflectN; + float32_t3 refractI; + float32_t3 refractN; + float refractEta; + uint32_t3 addCarryAVec; + uint32_t3 addCarryBVec; + uint32_t3 subBorrowAVec; + uint32_t3 subBorrowBVec; +}; + +struct IntrinsicsTestValues +{ + int bitCount; + float clamp; + float length; + float dot; + float determinant; + int findMSB; + int findLSB; + float min; + float max; + float rsqrt; + float frac; + uint32_t bitReverse; + float mix; + float sign; + float radians; + float degrees; + float step; + float smoothStep; + + float32_t3 normalize; + float32_t3 cross; + int32_t3 bitCountVec; + float32_t3 clampVec; + uint32_t3 findMSBVec; + uint32_t3 findLSBVec; + float32_t3 minVec; + float32_t3 maxVec; + float32_t3 rsqrtVec; + uint32_t3 bitReverseVec; + float32_t3 fracVec; + float32_t3 mixVec; + float32_t3 signVec; + float32_t3 radiansVec; + float32_t3 degreesVec; + float32_t3 stepVec; + float32_t3 smoothStepVec; + float32_t3 faceForward; + float32_t3 reflect; + float32_t3 refract; + + float32_t3x3 mul; + float32_t3x3 transpose; + float32_t3x3 inverse; + + spirv::AddCarryOutput addCarry; + spirv::SubBorrowOutput subBorrow; + spirv::AddCarryOutput addCarryVec; + spirv::SubBorrowOutput subBorrowVec; + + void fillTestValues(NBL_CONST_REF_ARG(IntrinsicsIntputTestValues) input) + { + bitCount = nbl::hlsl::bitCount(input.bitCount); + cross = nbl::hlsl::cross(input.crossLhs, input.crossRhs); + clamp 
= nbl::hlsl::clamp(input.clampVal, input.clampMin, input.clampMax); + length = nbl::hlsl::length(input.length); + normalize = nbl::hlsl::normalize(input.normalize); + dot = nbl::hlsl::dot(input.dotLhs, input.dotRhs); + determinant = nbl::hlsl::determinant(input.determinant); + findMSB = nbl::hlsl::findMSB(input.findMSB); + findLSB = nbl::hlsl::findLSB(input.findLSB); + inverse = nbl::hlsl::inverse(input.inverse); + transpose = nbl::hlsl::transpose(input.transpose); + mul = nbl::hlsl::mul(input.mulLhs, input.mulRhs); + // TODO: fix min and max + min = nbl::hlsl::min(input.minA, input.minB); + max = nbl::hlsl::max(input.maxA, input.maxB); + rsqrt = nbl::hlsl::rsqrt(input.rsqrt); + bitReverse = nbl::hlsl::bitReverse(input.bitReverse); + frac = nbl::hlsl::fract(input.frac); + mix = nbl::hlsl::mix(input.mixX, input.mixY, input.mixA); + sign = nbl::hlsl::sign(input.sign); + radians = nbl::hlsl::radians(input.radians); + degrees = nbl::hlsl::degrees(input.degrees); + step = nbl::hlsl::step(input.stepEdge, input.stepX); + smoothStep = nbl::hlsl::smoothStep(input.smoothStepEdge0, input.smoothStepEdge1, input.smoothStepX); + + bitCountVec = nbl::hlsl::bitCount(input.bitCountVec); + clampVec = nbl::hlsl::clamp(input.clampValVec, input.clampMinVec, input.clampMaxVec); + findMSBVec = nbl::hlsl::findMSB(input.findMSBVec); + findLSBVec = nbl::hlsl::findLSB(input.findLSBVec); + // TODO: fix min and max + minVec = nbl::hlsl::min(input.minAVec, input.minBVec); + maxVec = nbl::hlsl::max(input.maxAVec, input.maxBVec); + rsqrtVec = nbl::hlsl::rsqrt(input.rsqrtVec); + bitReverseVec = nbl::hlsl::bitReverse(input.bitReverseVec); + fracVec = nbl::hlsl::fract(input.fracVec); + mixVec = nbl::hlsl::mix(input.mixXVec, input.mixYVec, input.mixAVec); + + signVec = nbl::hlsl::sign(input.signVec); + radiansVec = nbl::hlsl::radians(input.radiansVec); + degreesVec = nbl::hlsl::degrees(input.degreesVec); + stepVec = nbl::hlsl::step(input.stepEdgeVec, input.stepXVec); + smoothStepVec = 
nbl::hlsl::smoothStep(input.smoothStepEdge0Vec, input.smoothStepEdge1Vec, input.smoothStepXVec); + faceForward = nbl::hlsl::faceForward(input.faceForwardN, input.faceForwardI, input.faceForwardNref); + reflect = nbl::hlsl::reflect(input.reflectI, input.reflectN); + refract = nbl::hlsl::refract(input.refractI, input.refractN, input.refractEta); + addCarry = nbl::hlsl::addCarry(input.addCarryA, input.addCarryB); + subBorrow = nbl::hlsl::subBorrow(input.subBorrowA, input.subBorrowB); + addCarryVec = nbl::hlsl::addCarry(input.addCarryAVec, input.addCarryBVec); + subBorrowVec = nbl::hlsl::subBorrow(input.subBorrowAVec, input.subBorrowBVec); + } +}; + +#endif From f00bbf6fa914ec230df8a000deee75aee69cdce9 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Mon, 7 Apr 2025 19:48:46 -0300 Subject: [PATCH 6/9] Disable intrinsic tests for uSUbBorrow for the time being, start copying 22_CppCOmpat to run tests --- 12_Mortons/Tester.h | 417 +++++++++++++++++++++++++++ 12_Mortons/app_resources/common.hlsl | 38 ++- 12_Mortons/app_resources/shader.hlsl | 18 -- 12_Mortons/main.cpp | 198 +------------ 22_CppCompat/CIntrinsicsTester.h | 22 +- 5 files changed, 474 insertions(+), 219 deletions(-) create mode 100644 12_Mortons/Tester.h delete mode 100644 12_Mortons/app_resources/shader.hlsl diff --git a/12_Mortons/Tester.h b/12_Mortons/Tester.h new file mode 100644 index 000000000..5c4773111 --- /dev/null +++ b/12_Mortons/Tester.h @@ -0,0 +1,417 @@ +#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_ + +#include +#include "app_resources/common.hlsl" +#include "nbl/application_templates/MonoDeviceApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +using namespace nbl; + +class Tester +{ +public: + virtual ~Tester() + { + m_outputBufferAllocation.memory->unmap(); + }; + + struct PipelineSetupData + { + std::string testShaderPath; + + core::smart_refctd_ptr device; + 
core::smart_refctd_ptr api; + core::smart_refctd_ptr assetMgr; + core::smart_refctd_ptr logger; + video::IPhysicalDevice* physicalDevice; + uint32_t computeFamilyIndex; + }; + + template + void setupPipeline(const PipelineSetupData& pipleineSetupData) + { + // setting up pipeline in the constructor + m_device = core::smart_refctd_ptr(pipleineSetupData.device); + m_physicalDevice = pipleineSetupData.physicalDevice; + m_api = core::smart_refctd_ptr(pipleineSetupData.api); + m_assetMgr = core::smart_refctd_ptr(pipleineSetupData.assetMgr); + m_logger = core::smart_refctd_ptr(pipleineSetupData.logger); + m_queueFamily = pipleineSetupData.computeFamilyIndex; + m_semaphoreCounter = 0; + m_semaphore = m_device->createSemaphore(0); + m_cmdpool = m_device->createCommandPool(m_queueFamily, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) + logFail("Failed to create Command Buffers!\n"); + + // Load shaders, set up pipeline + core::smart_refctd_ptr shader; + { + asset::IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + auto assetBundle = m_assetMgr->getAsset(pipleineSetupData.testShaderPath, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + logFail("Could not load shader!"); + assert(0); + } + + // It would be super weird if loading a shader from a file produced more than 1 asset + assert(assets.size() == 1); + core::smart_refctd_ptr source = asset::IAsset::castDown(assets[0]); + + auto* compilerSet = m_assetMgr->getCompilerSet(); + + asset::IShaderCompiler::SCompilerOptions options = {}; + options.stage = source->getStage(); + options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; + options.debugInfoFlags |= asset::IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; + 
options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + options.preprocessorOptions.includeFinder = compilerSet->getShaderCompiler(source->getContentType())->getDefaultIncludeFinder(); + + auto spirv = compilerSet->compileToSPIRV(source.get(), options); + + video::ILogicalDevice::SShaderCreationParameters params{}; + params.cpushader = spirv.get(); + shader = m_device->createShader(params); + } + + if (!shader) + logFail("Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n"); + + video::IGPUDescriptorSetLayout::SBinding bindings[2] = { + { + .binding = 0, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_COMPUTE, + .count = 1 + }, + { + .binding = 1, + .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_COMPUTE, + .count = 1 + } + }; + + core::smart_refctd_ptr dsLayout = m_device->createDescriptorSetLayout(bindings); + if (!dsLayout) + logFail("Failed to create a Descriptor Layout!\n"); + + m_pplnLayout = m_device->createPipelineLayout({}, core::smart_refctd_ptr(dsLayout)); + if (!m_pplnLayout) + logFail("Failed to create a Pipeline Layout!\n"); + + { + video::IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pplnLayout.get(); + params.shader.entryPoint = "main"; + params.shader.shader = shader.get(); + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) + logFail("Failed to create pipelines (compile & link shaders)!\n"); + } + + // Allocate memory of the input buffer + { + constexpr size_t BufferSize = sizeof(InputStruct); + + video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + 
core::smart_refctd_ptr inputBuff = m_device->createBuffer(std::move(params)); + if (!inputBuff) + logFail("Failed to create a GPU Buffer of size %d!\n", params.size); + + inputBuff->setObjectDebugName("emulated_float64_t output buffer"); + + video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuff->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + + m_inputBufferAllocation = m_device->allocate(reqs, inputBuff.get(), video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_inputBufferAllocation.isValid()) + logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(inputBuff->getBoundMemory().memory == m_inputBufferAllocation.memory.get()); + core::smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); + + m_ds = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); + { + video::IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = core::smart_refctd_ptr(inputBuff); + info[0].info.buffer = { .offset = 0,.size = BufferSize }; + video::IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} + }; + m_device->updateDescriptorSets(writes, {}); + } + } + + // Allocate memory of the output buffer + { + constexpr size_t BufferSize = sizeof(OutputStruct); + + video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + core::smart_refctd_ptr outputBuff = m_device->createBuffer(std::move(params)); + if (!outputBuff) + logFail("Failed to create a GPU Buffer of size %d!\n", params.size); + + outputBuff->setObjectDebugName("emulated_float64_t output buffer"); + + video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + + m_outputBufferAllocation = 
m_device->allocate(reqs, outputBuff.get(), video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_outputBufferAllocation.isValid()) + logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(outputBuff->getBoundMemory().memory == m_outputBufferAllocation.memory.get()); + core::smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); + + { + video::IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = core::smart_refctd_ptr(outputBuff); + info[0].info.buffer = { .offset = 0,.size = BufferSize }; + video::IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(),.binding = 1,.arrayElement = 0,.count = 1,.info = info} + }; + m_device->updateDescriptorSets(writes, {}); + } + } + + if (!m_outputBufferAllocation.memory->map({ 0ull,m_outputBufferAllocation.memory->getAllocationSize() }, video::IDeviceMemoryAllocation::EMCAF_READ)) + logFail("Failed to map the Device Memory!\n"); + + // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches + const video::ILogicalDevice::MappedMemoryRange memoryRange(m_outputBufferAllocation.memory.get(), 0ull, m_outputBufferAllocation.memory->getAllocationSize()); + if (!m_outputBufferAllocation.memory->getMemoryPropertyFlags().hasFlags(video::IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memoryRange); + + assert(memoryRange.valid() && memoryRange.length >= sizeof(OutputStruct)); + + m_queue = m_device->getQueue(m_queueFamily, 0); + } + + enum class TestType + { + CPU, + GPU + }; + + template + void verifyTestValue(const std::string& memberName, const T& expectedVal, const T& testVal, const TestType testType) + { + static constexpr float MaxAllowedError = 0.1f; + if (std::abs(double(expectedVal) - double(testVal)) <= MaxAllowedError) + return; + + std::stringstream ss; + switch (testType) + { + case TestType::CPU: + 
ss << "CPU TEST ERROR:\n"; + case TestType::GPU: + ss << "GPU TEST ERROR:\n"; + } + + ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " << testVal << " expected value: " << expectedVal << '\n'; + + m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); + } + + template + void verifyTestVector3dValue(const std::string& memberName, const nbl::hlsl::vector& expectedVal, const nbl::hlsl::vector& testVal, const TestType testType) + { + static constexpr float MaxAllowedError = 0.1f; + if (std::abs(double(expectedVal.x) - double(testVal.x)) <= MaxAllowedError && + std::abs(double(expectedVal.y) - double(testVal.y)) <= MaxAllowedError && + std::abs(double(expectedVal.z) - double(testVal.z)) <= MaxAllowedError) + return; + + std::stringstream ss; + switch (testType) + { + case TestType::CPU: + ss << "CPU TEST ERROR:\n"; + case TestType::GPU: + ss << "GPU TEST ERROR:\n"; + } + + ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " << + testVal.x << ' ' << testVal.y << ' ' << testVal.z << + " expected value: " << expectedVal.x << ' ' << expectedVal.y << ' ' << expectedVal.z << '\n'; + + m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); + } + + template + void verifyTestMatrix3x3Value(const std::string& memberName, const nbl::hlsl::matrix& expectedVal, const nbl::hlsl::matrix& testVal, const TestType testType) + { + for (int i = 0; i < 3; ++i) + { + auto expectedValRow = expectedVal[i]; + auto testValRow = testVal[i]; + verifyTestVector3dValue(memberName, expectedValRow, testValRow, testType); + } + } + + void performTests() + { + m_logger->log("intrinsics.hlsl TESTS:", system::ILogger::ELL_PERFORMANCE); + for (int i = 0; i < Iterations; ++i) + { + // Set input thest values that will be used in both CPU and GPU tests + InputTestValues testInput; + + // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values + 
TestValues expected; + + performCpuTests(testInput, expected); + performGpuTests(testInput, expected); + } + m_logger->log("intrinsics.hlsl TESTS DONE.", system::ILogger::ELL_PERFORMANCE); + } + +protected: + uint32_t m_queueFamily; + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_api; + video::IPhysicalDevice* m_physicalDevice; + core::smart_refctd_ptr m_assetMgr; + core::smart_refctd_ptr m_logger; + video::IDeviceMemoryAllocator::SAllocation m_inputBufferAllocation = {}; + video::IDeviceMemoryAllocator::SAllocation m_outputBufferAllocation = {}; + core::smart_refctd_ptr m_cmdbuf = nullptr; + core::smart_refctd_ptr m_cmdpool = nullptr; + core::smart_refctd_ptr m_ds = nullptr; + core::smart_refctd_ptr m_pplnLayout = nullptr; + core::smart_refctd_ptr m_pipeline; + core::smart_refctd_ptr m_semaphore; + video::IQueue* m_queue; + uint64_t m_semaphoreCounter; + + template + OutputStruct dispatch(const InputStruct& input) + { + // Update input buffer + if (!m_inputBufferAllocation.memory->map({ 0ull,m_inputBufferAllocation.memory->getAllocationSize() }, video::IDeviceMemoryAllocation::EMCAF_READ)) + logFail("Failed to map the Device Memory!\n"); + + const video::ILogicalDevice::MappedMemoryRange memoryRange(m_inputBufferAllocation.memory.get(), 0ull, m_inputBufferAllocation.memory->getAllocationSize()); + if (!m_inputBufferAllocation.memory->getMemoryPropertyFlags().hasFlags(video::IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memoryRange); + + std::memcpy(static_cast(m_inputBufferAllocation.memory->getMappedPointer()), &input, sizeof(InputStruct)); + + m_inputBufferAllocation.memory->unmap(); + + // record command buffer + m_cmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(video::IGPUCommandBuffer::USAGE::NONE); + m_cmdbuf->beginDebugMarker("test", core::vector4df_SIMD(0, 1, 0, 1)); + m_cmdbuf->bindComputePipeline(m_pipeline.get()); + 
m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); + m_cmdbuf->dispatch(1, 1, 1); + m_cmdbuf->endDebugMarker(); + m_cmdbuf->end(); + + video::IQueue::SSubmitInfo submitInfos[1] = {}; + const video::IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; + submitInfos[0].commandBuffers = cmdbufs; + const video::IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = m_semaphore.get(), .value = ++m_semaphoreCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; + submitInfos[0].signalSemaphores = signals; + + m_api->startCapture(); + m_queue->submit(submitInfos); + m_api->endCapture(); + + m_device->waitIdle(); + OutputStruct output; + std::memcpy(&output, static_cast(m_outputBufferAllocation.memory->getMappedPointer()), sizeof(OutputStruct)); + m_device->waitIdle(); + + return output; + } + +private: + template + inline void logFail(const char* msg, Args&&... args) + { + m_logger->log(msg, system::ILogger::ELL_ERROR, std::forward(args)...); + exit(-1); + } + + inline static constexpr int Iterations = 100u; + + void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) + { + TestValues cpuTestValues; + cpuTestValues.fillTestValues(commonTestInputValues); + verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU); + + } + + void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) + { + TestValues gpuTestValues; + gpuTestValues = dispatch(commonTestInputValues); + verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU); + } + + void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType) + { + verifyTestValue("bitCount", expectedTestValues.bitCount, testValues.bitCount, testType); + verifyTestValue("clamp", expectedTestValues.clamp, testValues.clamp, testType); + 
verifyTestValue("length", expectedTestValues.length, testValues.length, testType); + verifyTestValue("dot", expectedTestValues.dot, testValues.dot, testType); + verifyTestValue("determinant", expectedTestValues.determinant, testValues.determinant, testType); + verifyTestValue("findMSB", expectedTestValues.findMSB, testValues.findMSB, testType); + verifyTestValue("findLSB", expectedTestValues.findLSB, testValues.findLSB, testType); + verifyTestValue("min", expectedTestValues.min, testValues.min, testType); + verifyTestValue("max", expectedTestValues.max, testValues.max, testType); + verifyTestValue("rsqrt", expectedTestValues.rsqrt, testValues.rsqrt, testType); + verifyTestValue("frac", expectedTestValues.frac, testValues.frac, testType); + verifyTestValue("bitReverse", expectedTestValues.bitReverse, testValues.bitReverse, testType); + verifyTestValue("mix", expectedTestValues.mix, testValues.mix, testType); + verifyTestValue("sign", expectedTestValues.sign, testValues.sign, testType); + verifyTestValue("radians", expectedTestValues.radians, testValues.radians, testType); + verifyTestValue("degrees", expectedTestValues.degrees, testValues.degrees, testType); + verifyTestValue("step", expectedTestValues.step, testValues.step, testType); + verifyTestValue("smoothStep", expectedTestValues.smoothStep, testValues.smoothStep, testType); + + verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType); + verifyTestVector3dValue("cross", expectedTestValues.cross, testValues.cross, testType); + verifyTestVector3dValue("bitCountVec", expectedTestValues.bitCountVec, testValues.bitCountVec, testType); + verifyTestVector3dValue("clampVec", expectedTestValues.clampVec, testValues.clampVec, testType); + verifyTestVector3dValue("findMSBVec", expectedTestValues.findMSBVec, testValues.findMSBVec, testType); + verifyTestVector3dValue("findLSBVec", expectedTestValues.findLSBVec, testValues.findLSBVec, testType); + verifyTestVector3dValue("minVec", 
expectedTestValues.minVec, testValues.minVec, testType); + verifyTestVector3dValue("maxVec", expectedTestValues.maxVec, testValues.maxVec, testType); + verifyTestVector3dValue("rsqrtVec", expectedTestValues.rsqrtVec, testValues.rsqrtVec, testType); + verifyTestVector3dValue("bitReverseVec", expectedTestValues.bitReverseVec, testValues.bitReverseVec, testType); + verifyTestVector3dValue("fracVec", expectedTestValues.fracVec, testValues.fracVec, testType); + verifyTestVector3dValue("mixVec", expectedTestValues.mixVec, testValues.mixVec, testType); + + verifyTestVector3dValue("signVec", expectedTestValues.signVec, testValues.signVec, testType); + verifyTestVector3dValue("radiansVec", expectedTestValues.radiansVec, testValues.radiansVec, testType); + verifyTestVector3dValue("degreesVec", expectedTestValues.degreesVec, testValues.degreesVec, testType); + verifyTestVector3dValue("stepVec", expectedTestValues.stepVec, testValues.stepVec, testType); + verifyTestVector3dValue("smoothStepVec", expectedTestValues.smoothStepVec, testValues.smoothStepVec, testType); + verifyTestVector3dValue("faceForward", expectedTestValues.faceForward, testValues.faceForward, testType); + verifyTestVector3dValue("reflect", expectedTestValues.reflect, testValues.reflect, testType); + verifyTestVector3dValue("refract", expectedTestValues.refract, testValues.refract, testType); + + verifyTestMatrix3x3Value("mul", expectedTestValues.mul, testValues.mul, testType); + verifyTestMatrix3x3Value("transpose", expectedTestValues.transpose, testValues.transpose, testType); + verifyTestMatrix3x3Value("inverse", expectedTestValues.inverse, testValues.inverse, testType); + } +}; + +#endif \ No newline at end of file diff --git a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl index bd5184f80..9632bd372 100644 --- a/12_Mortons/app_resources/common.hlsl +++ b/12_Mortons/app_resources/common.hlsl @@ -1,13 +1,33 @@ -//#include "nbl/builtin/hlsl/morton.hlsl" -#include 
"nbl/builtin/hlsl/cpp_compat.hlsl" +//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h -NBL_CONSTEXPR uint32_t bufferSize = 256; +#ifndef _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_ -// Proper coverage would require writing tests for ALL possible sign, dimensions and width configurations -//using morton_t2 = nbl::hlsl::morton::code; // Fits in an int16_t -using vector_t2 = nbl::hlsl::vector; +// because DXC doesn't properly support `_Static_assert` +// TODO: add a message, and move to macros.h or cpp_compat +#define STATIC_ASSERT(...) { nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; } -struct PushConstantData +#include + +#include + +// tgmath.hlsl and intrinsics.hlsl tests + +using namespace nbl::hlsl; +struct InputTestValues +{ + +}; + +struct TestValues { - uint64_t deviceBufferAddress; -}; \ No newline at end of file + + void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input) + { + + } +}; + +#endif diff --git a/12_Mortons/app_resources/shader.hlsl b/12_Mortons/app_resources/shader.hlsl deleted file mode 100644 index e7f570eee..000000000 --- a/12_Mortons/app_resources/shader.hlsl +++ /dev/null @@ -1,18 +0,0 @@ -#include "app_resources/common.hlsl" -#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl" - -[[vk::push_constant]] PushConstantData pushConstants; - -[numthreads(bufferSize, 1, 1)] -void main(uint32_t3 ID : SV_DispatchThreadID) -{ - /* - LegacyBdaAccessor accessor = LegacyBdaAccessor::create(pushConstants.deviceBufferAddress); - - morton::code foo = morton::code::create(vector(-32768, -1)); - - //accessor.set(0, foo.value); - */ - uint32_t bar = _static_cast(0xCAFEDEADDEADBEEF); - accessor.set(0, bar); -} \ No newline at end of file diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp index d1fddba7a..8118ec939 100644 --- 
a/12_Mortons/main.cpp +++ b/12_Mortons/main.cpp @@ -45,7 +45,17 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, return false; if (!asset_base_t::onAppInitialized(std::move(system))) return false; - + { + using namespace nbl::hlsl; + + auto bar = morton::code::create(hlsl::vector(893728, 7843, 98032)); + auto foo = _static_cast>(bar); + std::cout << foo[0] << " " << foo[1] << " " << foo[2] << " " << std::endl; + + //auto bar = morton::code::create(hlsl::vector(893728, 7843, 98032)); + //std::cout << "High Encoded: " << std::bitset<32>(bar.value.data.x) << std::endl; + //std::cout << "Low Encoded: " << std::bitset<32>(bar.value.data.y) << std::endl; + } /* // ----------------------------------------------- CPP TESTS ---------------------------------------------------------------------- @@ -193,188 +203,8 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // Unsigned assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) >= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, true, true)); - - if(!TestHLSL) - return true; - */ - - - - - - - // ----------------------------------------------- HLSL COMPILATION + OPTIONAL TESTS ---------------------------------------------- - auto shader = createShader("app_resources/shader.hlsl"); - - // Create massive upload/download buffers - constexpr uint32_t DownstreamBufferSize = sizeof(uint32_t) << 23; - - m_utils = make_smart_refctd_ptr(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize); - if (!m_utils) - return logFail("Failed to create Utilities!"); - m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); - m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); - - // Create device-local buffer - { - IGPUBuffer::SCreationParams deviceLocalBufferParams = {}; - - IQueue* const queue = getComputeQueue(); - uint32_t queueFamilyIndex = queue->getFamilyIndex(); - - 
deviceLocalBufferParams.queueFamilyIndexCount = 1; - deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex; - deviceLocalBufferParams.size = sizeof(uint32_t) * bufferSize; - deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; - - m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams)); - auto mreqs = m_deviceLocalBuffer->getMemoryReqs(); - mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); - - m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress(); - } - - const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(PushConstantData) }; - - { - auto layout = m_device->createPipelineLayout({ &pcRange,1 }); - IGPUComputePipeline::SCreationParams params = {}; - params.layout = layout.get(); - params.shader.shader = shader.get(); - params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); - params.shader.requireFullSubgroups = true; - if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) - return logFail("Failed to create compute pipeline!\n"); - } - - const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); - // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices - // which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets. 
- // Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc. - // there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those. - // We'll align to max of coherent atom size even if the memory is coherent, - // and we also need to take into account BDA shader loads need to be aligned to the type being loaded. - m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float)); - - // Semaphor used here to know the FFT is done before download - m_timeline = m_device->createSemaphore(semaphorValue); - - IQueue* const queue = getComputeQueue(); - - const uint32_t inputSize = sizeof(uint32_t) * bufferSize; - - // Just need a single suballocation in this example - const uint32_t AllocationCount = 1; - - // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) - // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). 
- std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); - - // finally allocate our output range - const uint32_t outputSize = inputSize; - - auto outputOffset = m_downStreamingBuffer->invalid_value; - m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment); - - smart_refctd_ptr cmdbuf; - { - smart_refctd_ptr cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); - if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) { - return logFail("Failed to create Command Buffers!\n"); - } - cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger)); - cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdbuf->bindComputePipeline(m_pipeline.get()); - // This is the new fun part, pushing constants - const PushConstantData pc = { .deviceBufferAddress = m_deviceLocalBufferAddress }; - cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - // Remember we do a single workgroup per 1D array in these parts - cmdbuf->dispatch(1, 1, 1); - - // Pipeline barrier: wait for FFT shader to be done before copying to downstream buffer - IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {}; - - decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {}; - pipelineBarrierInfo.bufBarriers = { &barrier, 1u }; - - barrier.range.buffer = m_deviceLocalBuffer; - - barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; - barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; - barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; - barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; - - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo); - - IGPUCommandBuffer::SBufferCopy copyInfo = {}; - copyInfo.srcOffset = 
0; - copyInfo.dstOffset = 0; - copyInfo.size = m_deviceLocalBuffer->getSize(); - cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, ©Info); - cmdbuf->end(); - } - - semaphorValue++; - { - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = - { - .cmdbuf = cmdbuf.get() - }; - const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = - { - .semaphore = m_timeline.get(), - .value = semaphorValue, - .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - }; - - const IQueue::SSubmitInfo submitInfo = { - .waitSemaphores = {}, - .commandBuffers = {&cmdbufInfo,1}, - .signalSemaphores = {&signalInfo,1} - }; - - m_api->startCapture(); - queue->submit({ &submitInfo,1 }); - m_api->endCapture(); - } - - // We let all latches know what semaphore and counter value has to be passed for the functors to execute - const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue }; - - // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. - // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. - // Its nice because it will also remember to invalidate our memory mapping if its not coherent. - auto latchedConsumer = make_smart_refctd_ptr( - IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize), - // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals - [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void - { - // The unused variable is used for letting the consumer know the subsection of the output we've managed to download - // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. 
- assert(dstOffset == 0 && size == outputSize); - - std::cout << "Begin array GPU\n"; - uint32_t* const data = reinterpret_cast(const_cast(bufSrc)); - //std::cout << std::bitset<32>(data[0]) << "\n"; - std::cout << data[0] << "\n"; - /* - for (auto i = 0u; i < bufferSize; i++) { - std::cout << std::bitset<32>(data[i]) << "\n"; - } - */ - std::cout << "\nEnd array GPU\n"; - }, - // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it - // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. - // It could also be latched in the upstreaming deallocate, because its the same fence. - std::move(cmdbuf), m_downStreamingBuffer - ); - // We put a function we want to execute - m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get()); - return true; } @@ -387,12 +217,6 @@ class MortonTestApp final : public application_templates::MonoDeviceApplication, // Cleanup bool onAppTerminated() override { - // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` - // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) - if (TestHLSL) - { - while (m_downStreamingBuffer->cull_frees()) {} - } return device_base_t::onAppTerminated(); } diff --git a/22_CppCompat/CIntrinsicsTester.h b/22_CppCompat/CIntrinsicsTester.h index 5fe7bc08e..09219a9e7 100644 --- a/22_CppCompat/CIntrinsicsTester.h +++ b/22_CppCompat/CIntrinsicsTester.h @@ -147,6 +147,9 @@ class CIntrinsicsTester final : public ITester expected.step = glm::step(testInput.stepEdge, testInput.stepX); expected.smoothStep = glm::smoothstep(testInput.smoothStepEdge0, testInput.smoothStepEdge1, testInput.smoothStepX); + expected.addCarry.result = glm::uaddCarry(testInput.addCarryA, testInput.addCarryB, 
expected.addCarry.carry); + expected.subBorrow.result = glm::usubBorrow(testInput.subBorrowA, testInput.subBorrowB, expected.subBorrow.borrow); + expected.frac = testInput.frac - std::floor(testInput.frac); expected.bitReverse = glm::bitfieldReverse(testInput.bitReverse); @@ -189,6 +192,9 @@ class CIntrinsicsTester final : public ITester expected.reflect = glm::reflect(testInput.reflectI, testInput.reflectN); expected.refract = glm::refract(testInput.refractI, testInput.refractN, testInput.refractEta); + expected.addCarryVec.result = glm::uaddCarry(testInput.addCarryAVec, testInput.addCarryBVec, expected.addCarryVec.carry); + expected.subBorrowVec.result = glm::usubBorrow(testInput.subBorrowAVec, testInput.subBorrowBVec, expected.subBorrowVec.borrow); + auto mulGlm = nbl::hlsl::mul(testInput.mulLhs, testInput.mulRhs); expected.mul = reinterpret_cast(mulGlm); auto transposeGlm = glm::transpose(reinterpret_cast(testInput.transpose)); @@ -196,11 +202,6 @@ class CIntrinsicsTester final : public ITester auto inverseGlm = glm::inverse(reinterpret_cast(testInput.inverse)); expected.inverse = reinterpret_cast(inverseGlm); - expected.addCarry.result = glm::uaddCarry(testInput.addCarryA, testInput.addCarryB, expected.addCarry.carry); - expected.subBorrow.result = glm::usubBorrow(testInput.subBorrowA, testInput.subBorrowB, expected.subBorrow.borrow); - expected.addCarryVec.result = glm::uaddCarry(testInput.addCarryAVec, testInput.addCarryBVec, expected.addCarryVec.carry); - expected.subBorrowVec.result = glm::usubBorrow(testInput.subBorrowAVec, testInput.subBorrowBVec, expected.subBorrowVec.borrow); - performCpuTests(testInput, expected); performGpuTests(testInput, expected); } @@ -213,6 +214,7 @@ class CIntrinsicsTester final : public ITester void performCpuTests(const IntrinsicsIntputTestValues& commonTestInputValues, const IntrinsicsTestValues& expectedTestValues) { IntrinsicsTestValues cpuTestValues; + cpuTestValues.fillTestValues(commonTestInputValues); 
verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU); @@ -245,6 +247,11 @@ class CIntrinsicsTester final : public ITester verifyTestValue("degrees", expectedTestValues.degrees, testValues.degrees, testType); verifyTestValue("step", expectedTestValues.step, testValues.step, testType); verifyTestValue("smoothStep", expectedTestValues.smoothStep, testValues.smoothStep, testType); + verifyTestValue("addCarryResult", expectedTestValues.addCarry.result, testValues.addCarry.result, testType); + verifyTestValue("addCarryCarry", expectedTestValues.addCarry.carry, testValues.addCarry.carry, testType); + // Disabled: current glm implementation is wrong + //verifyTestValue("subBorrowResult", expectedTestValues.subBorrow.result, testValues.subBorrow.result, testType); + //verifyTestValue("subBorrowBorrow", expectedTestValues.subBorrow.borrow, testValues.subBorrow.borrow, testType); verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType); verifyTestVector3dValue("cross", expectedTestValues.cross, testValues.cross, testType); @@ -267,6 +274,11 @@ class CIntrinsicsTester final : public ITester verifyTestVector3dValue("faceForward", expectedTestValues.faceForward, testValues.faceForward, testType); verifyTestVector3dValue("reflect", expectedTestValues.reflect, testValues.reflect, testType); verifyTestVector3dValue("refract", expectedTestValues.refract, testValues.refract, testType); + verifyTestVector3dValue("addCarryVecResult", expectedTestValues.addCarryVec.result, testValues.addCarryVec.result, testType); + verifyTestVector3dValue("addCarryVecCarry", expectedTestValues.addCarryVec.carry, testValues.addCarryVec.carry, testType); + // Disabled: current glm implementation is wrong + //verifyTestVector3dValue("subBorrowVecResult", expectedTestValues.subBorrowVec.result, testValues.subBorrowVec.result, testType); + //verifyTestVector3dValue("subBorrowVecBorrow", expectedTestValues.subBorrowVec.borrow, 
testValues.subBorrowVec.borrow, testType); verifyTestMatrix3x3Value("mul", expectedTestValues.mul, testValues.mul, testType); verifyTestMatrix3x3Value("transpose", expectedTestValues.transpose, testValues.transpose, testType); From b2d87c36ad63c27b8547ea6583aa4c1ce716690d Mon Sep 17 00:00:00 2001 From: Fletterio Date: Thu, 24 Apr 2025 16:06:16 -0300 Subject: [PATCH 7/9] Added extensive tests for Morton codes --- 12_Mortons/Tester.h | 135 +++--- 12_Mortons/app_resources/common.hlsl | 453 +++++++++++++++++- 12_Mortons/app_resources/mortonTest.comp.hlsl | 16 + 12_Mortons/main.cpp | 298 +++--------- 22_CppCompat/ITester.h | 1 + 5 files changed, 604 insertions(+), 299 deletions(-) create mode 100644 12_Mortons/app_resources/mortonTest.comp.hlsl diff --git a/12_Mortons/Tester.h b/12_Mortons/Tester.h index 5c4773111..480328d18 100644 --- a/12_Mortons/Tester.h +++ b/12_Mortons/Tester.h @@ -1,5 +1,5 @@ -#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_ -#define _NBL_EXAMPLES_TESTS_12_MORTONS_I_TESTER_INCLUDED_ +#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_ #include #include "app_resources/common.hlsl" @@ -128,7 +128,7 @@ class Tester if (!inputBuff) logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - inputBuff->setObjectDebugName("emulated_float64_t output buffer"); + inputBuff->setObjectDebugName("morton input buffer"); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuff->getMemoryReqs(); reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); @@ -163,7 +163,7 @@ class Tester if (!outputBuff) logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - outputBuff->setObjectDebugName("emulated_float64_t output buffer"); + outputBuff->setObjectDebugName("morton output buffer"); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); 
@@ -208,8 +208,7 @@ class Tester template void verifyTestValue(const std::string& memberName, const T& expectedVal, const T& testVal, const TestType testType) { - static constexpr float MaxAllowedError = 0.1f; - if (std::abs(double(expectedVal) - double(testVal)) <= MaxAllowedError) + if (expectedVal == testVal) return; std::stringstream ss; @@ -221,7 +220,7 @@ class Tester ss << "GPU TEST ERROR:\n"; } - ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " << testVal << " expected value: " << expectedVal << '\n'; + ss << "nbl::hlsl::" << memberName << " produced incorrect output!" << '\n'; //test value: " << testVal << " expected value: " << expectedVal << '\n'; m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); } @@ -240,6 +239,7 @@ class Tester { case TestType::CPU: ss << "CPU TEST ERROR:\n"; + break; case TestType::GPU: ss << "GPU TEST ERROR:\n"; } @@ -251,32 +251,60 @@ class Tester m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); } - template - void verifyTestMatrix3x3Value(const std::string& memberName, const nbl::hlsl::matrix& expectedVal, const nbl::hlsl::matrix& testVal, const TestType testType) - { - for (int i = 0; i < 3; ++i) - { - auto expectedValRow = expectedVal[i]; - auto testValRow = testVal[i]; - verifyTestVector3dValue(memberName, expectedValRow, testValRow, testType); - } - } - void performTests() { - m_logger->log("intrinsics.hlsl TESTS:", system::ILogger::ELL_PERFORMANCE); + std::random_device rd; + std::mt19937 mt(rd()); + + std::uniform_int_distribution shortDistribution(uint16_t(0), std::numeric_limits::max()); + std::uniform_int_distribution intDistribution(uint32_t(0), std::numeric_limits::max()); + std::uniform_int_distribution longDistribution(uint64_t(0), std::numeric_limits::max()); + + m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE); for (int i = 0; i < Iterations; ++i) { // Set input thest values that will be used in both CPU and GPU tests InputTestValues testInput; - // use 
std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values TestValues expected; + uint32_t generatedShift = intDistribution(mt) & uint32_t(63); + testInput.shift = generatedShift; + { + uint64_t generatedA = longDistribution(mt); + uint64_t generatedB = longDistribution(mt); + + testInput.generatedA = generatedA; + testInput.generatedB = generatedB; + + expected.emulatedAnd = _static_cast(generatedA & generatedB); + expected.emulatedOr = _static_cast(generatedA | generatedB); + expected.emulatedXor = _static_cast(generatedA ^ generatedB); + expected.emulatedNot = _static_cast(~generatedA); + expected.emulatedPlus = _static_cast(generatedA + generatedB); + expected.emulatedMinus = _static_cast(generatedA - generatedB); + expected.emulatedLess = uint32_t(generatedA < generatedB); + expected.emulatedLessEqual = uint32_t(generatedA <= generatedB); + expected.emulatedGreater = uint32_t(generatedA > generatedB); + expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB); + + expected.emulatedLeftShifted = _static_cast(generatedA << generatedShift); + expected.emulatedUnsignedRightShifted = _static_cast(generatedA >> generatedShift); + expected.emulatedSignedRightShifted = _static_cast(static_cast(generatedA) >> generatedShift); + } + { + uint64_t coordX = longDistribution(mt); + uint64_t coordY = longDistribution(mt); + uint64_t coordZ = longDistribution(mt); + uint64_t coordW = longDistribution(mt); + + + } + performCpuTests(testInput, expected); performGpuTests(testInput, expected); } - m_logger->log("intrinsics.hlsl TESTS DONE.", system::ILogger::ELL_PERFORMANCE); + m_logger->log("TESTS DONE.", system::ILogger::ELL_PERFORMANCE); } protected: @@ -354,7 +382,7 @@ class Tester { TestValues cpuTestValues; cpuTestValues.fillTestValues(commonTestInputValues); - verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU); + verifyTestValues(expectedTestValues, 
cpuTestValues, TestType::CPU); } @@ -362,55 +390,26 @@ class Tester { TestValues gpuTestValues; gpuTestValues = dispatch(commonTestInputValues); - verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU); + verifyTestValues(expectedTestValues, gpuTestValues, TestType::GPU); } - void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType) + void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, TestType testType) { - verifyTestValue("bitCount", expectedTestValues.bitCount, testValues.bitCount, testType); - verifyTestValue("clamp", expectedTestValues.clamp, testValues.clamp, testType); - verifyTestValue("length", expectedTestValues.length, testValues.length, testType); - verifyTestValue("dot", expectedTestValues.dot, testValues.dot, testType); - verifyTestValue("determinant", expectedTestValues.determinant, testValues.determinant, testType); - verifyTestValue("findMSB", expectedTestValues.findMSB, testValues.findMSB, testType); - verifyTestValue("findLSB", expectedTestValues.findLSB, testValues.findLSB, testType); - verifyTestValue("min", expectedTestValues.min, testValues.min, testType); - verifyTestValue("max", expectedTestValues.max, testValues.max, testType); - verifyTestValue("rsqrt", expectedTestValues.rsqrt, testValues.rsqrt, testType); - verifyTestValue("frac", expectedTestValues.frac, testValues.frac, testType); - verifyTestValue("bitReverse", expectedTestValues.bitReverse, testValues.bitReverse, testType); - verifyTestValue("mix", expectedTestValues.mix, testValues.mix, testType); - verifyTestValue("sign", expectedTestValues.sign, testValues.sign, testType); - verifyTestValue("radians", expectedTestValues.radians, testValues.radians, testType); - verifyTestValue("degrees", expectedTestValues.degrees, testValues.degrees, testType); - verifyTestValue("step", expectedTestValues.step, testValues.step, testType); - verifyTestValue("smoothStep", 
expectedTestValues.smoothStep, testValues.smoothStep, testType); - - verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType); - verifyTestVector3dValue("cross", expectedTestValues.cross, testValues.cross, testType); - verifyTestVector3dValue("bitCountVec", expectedTestValues.bitCountVec, testValues.bitCountVec, testType); - verifyTestVector3dValue("clampVec", expectedTestValues.clampVec, testValues.clampVec, testType); - verifyTestVector3dValue("findMSBVec", expectedTestValues.findMSBVec, testValues.findMSBVec, testType); - verifyTestVector3dValue("findLSBVec", expectedTestValues.findLSBVec, testValues.findLSBVec, testType); - verifyTestVector3dValue("minVec", expectedTestValues.minVec, testValues.minVec, testType); - verifyTestVector3dValue("maxVec", expectedTestValues.maxVec, testValues.maxVec, testType); - verifyTestVector3dValue("rsqrtVec", expectedTestValues.rsqrtVec, testValues.rsqrtVec, testType); - verifyTestVector3dValue("bitReverseVec", expectedTestValues.bitReverseVec, testValues.bitReverseVec, testType); - verifyTestVector3dValue("fracVec", expectedTestValues.fracVec, testValues.fracVec, testType); - verifyTestVector3dValue("mixVec", expectedTestValues.mixVec, testValues.mixVec, testType); - - verifyTestVector3dValue("signVec", expectedTestValues.signVec, testValues.signVec, testType); - verifyTestVector3dValue("radiansVec", expectedTestValues.radiansVec, testValues.radiansVec, testType); - verifyTestVector3dValue("degreesVec", expectedTestValues.degreesVec, testValues.degreesVec, testType); - verifyTestVector3dValue("stepVec", expectedTestValues.stepVec, testValues.stepVec, testType); - verifyTestVector3dValue("smoothStepVec", expectedTestValues.smoothStepVec, testValues.smoothStepVec, testType); - verifyTestVector3dValue("faceForward", expectedTestValues.faceForward, testValues.faceForward, testType); - verifyTestVector3dValue("reflect", expectedTestValues.reflect, testValues.reflect, testType); - 
verifyTestVector3dValue("refract", expectedTestValues.refract, testValues.refract, testType); - - verifyTestMatrix3x3Value("mul", expectedTestValues.mul, testValues.mul, testType); - verifyTestMatrix3x3Value("transpose", expectedTestValues.transpose, testValues.transpose, testType); - verifyTestMatrix3x3Value("inverse", expectedTestValues.inverse, testValues.inverse, testType); + verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType); + verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType); + verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType); + verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType); + verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType); + verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, testValues.emulatedMinus, testType); + verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType); + verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType); + verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType); + verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType); + verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType); + verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType); + verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType); + + //verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType); } }; diff --git 
a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl index 9632bd372..be6a2f4a0 100644 --- a/12_Mortons/app_resources/common.hlsl +++ b/12_Mortons/app_resources/common.hlsl @@ -13,20 +13,471 @@ #include -// tgmath.hlsl and intrinsics.hlsl tests +NBL_CONSTEXPR uint16_t smallBits_2 = 8; +NBL_CONSTEXPR uint16_t mediumBits_2 = 16; +NBL_CONSTEXPR uint16_t fullBits_2 = 32; +NBL_CONSTEXPR uint16_t smallBits_3 = 5; +NBL_CONSTEXPR uint16_t mediumBits_3 = 10; +NBL_CONSTEXPR uint16_t fullBits_3 = 21; +NBL_CONSTEXPR uint16_t smallBits_4 = 4; +NBL_CONSTEXPR uint16_t mediumBits_4 = 8; +NBL_CONSTEXPR uint16_t fullBits_4 = 16; using namespace nbl::hlsl; struct InputTestValues { + // Both tests + uint32_t shift; + + // Emulated int tests + uint64_t generatedA; + uint64_t generatedB; + // Morton tests + uint64_t coordX; + uint64_t coordY; + uint64_t coordZ; + uint64_t coordW; }; struct TestValues { + // Emulated int tests + emulated_uint64_t emulatedAnd; + emulated_uint64_t emulatedOr; + emulated_uint64_t emulatedXor; + emulated_uint64_t emulatedNot; + emulated_uint64_t emulatedPlus; + emulated_uint64_t emulatedMinus; + // These are bools but stored as uint because you can't store bools, causes a SPIR-V issue + uint32_t emulatedLess; + uint32_t emulatedLessEqual; + uint32_t emulatedGreater; + uint32_t emulatedGreaterEqual; + emulated_uint64_t emulatedLeftShifted; + emulated_uint64_t emulatedUnsignedRightShifted; + emulated_int64_t emulatedSignedRightShifted; + + // Morton tests - for each dimension let's do one small, medium and full-szied (max bits possible) test to cover representation with + // 16, 32 and 64-bit types. 
Could make it more exhaustive with macros (test all possible bitwidths) + // For emulated mortons, we store only the emulated uint64 representing it, because DXC complains about bitcasts otherwise + + // Plus + morton::code mortonPlus_small_2; + morton::code mortonPlus_medium_2; + morton::code mortonPlus_full_2; + morton::code mortonPlus_emulated_2; + + morton::code mortonPlus_small_3; + morton::code mortonPlus_medium_3; + morton::code mortonPlus_full_3; + morton::code mortonPlus_emulated_3; + + morton::code mortonPlus_small_4; + morton::code mortonPlus_medium_4; + morton::code mortonPlus_full_4; + morton::code mortonPlus_emulated_4; + + // Minus + morton::code mortonMinus_small_2; + morton::code mortonMinus_medium_2; + morton::code mortonMinus_full_2; + morton::code mortonMinus_emulated_2; + + morton::code mortonMinus_small_3; + morton::code mortonMinus_medium_3; + morton::code mortonMinus_full_3; + morton::code mortonMinus_emulated_3; + + morton::code mortonMinus_small_4; + morton::code mortonMinus_medium_4; + morton::code mortonMinus_full_4; + morton::code mortonMinus_emulated_4; + + // Coordinate-wise equality (these are bools) + uint32_t2 mortonEqual_small_2; + uint32_t2 mortonEqual_medium_2; + uint32_t2 mortonEqual_full_2; + uint32_t2 mortonEqual_emulated_2; + + uint32_t3 mortonEqual_small_3; + uint32_t3 mortonEqual_medium_3; + uint32_t3 mortonEqual_full_3; + uint32_t3 mortonEqual_emulated_3; + + uint32_t4 mortonEqual_small_4; + uint32_t4 mortonEqual_medium_4; + uint32_t4 mortonEqual_full_4; + uint32_t4 mortonEqual_emulated_4; + + // Coordinate-wise unsigned inequality (just testing with less, again these are bools) + uint32_t2 mortonUnsignedLess_small_2; + uint32_t2 mortonUnsignedLess_medium_2; + uint32_t2 mortonUnsignedLess_full_2; + uint32_t2 mortonUnsignedLess_emulated_2; + + uint32_t3 mortonUnsignedLess_small_3; + uint32_t3 mortonUnsignedLess_medium_3; + uint32_t3 mortonUnsignedLess_full_3; + uint32_t3 mortonUnsignedLess_emulated_3; + + uint32_t4 
mortonUnsignedLess_small_4; + uint32_t4 mortonUnsignedLess_medium_4; + uint32_t4 mortonUnsignedLess_full_4; + uint32_t4 mortonUnsignedLess_emulated_4; + + // Coordinate-wise signed inequality (bools) + uint32_t2 mortonSignedLess_small_2; + uint32_t2 mortonSignedLess_medium_2; + uint32_t2 mortonSignedLess_full_2; + uint32_t2 mortonSignedLess_emulated_2; + + uint32_t3 mortonSignedLess_small_3; + uint32_t3 mortonSignedLess_medium_3; + uint32_t3 mortonSignedLess_full_3; + uint32_t3 mortonSignedLess_emulated_3; + + uint32_t4 mortonSignedLess_small_4; + uint32_t4 mortonSignedLess_medium_4; + uint32_t4 mortonSignedLess_full_4; + uint32_t4 mortonSignedLess_emulated_4; + + // Left-shift + morton::code mortonLeftShift_small_2; + morton::code mortonLeftShift_medium_2; + morton::code mortonLeftShift_full_2; + morton::code mortonLeftShift_emulated_2; + + morton::code mortonLeftShift_small_3; + morton::code mortonLeftShift_medium_3; + morton::code mortonLeftShift_full_3; + morton::code mortonLeftShift_emulated_3; + + morton::code mortonLeftShift_small_4; + morton::code mortonLeftShift_medium_4; + morton::code mortonLeftShift_full_4; + morton::code mortonLeftShift_emulated_4; + + // Unsigned right-shift + morton::code mortonUnsignedRightShift_small_2; + morton::code mortonUnsignedRightShift_medium_2; + morton::code mortonUnsignedRightShift_full_2; + morton::code mortonUnsignedRightShift_emulated_2; + + morton::code mortonUnsignedRightShift_small_3; + morton::code mortonUnsignedRightShift_medium_3; + morton::code mortonUnsignedRightShift_full_3; + morton::code mortonUnsignedRightShift_emulated_3; + + morton::code mortonUnsignedRightShift_small_4; + morton::code mortonUnsignedRightShift_medium_4; + morton::code mortonUnsignedRightShift_full_4; + morton::code mortonUnsignedRightShift_emulated_4; + + // Signed right-shift + morton::code mortonSignedRightShift_small_2; + morton::code mortonSignedRightShift_medium_2; + morton::code mortonSignedRightShift_full_2; + morton::code 
mortonSignedRightShift_emulated_2; + + morton::code mortonSignedRightShift_small_3; + morton::code mortonSignedRightShift_medium_3; + morton::code mortonSignedRightShift_full_3; + morton::code mortonSignedRightShift_emulated_3; + + morton::code mortonSignedRightShift_small_4; + morton::code mortonSignedRightShift_medium_4; + morton::code mortonSignedRightShift_full_4; + morton::code mortonSignedRightShift_emulated_4; void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input) { + emulated_uint64_t emulatedA = _static_cast(input.generatedA); + emulated_uint64_t emulatedB = _static_cast(input.generatedB); + + // Emulated int tests + emulatedAnd = emulatedA & emulatedB; + emulatedOr = emulatedA | emulatedB; + emulatedXor = emulatedA ^ emulatedB; + emulatedNot = emulatedA.operator~(); + emulatedPlus = emulatedA + emulatedB; + emulatedMinus = emulatedA - emulatedB; + emulatedLess = uint32_t(emulatedA < emulatedB); + emulatedLessEqual = uint32_t(emulatedA <= emulatedB); + emulatedGreater = uint32_t(emulatedA > emulatedB); + emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB); + + left_shift_operator leftShift; + emulatedLeftShifted = leftShift(emulatedA, input.shift); + + arithmetic_right_shift_operator unsignedRightShift; + emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift); + + arithmetic_right_shift_operator signedRightShift; + emulatedSignedRightShifted = signedRightShift(_static_cast(emulatedA), input.shift); + + // Morton tests + uint64_t2 Vec2A = { input.coordX, input.coordY }; + uint64_t2 Vec2B = { input.coordZ, input.coordW }; + + uint64_t3 Vec3A = { input.coordX, input.coordY, input.coordZ }; + uint64_t3 Vec3B = { input.coordY, input.coordZ, input.coordW }; + + uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW }; + uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX }; + + int64_t2 Vec2ASigned = int64_t2(Vec2A); + int64_t2 Vec2BSigned = int64_t2(Vec2B); + + int64_t3 Vec3ASigned = 
int64_t3(Vec3A); + int64_t3 Vec3BSigned = int64_t3(Vec3B); + + int64_t4 Vec4ASigned = int64_t4(Vec4A); + int64_t4 Vec4BSigned = int64_t4(Vec4B); + + morton::code morton_small_2A = morton::code::create(Vec2A); + morton::code morton_medium_2A = morton::code::create(Vec2A); + morton::code morton_full_2A = morton::code::create(Vec2A); + morton::code morton_emulated_2A = morton::code::create(Vec2A); + morton::code morton_small_2B = morton::code::create(Vec2B); + morton::code morton_medium_2B = morton::code::create(Vec2B); + morton::code morton_full_2B = morton::code::create(Vec2B); + morton::code morton_emulated_2B = morton::code::create(Vec2B); + + morton::code morton_small_3A = morton::code::create(Vec3A); + morton::code morton_medium_3A = morton::code::create(Vec3A); + morton::code morton_full_3A = morton::code::create(Vec3A); + morton::code morton_emulated_3A = morton::code::create(Vec3A); + morton::code morton_small_3B = morton::code::create(Vec3B); + morton::code morton_medium_3B = morton::code::create(Vec3B); + morton::code morton_full_3B = morton::code::create(Vec3B); + morton::code morton_emulated_3B = morton::code::create(Vec3B); + + morton::code morton_small_4A = morton::code::create(Vec4A); + morton::code morton_medium_4A = morton::code::create(Vec4A); + morton::code morton_full_4A = morton::code::create(Vec4A); + morton::code morton_emulated_4A = morton::code::create(Vec4A); + morton::code morton_small_4B = morton::code::create(Vec4B); + morton::code morton_medium_4B = morton::code::create(Vec4B); + morton::code morton_full_4B = morton::code::create(Vec4B); + morton::code morton_emulated_4B = morton::code::create(Vec4B); + + morton::code morton_small_2ASigned = morton::code::create(Vec2ASigned); + morton::code morton_medium_2ASigned = morton::code::create(Vec2ASigned); + morton::code morton_full_2ASigned = morton::code::create(Vec2ASigned); + morton::code morton_emulated_2ASigned = morton::code::create(Vec2ASigned); + morton::code morton_small_2BSigned = 
morton::code::create(Vec2BSigned); + morton::code morton_medium_2BSigned = morton::code::create(Vec2BSigned); + morton::code morton_full_2BSigned = morton::code::create(Vec2BSigned); + morton::code morton_emulated_2BSigned = morton::code::create(Vec2BSigned); + + morton::code morton_small_3ASigned = morton::code::create(Vec3ASigned); + morton::code morton_medium_3ASigned = morton::code::create(Vec3ASigned); + morton::code morton_full_3ASigned = morton::code::create(Vec3ASigned); + morton::code morton_emulated_3ASigned = morton::code::create(Vec3ASigned); + morton::code morton_small_3BSigned = morton::code::create(Vec3BSigned); + morton::code morton_medium_3BSigned = morton::code::create(Vec3BSigned); + morton::code morton_full_3BSigned = morton::code::create(Vec3BSigned); + morton::code morton_emulated_3BSigned = morton::code::create(Vec3BSigned); + + morton::code morton_small_4ASigned = morton::code::create(Vec4ASigned); + morton::code morton_medium_4ASigned = morton::code::create(Vec4ASigned); + morton::code morton_full_4ASigned = morton::code::create(Vec4ASigned); + morton::code morton_emulated_4ASigned = morton::code::create(Vec4ASigned); + morton::code morton_small_4BSigned = morton::code::create(Vec4BSigned); + morton::code morton_medium_4BSigned = morton::code::create(Vec4BSigned); + morton::code morton_full_4BSigned = morton::code::create(Vec4BSigned); + morton::code morton_emulated_4BSigned = morton::code::create(Vec4BSigned); + + /* + left_shift_operator > leftShiftTemp; + portable_vector_t interleaved = _static_cast >(uint16_t4(Vec4B)) & morton::impl::coding_mask_v<4, fullBits_4, morton::impl::CodingStages, emulated_uint64_t>; + + #define ENCODE_LOOP_ITERATION(I) NBL_IF_CONSTEXPR(fullBits_4 > (uint16_t(1) << I))\ + {\ + interleaved = interleaved | leftShiftTemp(interleaved, (uint16_t(1) << I) * (4 - 1));\ + interleaved = interleaved & _static_cast(morton::impl::coding_mask<4, fullBits_4, I>::value);\ + } + + ENCODE_LOOP_ITERATION(4) + 
ENCODE_LOOP_ITERATION(3) + ENCODE_LOOP_ITERATION(2) + ENCODE_LOOP_ITERATION(1) + ENCODE_LOOP_ITERATION(0) + + #undef ENCODE_LOOP_ITERATION + // After interleaving, shift each coordinate left by their index + return leftShiftTemp(interleaved, truncate >(vector(0, 1, 2, 3))); + + + array_get, emulated_uint64_t> getter; + emulatedAnd = getter(interleaved, 0); + */ + + // Plus + mortonPlus_small_2 = morton_small_2A + morton_small_2B; + mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B; + mortonPlus_full_2 = morton_full_2A + morton_full_2B; + mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B; + + mortonPlus_small_3 = morton_small_3A + morton_small_3B; + mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B; + mortonPlus_full_3 = morton_full_3A + morton_full_3B; + mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B; + + mortonPlus_small_4 = morton_small_4A + morton_small_4B; + mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B; + mortonPlus_full_4 = morton_full_4A + morton_full_4B; + mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B; + + // Minus + mortonMinus_small_2 = morton_small_2A - morton_small_2B; + mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B; + mortonMinus_full_2 = morton_full_2A - morton_full_2B; + mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B; + + mortonMinus_small_3 = morton_small_3A - morton_small_3B; + mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B; + mortonMinus_full_3 = morton_full_3A - morton_full_3B; + mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B; + + mortonMinus_small_4 = morton_small_4A - morton_small_4B; + mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B; + mortonMinus_full_4 = morton_full_4A - morton_full_4B; + mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B; + + // Coordinate-wise equality + mortonEqual_small_2 = uint32_t2(morton_small_2A.equal(uint16_t2(Vec2B))); + mortonEqual_medium_2 = 
uint32_t2(morton_medium_2A.equal(uint16_t2(Vec2B))); + mortonEqual_full_2 = uint32_t2(morton_full_2A.equal(uint32_t2(Vec2B))); + mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal(uint32_t2(Vec2B))); + + mortonEqual_small_3 = uint32_t3(morton_small_3A.equal(uint16_t3(Vec3B))); + mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal(uint16_t3(Vec3B))); + mortonEqual_full_3 = uint32_t3(morton_full_3A.equal(uint32_t3(Vec3B))); + mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal(uint32_t3(Vec3B))); + + mortonEqual_small_4 = uint32_t4(morton_small_4A.equal(uint16_t4(Vec4B))); + mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal(uint16_t4(Vec4B))); + mortonEqual_full_4 = uint32_t4(morton_full_4A.equal(uint16_t4(Vec4B))); + mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal(uint16_t4(Vec4B))); + + // Coordinate-wise unsigned inequality (just testing with less) + mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan(uint16_t2(Vec2B))); + mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan(uint16_t2(Vec2B))); + mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan(uint32_t2(Vec2B))); + mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan(uint32_t2(Vec2B))); + + mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan(uint16_t3(Vec3B))); + mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan(uint16_t3(Vec3B))); + mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan(uint32_t3(Vec3B))); + mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan(uint32_t3(Vec3B))); + + mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan(uint16_t4(Vec4B))); + mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan(uint16_t4(Vec4B))); + mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan(uint16_t4(Vec4B))); + mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan(uint16_t4(Vec4B))); + + // Coordinate-wise signed inequality + 
mortonSignedLess_small_2 = uint32_t2(morton_small_2ASigned.lessThan(int16_t2(Vec2BSigned))); + mortonSignedLess_medium_2 = uint32_t2(morton_medium_2ASigned.lessThan(int16_t2(Vec2BSigned))); + mortonSignedLess_full_2 = uint32_t2(morton_full_2ASigned.lessThan(int32_t2(Vec2BSigned))); + //mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2ASigned.lessThan(int32_t2(Vec2BSigned))); + + mortonSignedLess_small_3 = uint32_t3(morton_small_3ASigned.lessThan(int16_t3(Vec3BSigned))); + mortonSignedLess_medium_3 = uint32_t3(morton_medium_3ASigned.lessThan(int16_t3(Vec3BSigned))); + mortonSignedLess_full_3 = uint32_t3(morton_full_3ASigned.lessThan(int32_t3(Vec3BSigned))); + //mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3ASigned.lessThan(int32_t3(Vec3BSigned))); + + mortonSignedLess_small_4 = uint32_t4(morton_small_4ASigned.lessThan(int16_t4(Vec4BSigned))); + mortonSignedLess_medium_4 = uint32_t4(morton_medium_4ASigned.lessThan(int16_t4(Vec4BSigned))); + mortonSignedLess_full_4 = uint32_t4(morton_full_4ASigned.lessThan(int16_t4(Vec4BSigned))); + //mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4ASigned.lessThan(int16_t4(Vec4BSigned))); + + // Left-shift + uint16_t castedShift = uint16_t(input.shift); + left_shift_operator > leftShiftSmall2; + mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift); + left_shift_operator > leftShiftMedium2; + mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift); + left_shift_operator > leftShiftFull2; + mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift); + left_shift_operator > leftShiftEmulated2; + mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift); + + left_shift_operator > leftShiftSmall3; + mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift); + left_shift_operator > leftShiftMedium3; + mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift); + left_shift_operator > leftShiftFull3; + 
mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift); + left_shift_operator > leftShiftEmulated3; + mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift); + + left_shift_operator > leftShiftSmall4; + mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift); + left_shift_operator > leftShiftMedium4; + mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift); + left_shift_operator > leftShiftFull4; + mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift); + left_shift_operator > leftShiftEmulated4; + mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift); + + // Unsigned right-shift + arithmetic_right_shift_operator > rightShiftSmall2; + mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift); + arithmetic_right_shift_operator > rightShiftMedium2; + mortonUnsignedRightShift_medium_2 = rightShiftMedium2(morton_medium_2A, castedShift); + arithmetic_right_shift_operator > rightShiftFull2; + mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift); + arithmetic_right_shift_operator > rightShiftEmulated2; + mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift); + + arithmetic_right_shift_operator > rightShiftSmall3; + mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift); + arithmetic_right_shift_operator > rightShiftMedium3; + mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift); + arithmetic_right_shift_operator > rightShiftFull3; + mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift); + arithmetic_right_shift_operator > rightShiftEmulated3; + mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift); + + arithmetic_right_shift_operator > rightShiftSmall4; + mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift); + 
arithmetic_right_shift_operator > rightShiftMedium4; + mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift); + arithmetic_right_shift_operator > rightShiftFull4; + mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift); + arithmetic_right_shift_operator > rightShiftEmulated4; + mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift); + + // Signed right-shift + arithmetic_right_shift_operator > rightShiftSignedSmall2; + mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedMedium2; + mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedFull2; + mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedEmulated2; + //mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2ASigned, castedShift); + + arithmetic_right_shift_operator > rightShiftSignedSmall3; + mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedMedium3; + mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedFull3; + mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedEmulated3; + //mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedSmall4; + mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedMedium4; + mortonSignedRightShift_medium_4 = 
rightShiftSignedMedium4(morton_medium_4ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedFull4; + mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4ASigned, castedShift); + arithmetic_right_shift_operator > rightShiftSignedEmulated4; + //mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4ASigned, castedShift); } }; diff --git a/12_Mortons/app_resources/mortonTest.comp.hlsl b/12_Mortons/app_resources/mortonTest.comp.hlsl new file mode 100644 index 000000000..7041568b8 --- /dev/null +++ b/12_Mortons/app_resources/mortonTest.comp.hlsl @@ -0,0 +1,16 @@ +//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#pragma shader_stage(compute) + +#include "common.hlsl" + +[[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; +[[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; + +[numthreads(256, 1, 1)] +void main(uint3 invocationID : SV_DispatchThreadID) +{ + if (invocationID.x == 0) + outputTestValues[0].fillTestValues(inputTestValues[0]); +} diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp index 8118ec939..f83c49b9e 100644 --- a/12_Mortons/main.cpp +++ b/12_Mortons/main.cpp @@ -1,242 +1,80 @@ -// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#include +#include +#include +#include - -// I've moved out a tiny part of this example into a shared header for reuse, please open and read it. 
#include "nbl/application_templates/MonoDeviceApplication.hpp" #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" -#include - -// Right now the test only checks that HLSL compiles the file -constexpr bool TestHLSL = true; +#include "Tester.h" -using namespace nbl; -using namespace core; -using namespace system; -using namespace asset; -using namespace video; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::video; +using namespace nbl::application_templates; -// this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play "nice" wil all platforms -class MortonTestApp final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +class MortonTest final : public MonoDeviceApplication, public MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - - inline core::smart_refctd_ptr createShader( - const char* includeMainName) - { - std::string prelude = "#include \""; - auto CPUShader = core::make_smart_refctd_ptr((prelude + includeMainName + "\"\n").c_str(), IShader::E_SHADER_STAGE::ESS_COMPUTE, IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName); - assert(CPUShader); - return m_device->createShader(CPUShader.get()); - } - public: - MortonTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - - // we stuff all our work here because its a "single shot" app - bool onAppInitialized(smart_refctd_ptr&& system) override - { - // Remember to call the base class 
initialization! - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - if (!asset_base_t::onAppInitialized(std::move(system))) - return false; - { - using namespace nbl::hlsl; - - auto bar = morton::code::create(hlsl::vector(893728, 7843, 98032)); - auto foo = _static_cast>(bar); - std::cout << foo[0] << " " << foo[1] << " " << foo[2] << " " << std::endl; - - //auto bar = morton::code::create(hlsl::vector(893728, 7843, 98032)); - //std::cout << "High Encoded: " << std::bitset<32>(bar.value.data.x) << std::endl; - //std::cout << "Low Encoded: " << std::bitset<32>(bar.value.data.y) << std::endl; - } - /* - - // ----------------------------------------------- CPP TESTS ---------------------------------------------------------------------- - - // Coordinate extraction and whole vector decode tests - { - morton_t morton(vector_t(-1011, 765, 248)); - unsigned_morton_t unsignedMorton(unsigned_vector_t(154, 789, 1011)); - - assert(morton.getCoordinate(0) == -1011 && morton.getCoordinate(1) == 765 && morton.getCoordinate(2) == 248); - assert(unsignedMorton.getCoordinate(0) == 154u && unsignedMorton.getCoordinate(1) == 789u && unsignedMorton.getCoordinate(2) == 1011u); - - assert(static_cast(morton) == vector_t(-1011, 765, 248) && static_cast(unsignedMorton) == unsigned_vector_t(154, 789, 1011)); - } - - // *********************************************************************************************************************************** - // ************************************************* Arithmetic operator tests ******************************************************* - // *********************************************************************************************************************************** - - // ---------------------------------------------------------------------------------------------------- - // --------------------------------------- ADDITION --------------------------------------------------- - // 
---------------------------------------------------------------------------------------------------- - - // ---------------------------------------- Signed ----------------------------------------------------- - - // No overflow - assert(static_cast(morton_t(vector_t(-1011, 765, 248)) + morton_t(vector_t(1000, -985, 200))) == vector_t(-11, -220, 448)); - - // Type 1 overflow: Addition of representable coordinates goes out of range - assert(static_cast(morton_t(vector_t(-900, 70, 500)) + morton_t(vector_t(-578, -50, 20))) == vector_t(570, 20, -504)); - - // Type 2 overflow: Addition of irrepresentable range gives correct result - assert(static_cast(morton_t(vector_t(54, 900, -475)) + morton_t(vector_t(46, -1437, 699))) == vector_t(100, -537, 224)); - - // ---------------------------------------- Unsigned ----------------------------------------------------- - - // No overflow - assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(1563, 754, 220))) == unsigned_vector_t(1945, 1664, 763)); - - // Type 1 overflow: Addition of representable coordinates goes out of range - assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == unsigned_vector_t(334, 862, 519)); - - // Type 2 overflow: Addition of irrepresentable range gives correct result - assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) + unsigned_morton_t(unsigned_vector_t(-143, -345, -233))) == unsigned_vector_t(239, 565, 310)); - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- SUBTRACTION ------------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // ---------------------------------------- Signed ----------------------------------------------------- - - // No 
overflow - assert(static_cast(morton_t(vector_t(1000, 764, -365)) - morton_t(vector_t(834, -243, 100))) == vector_t(166, 1007, -465)); - - // Type 1 overflow: Subtraction of representable coordinates goes out of range - assert(static_cast(morton_t(vector_t(-900, 70, 500)) - morton_t(vector_t(578, -50, -20))) == vector_t(570, 120, -504)); - - // Type 2 overflow: Subtraction of irrepresentable range gives correct result - assert(static_cast(morton_t(vector_t(54, 900, -475)) - morton_t(vector_t(-46, 1437, -699))) == vector_t(100, -537, 224)); - - // ---------------------------------------- Unsigned ----------------------------------------------------- - - // No overflow - assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(322, 564, 299))) == unsigned_vector_t(60, 346, 244)); - - // Type 1 overflow: Subtraction of representable coordinates goes out of range - assert(static_cast(unsigned_morton_t(unsigned_vector_t(382, 910, 543)) - unsigned_morton_t(unsigned_vector_t(2000, 2000, 1000))) == unsigned_vector_t(430, 958, 567)); - - // Type 2 overflow: Subtraction of irrepresentable range gives correct result - assert(static_cast(unsigned_morton_t(unsigned_vector_t(54, 900, 475)) - unsigned_morton_t(unsigned_vector_t(-865, -100, -10))) == unsigned_vector_t(919, 1000, 485)); - - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- UNARY NEGATION ---------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // Only makes sense for signed - assert(static_cast(- morton_t(vector_t(-1024, 543, -475))) == vector_t(-1024, -543, 475)); - - // *********************************************************************************************************************************** - // ************************************************* Comparison 
operator tests ******************************************************* - // *********************************************************************************************************************************** - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- OPERATOR< --------------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // Signed - - // Same sign, negative - assert(morton_t(vector_t(-954, -455, -333)) < morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, false, false)); - // Same sign, positive - assert(morton_t(vector_t(954, 455, 333)) < morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, false, true)); - // Differing signs - assert(morton_t(vector_t(954, -32, 0)) < morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false)); - - // Unsigned - assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) < unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, false, false)); - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- OPERATOR<= -------------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // Signed - - // Same sign, negative - assert(morton_t(vector_t(-954, -455, -333)) <= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(true, true, false)); - // Same sign, positive - assert(morton_t(vector_t(954, 455, 333)) <= morton_t(vector_t(433, 455, 433)) == bool_vector_t(false, true, true)); - // Differing signs - assert(morton_t(vector_t(954, -32, 0)) <= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(false, true, false)); - - // Unsigned - assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) <= 
unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(true, true, false)); - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- OPERATOR> --------------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // Signed - - // Same sign, negative - assert(morton_t(vector_t(-954, -455, -333)) > morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, false, true)); - // Same sign, positive - assert(morton_t(vector_t(954, 455, 333)) > morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, false, false)); - // Differing signs - assert(morton_t(vector_t(954, -32, 0)) > morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true)); - - // Unsigned - assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) > unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, false, true)); - - // ---------------------------------------------------------------------------------------------------- - // -------------------------------------- OPERATOR>= -------------------------------------------------- - // ---------------------------------------------------------------------------------------------------- - - // Signed - - // Same sign, negative - assert(morton_t(vector_t(-954, -455, -333)) >= morton_t(vector_t(-433, -455, -433)) == bool_vector_t(false, true, true)); - // Same sign, positive - assert(morton_t(vector_t(954, 455, 333)) >= morton_t(vector_t(433, 455, 433)) == bool_vector_t(true, true, false)); - // Differing signs - assert(morton_t(vector_t(954, -32, 0)) >= morton_t(vector_t(-44, 0, -1)) == bool_vector_t(true, false, true)); - - // Unsigned - assert(unsigned_morton_t(unsigned_vector_t(239, 435, 66)) >= unsigned_morton_t(unsigned_vector_t(240, 435, 50)) == bool_vector_t(false, true, true)); - - */ - - return true; - } - - // Platforms like 
WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" - void workLoopBody() override {} - - // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. - bool keepRunning() override {return false;} - - // Cleanup - bool onAppTerminated() override - { - return device_base_t::onAppTerminated(); - } - - private: - smart_refctd_ptr m_pipeline; - - smart_refctd_ptr m_utils; - - StreamingTransientDataBufferMT<>* m_downStreamingBuffer; - smart_refctd_ptr m_deviceLocalBuffer; - - // These are Buffer Device Addresses - uint64_t m_downStreamingBufferAddress; - uint64_t m_deviceLocalBufferAddress; - - uint32_t m_alignment; - - smart_refctd_ptr m_timeline; - uint64_t semaphorValue = 0; + using device_base_t = MonoDeviceApplication; + using asset_base_t = MonoAssetManagerAndBuiltinResourceApplication; +public: + MortonTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) { + } + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! 
+ if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + { + + } + + Tester::PipelineSetupData pplnSetupData; + pplnSetupData.device = m_device; + pplnSetupData.api = m_api; + pplnSetupData.assetMgr = m_assetMgr; + pplnSetupData.logger = m_logger; + pplnSetupData.physicalDevice = m_physicalDevice; + pplnSetupData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); + { + Tester mortonTester; + pplnSetupData.testShaderPath = "app_resources/mortonTest.comp.hlsl"; + mortonTester.setupPipeline(pplnSetupData); + mortonTester.performTests(); + } + + + return true; + } + + void onAppTerminated_impl() override + { + m_device->waitIdle(); + } + + void workLoopBody() override + { + m_keepRunning = false; + } + + bool keepRunning() override + { + return m_keepRunning; + } + + +private: + bool m_keepRunning = true; }; - -NBL_MAIN_FUNC(MortonTestApp) \ No newline at end of file +NBL_MAIN_FUNC(MortonTest) \ No newline at end of file diff --git a/22_CppCompat/ITester.h b/22_CppCompat/ITester.h index a216fbf40..207cdee51 100644 --- a/22_CppCompat/ITester.h +++ b/22_CppCompat/ITester.h @@ -217,6 +217,7 @@ class ITester { case TestType::CPU: ss << "CPU TEST ERROR:\n"; + break; case TestType::GPU: ss << "GPU TEST ERROR:\n"; } From c68c336317024ae80fb017b1cb71e6b32a152224 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Mon, 28 Apr 2025 15:16:34 -0300 Subject: [PATCH 8/9] Done with tests --- 12_Mortons/CTester.h | 401 ++++++++++++++++++ 12_Mortons/{Tester.h => ITester.h} | 133 +----- 12_Mortons/app_resources/common.hlsl | 299 ++----------- .../{mortonTest.comp.hlsl => test.comp.hlsl} | 5 +- 12_Mortons/app_resources/testCommon.hlsl | 242 +++++++++++ 12_Mortons/main.cpp | 13 +- 6 files changed, 691 insertions(+), 402 deletions(-) create mode 100644 12_Mortons/CTester.h rename 12_Mortons/{Tester.h => ITester.h} (66%) rename 12_Mortons/app_resources/{mortonTest.comp.hlsl => 
test.comp.hlsl} (79%) create mode 100644 12_Mortons/app_resources/testCommon.hlsl diff --git a/12_Mortons/CTester.h b/12_Mortons/CTester.h new file mode 100644 index 000000000..5a61be501 --- /dev/null +++ b/12_Mortons/CTester.h @@ -0,0 +1,401 @@ +#ifndef _NBL_EXAMPLES_TESTS_12_MORTON_C_TESTER_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_12_MORTON_C_TESTER_INCLUDED_ + +#include +#include "app_resources/testCommon.hlsl" +#include "nbl/application_templates/MonoDeviceApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "ITester.h" + +using namespace nbl; + +class CTester final : public ITester +{ +public: + void performTests() + { + std::random_device rd; + std::mt19937 mt(rd()); + + std::uniform_int_distribution shortDistribution(uint16_t(0), std::numeric_limits::max()); + std::uniform_int_distribution intDistribution(uint32_t(0), std::numeric_limits::max()); + std::uniform_int_distribution longDistribution(uint64_t(0), std::numeric_limits::max()); + + m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE); + for (int i = 0; i < Iterations; ++i) + { + // Set input test values that will be used in both CPU and GPU tests + InputTestValues testInput; + // use std library or glm functions to determine expected test values, the output of the tested functions will be verified against these values + TestValues expected; + + uint32_t generatedShift = intDistribution(mt) & uint32_t(63); + testInput.shift = generatedShift; + { + uint64_t generatedA = longDistribution(mt); + uint64_t generatedB = longDistribution(mt); + + testInput.generatedA = generatedA; + testInput.generatedB = generatedB; + + expected.emulatedAnd = _static_cast(generatedA & generatedB); + expected.emulatedOr = _static_cast(generatedA | generatedB); + expected.emulatedXor = _static_cast(generatedA ^ generatedB); + expected.emulatedNot = _static_cast(~generatedA); + expected.emulatedPlus = _static_cast(generatedA + generatedB); + 
expected.emulatedMinus = _static_cast(generatedA - generatedB); + expected.emulatedLess = uint32_t(generatedA < generatedB); + expected.emulatedLessEqual = uint32_t(generatedA <= generatedB); + expected.emulatedGreater = uint32_t(generatedA > generatedB); + expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB); + + expected.emulatedLeftShifted = _static_cast(generatedA << generatedShift); + expected.emulatedUnsignedRightShifted = _static_cast(generatedA >> generatedShift); + expected.emulatedSignedRightShifted = _static_cast(static_cast(generatedA) >> generatedShift); + } + { + testInput.coordX = longDistribution(mt); + testInput.coordY = longDistribution(mt); + testInput.coordZ = longDistribution(mt); + testInput.coordW = longDistribution(mt); + + uint64_t2 Vec2A = { testInput.coordX, testInput.coordY }; + uint64_t2 Vec2B = { testInput.coordZ, testInput.coordW }; + + uint16_t2 Vec2ASmall = uint16_t2(Vec2A & smallBitsMask_2 ); + uint16_t2 Vec2BSmall = uint16_t2(Vec2B & smallBitsMask_2 ); + uint16_t2 Vec2AMedium = uint16_t2(Vec2A & mediumBitsMask_2); + uint16_t2 Vec2BMedium = uint16_t2(Vec2B & mediumBitsMask_2); + uint32_t2 Vec2AFull = uint32_t2(Vec2A & fullBitsMask_2); + uint32_t2 Vec2BFull = uint32_t2(Vec2B & fullBitsMask_2); + + uint64_t3 Vec3A = { testInput.coordX, testInput.coordY, testInput.coordZ }; + uint64_t3 Vec3B = { testInput.coordY, testInput.coordZ, testInput.coordW }; + + uint16_t3 Vec3ASmall = uint16_t3(Vec3A & smallBitsMask_3); + uint16_t3 Vec3BSmall = uint16_t3(Vec3B & smallBitsMask_3); + uint16_t3 Vec3AMedium = uint16_t3(Vec3A & mediumBitsMask_3); + uint16_t3 Vec3BMedium = uint16_t3(Vec3B & mediumBitsMask_3); + uint32_t3 Vec3AFull = uint32_t3(Vec3A & fullBitsMask_3); + uint32_t3 Vec3BFull = uint32_t3(Vec3B & fullBitsMask_3); + + uint64_t4 Vec4A = { testInput.coordX, testInput.coordY, testInput.coordZ, testInput.coordW }; + uint64_t4 Vec4B = { testInput.coordY, testInput.coordZ, testInput.coordW, testInput.coordX }; + + uint16_t4 
Vec4ASmall = uint16_t4(Vec4A & smallBitsMask_4); + uint16_t4 Vec4BSmall = uint16_t4(Vec4B & smallBitsMask_4); + uint16_t4 Vec4AMedium = uint16_t4(Vec4A & mediumBitsMask_4); + uint16_t4 Vec4BMedium = uint16_t4(Vec4B & mediumBitsMask_4); + uint16_t4 Vec4AFull = uint16_t4(Vec4A & fullBitsMask_4); + uint16_t4 Vec4BFull = uint16_t4(Vec4B & fullBitsMask_4); + + // Signed vectors can't just have their highest bits masked off, for them to preserve sign we also need to left shift then right shift them + // so their highest bits are all 0s or 1s depending on the sign of the number they encode + + int16_t2 Vec2ASignedSmall = int16_t2(Vec2ASmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2); + int16_t2 Vec2BSignedSmall = int16_t2(Vec2BSmall << uint16_t(16 - smallBits_2)) >> int16_t(16 - smallBits_2); + int16_t2 Vec2ASignedMedium = int16_t2(Vec2AMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2); + int16_t2 Vec2BSignedMedium = int16_t2(Vec2BMedium << uint16_t(16 - mediumBits_2)) >> int16_t(16 - mediumBits_2); + int32_t2 Vec2ASignedFull = int32_t2(Vec2AFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2); + int32_t2 Vec2BSignedFull = int32_t2(Vec2BFull << uint32_t(32 - fullBits_2)) >> int32_t(32 - fullBits_2); + + int16_t3 Vec3ASignedSmall = int16_t3(Vec3ASmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3); + int16_t3 Vec3BSignedSmall = int16_t3(Vec3BSmall << uint16_t(16 - smallBits_3)) >> int16_t(16 - smallBits_3); + int16_t3 Vec3ASignedMedium = int16_t3(Vec3AMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3); + int16_t3 Vec3BSignedMedium = int16_t3(Vec3BMedium << uint16_t(16 - mediumBits_3)) >> int16_t(16 - mediumBits_3); + int32_t3 Vec3ASignedFull = int32_t3(Vec3AFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3); + int32_t3 Vec3BSignedFull = int32_t3(Vec3BFull << uint32_t(32 - fullBits_3)) >> int32_t(32 - fullBits_3); + + int16_t4 Vec4ASignedSmall = int16_t4(Vec4ASmall << uint16_t(16 - 
smallBits_4)) >> int16_t(16 - smallBits_4); + int16_t4 Vec4BSignedSmall = int16_t4(Vec4BSmall << uint16_t(16 - smallBits_4)) >> int16_t(16 - smallBits_4); + int16_t4 Vec4ASignedMedium = int16_t4(Vec4AMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4); + int16_t4 Vec4BSignedMedium = int16_t4(Vec4BMedium << uint16_t(16 - mediumBits_4)) >> int16_t(16 - mediumBits_4); + int16_t4 Vec4ASignedFull = int16_t4(Vec4AFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4); + int16_t4 Vec4BSignedFull = int16_t4(Vec4BFull << uint16_t(16 - fullBits_4)) >> int16_t(16 - fullBits_4); + + // Plus + expected.mortonPlus_small_2 = morton::code::create(Vec2ASmall + Vec2BSmall); + expected.mortonPlus_medium_2 = morton::code::create(Vec2AMedium + Vec2BMedium); + expected.mortonPlus_full_2 = morton::code::create(Vec2AFull + Vec2BFull); + expected.mortonPlus_emulated_2 = morton::code::create(Vec2AFull + Vec2BFull); + + expected.mortonPlus_small_3 = morton::code::create(Vec3ASmall + Vec3BSmall); + expected.mortonPlus_medium_3 = morton::code::create(Vec3AMedium + Vec3BMedium); + expected.mortonPlus_full_3 = morton::code::create(Vec3AFull + Vec3BFull); + expected.mortonPlus_emulated_3 = morton::code::create(Vec3AFull + Vec3BFull); + + expected.mortonPlus_small_4 = morton::code::create(Vec4ASmall + Vec4BSmall); + expected.mortonPlus_medium_4 = morton::code::create(Vec4AMedium + Vec4BMedium); + expected.mortonPlus_full_4 = morton::code::create(Vec4AFull + Vec4BFull); + expected.mortonPlus_emulated_4 = morton::code::create(Vec4AFull + Vec4BFull); + + // Minus + expected.mortonMinus_small_2 = morton::code::create(Vec2ASmall - Vec2BSmall); + expected.mortonMinus_medium_2 = morton::code::create(Vec2AMedium - Vec2BMedium); + expected.mortonMinus_full_2 = morton::code::create(Vec2AFull - Vec2BFull); + expected.mortonMinus_emulated_2 = morton::code::create(Vec2AFull - Vec2BFull); + + expected.mortonMinus_small_3 = morton::code::create(Vec3ASmall - Vec3BSmall); + 
expected.mortonMinus_medium_3 = morton::code::create(Vec3AMedium - Vec3BMedium); + expected.mortonMinus_full_3 = morton::code::create(Vec3AFull - Vec3BFull); + expected.mortonMinus_emulated_3 = morton::code::create(Vec3AFull - Vec3BFull); + + expected.mortonMinus_small_4 = morton::code::create(Vec4ASmall - Vec4BSmall); + expected.mortonMinus_medium_4 = morton::code::create(Vec4AMedium - Vec4BMedium); + expected.mortonMinus_full_4 = morton::code::create(Vec4AFull - Vec4BFull); + expected.mortonMinus_emulated_4 = morton::code::create(Vec4AFull - Vec4BFull); + + // Coordinate-wise equality + expected.mortonEqual_small_2 = uint32_t2(glm::equal(Vec2ASmall, Vec2BSmall)); + expected.mortonEqual_medium_2 = uint32_t2(glm::equal(Vec2AMedium, Vec2BMedium)); + expected.mortonEqual_full_2 = uint32_t2(glm::equal(Vec2AFull, Vec2BFull)); + expected.mortonEqual_emulated_2 = uint32_t2(glm::equal(Vec2AFull, Vec2BFull)); + + expected.mortonEqual_small_3 = uint32_t3(glm::equal(Vec3ASmall, Vec3BSmall)); + expected.mortonEqual_medium_3 = uint32_t3(glm::equal(Vec3AMedium, Vec3BMedium)); + expected.mortonEqual_full_3 = uint32_t3(glm::equal(Vec3AFull, Vec3BFull)); + expected.mortonEqual_emulated_3 = uint32_t3(glm::equal(Vec3AFull, Vec3BFull)); + + expected.mortonEqual_small_4 = uint32_t4(glm::equal(Vec4ASmall, Vec4BSmall)); + expected.mortonEqual_medium_4 = uint32_t4(glm::equal(Vec4AMedium, Vec4BMedium)); + expected.mortonEqual_full_4 = uint32_t4(glm::equal(Vec4AFull, Vec4BFull)); + + // Coordinate-wise unsigned inequality (just testing with less) + expected.mortonUnsignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASmall, Vec2BSmall)); + expected.mortonUnsignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2AMedium, Vec2BMedium)); + expected.mortonUnsignedLess_full_2 = uint32_t2(glm::lessThan(Vec2AFull, Vec2BFull)); + expected.mortonUnsignedLess_emulated_2 = uint32_t2(glm::lessThan(Vec2AFull, Vec2BFull)); + + expected.mortonUnsignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASmall, 
Vec3BSmall)); + expected.mortonUnsignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3AMedium, Vec3BMedium)); + expected.mortonUnsignedLess_full_3 = uint32_t3(glm::lessThan(Vec3AFull, Vec3BFull)); + expected.mortonUnsignedLess_emulated_3 = uint32_t3(glm::lessThan(Vec3AFull, Vec3BFull)); + + expected.mortonUnsignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASmall, Vec4BSmall)); + expected.mortonUnsignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4AMedium, Vec4BMedium)); + expected.mortonUnsignedLess_full_4 = uint32_t4(glm::lessThan(Vec4AFull, Vec4BFull)); + + // Coordinate-wise signed inequality + expected.mortonSignedLess_small_2 = uint32_t2(glm::lessThan(Vec2ASignedSmall, Vec2BSignedSmall)); + expected.mortonSignedLess_medium_2 = uint32_t2(glm::lessThan(Vec2ASignedMedium, Vec2BSignedMedium)); + expected.mortonSignedLess_full_2 = uint32_t2(glm::lessThan(Vec2ASignedFull, Vec2BSignedFull)); + + expected.mortonSignedLess_small_3 = uint32_t3(glm::lessThan(Vec3ASignedSmall, Vec3BSignedSmall)); + expected.mortonSignedLess_medium_3 = uint32_t3(glm::lessThan(Vec3ASignedMedium, Vec3BSignedMedium)); + expected.mortonSignedLess_full_3 = uint32_t3(glm::lessThan(Vec3ASignedFull, Vec3BSignedFull)); + + expected.mortonSignedLess_small_4 = uint32_t4(glm::lessThan(Vec4ASignedSmall, Vec4BSignedSmall)); + expected.mortonSignedLess_medium_4 = uint32_t4(glm::lessThan(Vec4ASignedMedium, Vec4BSignedMedium)); + expected.mortonSignedLess_full_4 = uint32_t4(glm::lessThan(Vec4ASignedFull, Vec4BSignedFull)); + + uint16_t castedShift = uint16_t(generatedShift); + // Left-shift + expected.mortonLeftShift_small_2 = morton::code::create((Vec2ASmall << uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2)); + expected.mortonLeftShift_medium_2 = morton::code::create((Vec2AMedium << uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2)); + expected.mortonLeftShift_full_2 = morton::code::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2)); + 
expected.mortonLeftShift_emulated_2 = morton::code::create((Vec2AFull << uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2)); + + expected.mortonLeftShift_small_3 = morton::code::create((Vec3ASmall << uint16_t(castedShift % smallBits_3)) & uint16_t(smallBitsMask_3)); + expected.mortonLeftShift_medium_3 = morton::code::create((Vec3AMedium << uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3)); + expected.mortonLeftShift_full_3 = morton::code::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3)); + expected.mortonLeftShift_emulated_3 = morton::code::create((Vec3AFull << uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3)); + + expected.mortonLeftShift_small_4 = morton::code::create((Vec4ASmall << uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4)); + expected.mortonLeftShift_medium_4 = morton::code::create((Vec4AMedium << uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4)); + expected.mortonLeftShift_full_4 = morton::code::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4)); + expected.mortonLeftShift_emulated_4 = morton::code::create((Vec4AFull << uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4)); + + // Unsigned right-shift + expected.mortonUnsignedRightShift_small_2 = morton::code::create((Vec2ASmall >> uint16_t(castedShift % smallBits_2)) & uint16_t(smallBitsMask_2)); + expected.mortonUnsignedRightShift_medium_2 = morton::code::create((Vec2AMedium >> uint16_t(castedShift % mediumBits_2)) & uint16_t(mediumBitsMask_2)); + expected.mortonUnsignedRightShift_full_2 = morton::code::create((Vec2AFull >> uint32_t(castedShift % fullBits_2)) & uint32_t(fullBitsMask_2)); + expected.mortonUnsignedRightShift_emulated_2 = morton::code::create((Vec2AFull >> uint32_t(castedShift % fullBits_2))& uint32_t(fullBitsMask_2)); + + expected.mortonUnsignedRightShift_small_3 = morton::code::create((Vec3ASmall >> uint16_t(castedShift % 
smallBits_3)) & uint16_t(smallBitsMask_3)); + expected.mortonUnsignedRightShift_medium_3 = morton::code::create((Vec3AMedium >> uint16_t(castedShift % mediumBits_3)) & uint16_t(mediumBitsMask_3)); + expected.mortonUnsignedRightShift_full_3 = morton::code::create((Vec3AFull >> uint32_t(castedShift % fullBits_3)) & uint32_t(fullBitsMask_3)); + expected.mortonUnsignedRightShift_emulated_3 = morton::code::create((Vec3AFull >> uint32_t(castedShift % fullBits_3))& uint32_t(fullBitsMask_3)); + + expected.mortonUnsignedRightShift_small_4 = morton::code::create((Vec4ASmall >> uint16_t(castedShift % smallBits_4)) & uint16_t(smallBitsMask_4)); + expected.mortonUnsignedRightShift_medium_4 = morton::code::create((Vec4AMedium >> uint16_t(castedShift % mediumBits_4)) & uint16_t(mediumBitsMask_4)); + expected.mortonUnsignedRightShift_full_4 = morton::code::create((Vec4AFull >> uint16_t(castedShift % fullBits_4)) & uint16_t(fullBitsMask_4)); + expected.mortonUnsignedRightShift_emulated_4 = morton::code::create((Vec4AFull >> uint16_t(castedShift % fullBits_4))& uint16_t(fullBitsMask_4)); + + // Signed right-shift + expected.mortonSignedRightShift_small_2 = morton::code::create((Vec2ASignedSmall >> int16_t(castedShift % smallBits_2)) & int16_t(smallBitsMask_2)); + expected.mortonSignedRightShift_medium_2 = morton::code::create((Vec2ASignedMedium >> int16_t(castedShift % mediumBits_2)) & int16_t(mediumBitsMask_2)); + expected.mortonSignedRightShift_full_2 = morton::code::create((Vec2ASignedFull >> int32_t(castedShift % fullBits_2)) & int32_t(fullBitsMask_2)); + + expected.mortonSignedRightShift_small_3 = morton::code::create((Vec3ASignedSmall >> int16_t(castedShift % smallBits_3)) & int16_t(smallBitsMask_3)); + expected.mortonSignedRightShift_medium_3 = morton::code::create((Vec3ASignedMedium >> int16_t(castedShift % mediumBits_3)) & int16_t(mediumBitsMask_3)); + expected.mortonSignedRightShift_full_3 = morton::code::create((Vec3ASignedFull >> int32_t(castedShift % fullBits_3)) & 
int32_t(fullBitsMask_3)); + + expected.mortonSignedRightShift_small_4 = morton::code::create((Vec4ASignedSmall >> int16_t(castedShift % smallBits_4)) & int16_t(smallBitsMask_4)); + expected.mortonSignedRightShift_medium_4 = morton::code::create((Vec4ASignedMedium >> int16_t(castedShift % mediumBits_4)) & int16_t(mediumBitsMask_4)); + expected.mortonSignedRightShift_full_4 = morton::code::create((Vec4ASignedFull >> int16_t(castedShift % fullBits_4)) & int16_t(fullBitsMask_4)); + } + + performCpuTests(testInput, expected); + performGpuTests(testInput, expected); + } + m_logger->log("FIRST TESTS DONE.", system::ILogger::ELL_PERFORMANCE); + } + +private: + inline static constexpr int Iterations = 100u; + + void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) + { + TestValues cpuTestValues; + + fillTestValues(commonTestInputValues, cpuTestValues); + verifyTestValues(expectedTestValues, cpuTestValues, ITester::TestType::CPU); + + } + + void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) + { + TestValues gpuTestValues; + gpuTestValues = dispatch(commonTestInputValues); + verifyTestValues(expectedTestValues, gpuTestValues, ITester::TestType::GPU); + } + + void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, ITester::TestType testType) + { + verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType); + verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType); + verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType); + verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType); + verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType); + verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, testValues.emulatedMinus, testType); 
+ verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType); + verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType); + verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType); + verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType); + verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType); + verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType); + verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType); + + // Morton Plus + verifyTestValue("mortonPlus_small_2", expectedTestValues.mortonPlus_small_2, testValues.mortonPlus_small_2, testType); + verifyTestValue("mortonPlus_medium_2", expectedTestValues.mortonPlus_medium_2, testValues.mortonPlus_medium_2, testType); + verifyTestValue("mortonPlus_full_2", expectedTestValues.mortonPlus_full_2, testValues.mortonPlus_full_2, testType); + verifyTestValue("mortonPlus_emulated_2", expectedTestValues.mortonPlus_emulated_2, testValues.mortonPlus_emulated_2, testType); + + verifyTestValue("mortonPlus_small_3", expectedTestValues.mortonPlus_small_3, testValues.mortonPlus_small_3, testType); + verifyTestValue("mortonPlus_medium_3", expectedTestValues.mortonPlus_medium_3, testValues.mortonPlus_medium_3, testType); + verifyTestValue("mortonPlus_full_3", expectedTestValues.mortonPlus_full_3, testValues.mortonPlus_full_3, testType); + verifyTestValue("mortonPlus_emulated_3", expectedTestValues.mortonPlus_emulated_3, testValues.mortonPlus_emulated_3, testType); + + verifyTestValue("mortonPlus_small_4", expectedTestValues.mortonPlus_small_4, testValues.mortonPlus_small_4, 
testType); + verifyTestValue("mortonPlus_medium_4", expectedTestValues.mortonPlus_medium_4, testValues.mortonPlus_medium_4, testType); + verifyTestValue("mortonPlus_full_4", expectedTestValues.mortonPlus_full_4, testValues.mortonPlus_full_4, testType); + verifyTestValue("mortonPlus_emulated_4", expectedTestValues.mortonPlus_emulated_4, testValues.mortonPlus_emulated_4, testType); + + // Morton Minus + verifyTestValue("mortonMinus_small_2", expectedTestValues.mortonMinus_small_2, testValues.mortonMinus_small_2, testType); + verifyTestValue("mortonMinus_medium_2", expectedTestValues.mortonMinus_medium_2, testValues.mortonMinus_medium_2, testType); + verifyTestValue("mortonMinus_full_2", expectedTestValues.mortonMinus_full_2, testValues.mortonMinus_full_2, testType); + verifyTestValue("mortonMinus_emulated_2", expectedTestValues.mortonMinus_emulated_2, testValues.mortonMinus_emulated_2, testType); + + verifyTestValue("mortonMinus_small_3", expectedTestValues.mortonMinus_small_3, testValues.mortonMinus_small_3, testType); + verifyTestValue("mortonMinus_medium_3", expectedTestValues.mortonMinus_medium_3, testValues.mortonMinus_medium_3, testType); + verifyTestValue("mortonMinus_full_3", expectedTestValues.mortonMinus_full_3, testValues.mortonMinus_full_3, testType); + verifyTestValue("mortonMinus_emulated_3", expectedTestValues.mortonMinus_emulated_3, testValues.mortonMinus_emulated_3, testType); + + verifyTestValue("mortonMinus_small_4", expectedTestValues.mortonMinus_small_4, testValues.mortonMinus_small_4, testType); + verifyTestValue("mortonMinus_medium_4", expectedTestValues.mortonMinus_medium_4, testValues.mortonMinus_medium_4, testType); + verifyTestValue("mortonMinus_full_4", expectedTestValues.mortonMinus_full_4, testValues.mortonMinus_full_4, testType); + verifyTestValue("mortonMinus_emulated_4", expectedTestValues.mortonMinus_emulated_4, testValues.mortonMinus_emulated_4, testType); + + // Morton coordinate-wise equality + 
verifyTestValue("mortonEqual_small_2", expectedTestValues.mortonEqual_small_2, testValues.mortonEqual_small_2, testType); + verifyTestValue("mortonEqual_medium_2", expectedTestValues.mortonEqual_medium_2, testValues.mortonEqual_medium_2, testType); + verifyTestValue("mortonEqual_full_2", expectedTestValues.mortonEqual_full_2, testValues.mortonEqual_full_2, testType); + verifyTestValue("mortonEqual_emulated_2", expectedTestValues.mortonEqual_emulated_2, testValues.mortonEqual_emulated_2, testType); + + verifyTestValue("mortonEqual_small_3", expectedTestValues.mortonEqual_small_3, testValues.mortonEqual_small_3, testType); + verifyTestValue("mortonEqual_medium_3", expectedTestValues.mortonEqual_medium_3, testValues.mortonEqual_medium_3, testType); + verifyTestValue("mortonEqual_full_3", expectedTestValues.mortonEqual_full_3, testValues.mortonEqual_full_3, testType); + verifyTestValue("mortonEqual_emulated_3", expectedTestValues.mortonEqual_emulated_3, testValues.mortonEqual_emulated_3, testType); + + verifyTestValue("mortonEqual_small_4", expectedTestValues.mortonEqual_small_4, testValues.mortonEqual_small_4, testType); + verifyTestValue("mortonEqual_medium_4", expectedTestValues.mortonEqual_medium_4, testValues.mortonEqual_medium_4, testType); + verifyTestValue("mortonEqual_full_4", expectedTestValues.mortonEqual_full_4, testValues.mortonEqual_full_4, testType); + + // Morton coordinate-wise unsigned inequality + verifyTestValue("mortonUnsignedLess_small_2", expectedTestValues.mortonUnsignedLess_small_2, testValues.mortonUnsignedLess_small_2, testType); + verifyTestValue("mortonUnsignedLess_medium_2", expectedTestValues.mortonUnsignedLess_medium_2, testValues.mortonUnsignedLess_medium_2, testType); + verifyTestValue("mortonUnsignedLess_full_2", expectedTestValues.mortonUnsignedLess_full_2, testValues.mortonUnsignedLess_full_2, testType); + verifyTestValue("mortonUnsignedLess_emulated_2", expectedTestValues.mortonUnsignedLess_emulated_2, 
testValues.mortonUnsignedLess_emulated_2, testType); + + verifyTestValue("mortonUnsignedLess_small_3", expectedTestValues.mortonUnsignedLess_small_3, testValues.mortonUnsignedLess_small_3, testType); + verifyTestValue("mortonUnsignedLess_medium_3", expectedTestValues.mortonUnsignedLess_medium_3, testValues.mortonUnsignedLess_medium_3, testType); + verifyTestValue("mortonUnsignedLess_full_3", expectedTestValues.mortonUnsignedLess_full_3, testValues.mortonUnsignedLess_full_3, testType); + verifyTestValue("mortonUnsignedLess_emulated_3", expectedTestValues.mortonUnsignedLess_emulated_3, testValues.mortonUnsignedLess_emulated_3, testType); + + verifyTestValue("mortonUnsignedLess_small_4", expectedTestValues.mortonUnsignedLess_small_4, testValues.mortonUnsignedLess_small_4, testType); + verifyTestValue("mortonUnsignedLess_medium_4", expectedTestValues.mortonUnsignedLess_medium_4, testValues.mortonUnsignedLess_medium_4, testType); + verifyTestValue("mortonUnsignedLess_full_4", expectedTestValues.mortonUnsignedLess_full_4, testValues.mortonUnsignedLess_full_4, testType); + + // Morton coordinate-wise signed inequality + verifyTestValue("mortonSignedLess_small_2", expectedTestValues.mortonSignedLess_small_2, testValues.mortonSignedLess_small_2, testType); + verifyTestValue("mortonSignedLess_medium_2", expectedTestValues.mortonSignedLess_medium_2, testValues.mortonSignedLess_medium_2, testType); + verifyTestValue("mortonSignedLess_full_2", expectedTestValues.mortonSignedLess_full_2, testValues.mortonSignedLess_full_2, testType); + + verifyTestValue("mortonSignedLess_small_3", expectedTestValues.mortonSignedLess_small_3, testValues.mortonSignedLess_small_3, testType); + verifyTestValue("mortonSignedLess_medium_3", expectedTestValues.mortonSignedLess_medium_3, testValues.mortonSignedLess_medium_3, testType); + verifyTestValue("mortonSignedLess_full_3", expectedTestValues.mortonSignedLess_full_3, testValues.mortonSignedLess_full_3, testType); + + 
verifyTestValue("mortonSignedLess_small_4", expectedTestValues.mortonSignedLess_small_4, testValues.mortonSignedLess_small_4, testType); + verifyTestValue("mortonSignedLess_medium_4", expectedTestValues.mortonSignedLess_medium_4, testValues.mortonSignedLess_medium_4, testType); + verifyTestValue("mortonSignedLess_full_4", expectedTestValues.mortonSignedLess_full_4, testValues.mortonSignedLess_full_4, testType); + + // Morton left-shift + verifyTestValue("mortonLeftShift_small_2", expectedTestValues.mortonLeftShift_small_2, testValues.mortonLeftShift_small_2, testType); + verifyTestValue("mortonLeftShift_medium_2", expectedTestValues.mortonLeftShift_medium_2, testValues.mortonLeftShift_medium_2, testType); + verifyTestValue("mortonLeftShift_full_2", expectedTestValues.mortonLeftShift_full_2, testValues.mortonLeftShift_full_2, testType); + verifyTestValue("mortonLeftShift_emulated_2", expectedTestValues.mortonLeftShift_emulated_2, testValues.mortonLeftShift_emulated_2, testType); + + verifyTestValue("mortonLeftShift_small_3", expectedTestValues.mortonLeftShift_small_3, testValues.mortonLeftShift_small_3, testType); + verifyTestValue("mortonLeftShift_medium_3", expectedTestValues.mortonLeftShift_medium_3, testValues.mortonLeftShift_medium_3, testType); + verifyTestValue("mortonLeftShift_full_3", expectedTestValues.mortonLeftShift_full_3, testValues.mortonLeftShift_full_3, testType); + verifyTestValue("mortonLeftShift_emulated_3", expectedTestValues.mortonLeftShift_emulated_3, testValues.mortonLeftShift_emulated_3, testType); + + verifyTestValue("mortonLeftShift_small_4", expectedTestValues.mortonLeftShift_small_4, testValues.mortonLeftShift_small_4, testType); + verifyTestValue("mortonLeftShift_medium_4", expectedTestValues.mortonLeftShift_medium_4, testValues.mortonLeftShift_medium_4, testType); + verifyTestValue("mortonLeftShift_full_4", expectedTestValues.mortonLeftShift_full_4, testValues.mortonLeftShift_full_4, testType); + 
verifyTestValue("mortonLeftShift_emulated_4", expectedTestValues.mortonLeftShift_emulated_4, testValues.mortonLeftShift_emulated_4, testType); + + // Morton unsigned right-shift + verifyTestValue("mortonUnsignedRightShift_small_2", expectedTestValues.mortonUnsignedRightShift_small_2, testValues.mortonUnsignedRightShift_small_2, testType); + verifyTestValue("mortonUnsignedRightShift_medium_2", expectedTestValues.mortonUnsignedRightShift_medium_2, testValues.mortonUnsignedRightShift_medium_2, testType); + verifyTestValue("mortonUnsignedRightShift_full_2", expectedTestValues.mortonUnsignedRightShift_full_2, testValues.mortonUnsignedRightShift_full_2, testType); + verifyTestValue("mortonUnsignedRightShift_emulated_2", expectedTestValues.mortonUnsignedRightShift_emulated_2, testValues.mortonUnsignedRightShift_emulated_2, testType); + + verifyTestValue("mortonUnsignedRightShift_small_3", expectedTestValues.mortonUnsignedRightShift_small_3, testValues.mortonUnsignedRightShift_small_3, testType); + verifyTestValue("mortonUnsignedRightShift_medium_3", expectedTestValues.mortonUnsignedRightShift_medium_3, testValues.mortonUnsignedRightShift_medium_3, testType); + verifyTestValue("mortonUnsignedRightShift_full_3", expectedTestValues.mortonUnsignedRightShift_full_3, testValues.mortonUnsignedRightShift_full_3, testType); + verifyTestValue("mortonUnsignedRightShift_emulated_3", expectedTestValues.mortonUnsignedRightShift_emulated_3, testValues.mortonUnsignedRightShift_emulated_3, testType); + + verifyTestValue("mortonUnsignedRightShift_small_4", expectedTestValues.mortonUnsignedRightShift_small_4, testValues.mortonUnsignedRightShift_small_4, testType); + verifyTestValue("mortonUnsignedRightShift_medium_4", expectedTestValues.mortonUnsignedRightShift_medium_4, testValues.mortonUnsignedRightShift_medium_4, testType); + verifyTestValue("mortonUnsignedRightShift_full_4", expectedTestValues.mortonUnsignedRightShift_full_4, testValues.mortonUnsignedRightShift_full_4, testType); + 
verifyTestValue("mortonUnsignedRightShift_emulated_4", expectedTestValues.mortonUnsignedRightShift_emulated_4, testValues.mortonUnsignedRightShift_emulated_4, testType); + + // Morton signed right-shift + verifyTestValue("mortonSignedRightShift_small_2", expectedTestValues.mortonSignedRightShift_small_2, testValues.mortonSignedRightShift_small_2, testType); + verifyTestValue("mortonSignedRightShift_medium_2", expectedTestValues.mortonSignedRightShift_medium_2, testValues.mortonSignedRightShift_medium_2, testType); + verifyTestValue("mortonSignedRightShift_full_2", expectedTestValues.mortonSignedRightShift_full_2, testValues.mortonSignedRightShift_full_2, testType); + + verifyTestValue("mortonSignedRightShift_small_3", expectedTestValues.mortonSignedRightShift_small_3, testValues.mortonSignedRightShift_small_3, testType); + verifyTestValue("mortonSignedRightShift_medium_3", expectedTestValues.mortonSignedRightShift_medium_3, testValues.mortonSignedRightShift_medium_3, testType); + verifyTestValue("mortonSignedRightShift_full_3", expectedTestValues.mortonSignedRightShift_full_3, testValues.mortonSignedRightShift_full_3, testType); + + verifyTestValue("mortonSignedRightShift_small_4", expectedTestValues.mortonSignedRightShift_small_4, testValues.mortonSignedRightShift_small_4, testType); + verifyTestValue("mortonSignedRightShift_medium_4", expectedTestValues.mortonSignedRightShift_medium_4, testValues.mortonSignedRightShift_medium_4, testType); + verifyTestValue("mortonSignedRightShift_full_4", expectedTestValues.mortonSignedRightShift_full_4, testValues.mortonSignedRightShift_full_4, testType); + } +}; + +#endif \ No newline at end of file diff --git a/12_Mortons/Tester.h b/12_Mortons/ITester.h similarity index 66% rename from 12_Mortons/Tester.h rename to 12_Mortons/ITester.h index 480328d18..2510dd997 100644 --- a/12_Mortons/Tester.h +++ b/12_Mortons/ITester.h @@ -1,5 +1,5 @@ -#ifndef _NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_ -#define 
_NBL_EXAMPLES_TESTS_12_MORTONS_TESTER_INCLUDED_ +#ifndef _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_I_TESTER_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_22_CPP_COMPAT_I_TESTER_INCLUDED_ #include #include "app_resources/common.hlsl" @@ -8,10 +8,10 @@ using namespace nbl; -class Tester +class ITester { public: - virtual ~Tester() + virtual ~ITester() { m_outputBufferAllocation.memory->unmap(); }; @@ -128,7 +128,7 @@ class Tester if (!inputBuff) logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - inputBuff->setObjectDebugName("morton input buffer"); + inputBuff->setObjectDebugName("emulated_float64_t output buffer"); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuff->getMemoryReqs(); reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); @@ -163,7 +163,7 @@ class Tester if (!outputBuff) logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - outputBuff->setObjectDebugName("morton output buffer"); + outputBuff->setObjectDebugName("emulated_float64_t output buffer"); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); @@ -211,29 +211,6 @@ class Tester if (expectedVal == testVal) return; - std::stringstream ss; - switch (testType) - { - case TestType::CPU: - ss << "CPU TEST ERROR:\n"; - case TestType::GPU: - ss << "GPU TEST ERROR:\n"; - } - - ss << "nbl::hlsl::" << memberName << " produced incorrect output!" 
<< '\n'; //test value: " << testVal << " expected value: " << expectedVal << '\n'; - - m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); - } - - template - void verifyTestVector3dValue(const std::string& memberName, const nbl::hlsl::vector& expectedVal, const nbl::hlsl::vector& testVal, const TestType testType) - { - static constexpr float MaxAllowedError = 0.1f; - if (std::abs(double(expectedVal.x) - double(testVal.x)) <= MaxAllowedError && - std::abs(double(expectedVal.y) - double(testVal.y)) <= MaxAllowedError && - std::abs(double(expectedVal.z) - double(testVal.z)) <= MaxAllowedError) - return; - std::stringstream ss; switch (testType) { @@ -244,69 +221,11 @@ class Tester ss << "GPU TEST ERROR:\n"; } - ss << "nbl::hlsl::" << memberName << " produced incorrect output! test value: " << - testVal.x << ' ' << testVal.y << ' ' << testVal.z << - " expected value: " << expectedVal.x << ' ' << expectedVal.y << ' ' << expectedVal.z << '\n'; + ss << "nbl::hlsl::" << memberName << " produced incorrect output!" 
<< '\n'; m_logger->log(ss.str().c_str(), system::ILogger::ELL_ERROR); } - void performTests() - { - std::random_device rd; - std::mt19937 mt(rd()); - - std::uniform_int_distribution shortDistribution(uint16_t(0), std::numeric_limits::max()); - std::uniform_int_distribution intDistribution(uint32_t(0), std::numeric_limits::max()); - std::uniform_int_distribution longDistribution(uint64_t(0), std::numeric_limits::max()); - - m_logger->log("TESTS:", system::ILogger::ELL_PERFORMANCE); - for (int i = 0; i < Iterations; ++i) - { - // Set input thest values that will be used in both CPU and GPU tests - InputTestValues testInput; - // use std library or glm functions to determine expected test values, the output of functions from intrinsics.hlsl will be verified against these values - TestValues expected; - - uint32_t generatedShift = intDistribution(mt) & uint32_t(63); - testInput.shift = generatedShift; - { - uint64_t generatedA = longDistribution(mt); - uint64_t generatedB = longDistribution(mt); - - testInput.generatedA = generatedA; - testInput.generatedB = generatedB; - - expected.emulatedAnd = _static_cast(generatedA & generatedB); - expected.emulatedOr = _static_cast(generatedA | generatedB); - expected.emulatedXor = _static_cast(generatedA ^ generatedB); - expected.emulatedNot = _static_cast(~generatedA); - expected.emulatedPlus = _static_cast(generatedA + generatedB); - expected.emulatedMinus = _static_cast(generatedA - generatedB); - expected.emulatedLess = uint32_t(generatedA < generatedB); - expected.emulatedLessEqual = uint32_t(generatedA <= generatedB); - expected.emulatedGreater = uint32_t(generatedA > generatedB); - expected.emulatedGreaterEqual = uint32_t(generatedA >= generatedB); - - expected.emulatedLeftShifted = _static_cast(generatedA << generatedShift); - expected.emulatedUnsignedRightShifted = _static_cast(generatedA >> generatedShift); - expected.emulatedSignedRightShifted = _static_cast(static_cast(generatedA) >> generatedShift); - } - { - 
uint64_t coordX = longDistribution(mt); - uint64_t coordY = longDistribution(mt); - uint64_t coordZ = longDistribution(mt); - uint64_t coordW = longDistribution(mt); - - - } - - performCpuTests(testInput, expected); - performGpuTests(testInput, expected); - } - m_logger->log("TESTS DONE.", system::ILogger::ELL_PERFORMANCE); - } - protected: uint32_t m_queueFamily; core::smart_refctd_ptr m_device; @@ -324,7 +243,7 @@ class Tester core::smart_refctd_ptr m_semaphore; video::IQueue* m_queue; uint64_t m_semaphoreCounter; - + template OutputStruct dispatch(const InputStruct& input) { @@ -375,42 +294,6 @@ class Tester m_logger->log(msg, system::ILogger::ELL_ERROR, std::forward(args)...); exit(-1); } - - inline static constexpr int Iterations = 100u; - - void performCpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) - { - TestValues cpuTestValues; - cpuTestValues.fillTestValues(commonTestInputValues); - verifyTestValues(expectedTestValues, cpuTestValues, TestType::CPU); - - } - - void performGpuTests(const InputTestValues& commonTestInputValues, const TestValues& expectedTestValues) - { - TestValues gpuTestValues; - gpuTestValues = dispatch(commonTestInputValues); - verifyTestValues(expectedTestValues, gpuTestValues, TestType::GPU); - } - - void verifyTestValues(const TestValues& expectedTestValues, const TestValues& testValues, TestType testType) - { - verifyTestValue("emulatedAnd", expectedTestValues.emulatedAnd, testValues.emulatedAnd, testType); - verifyTestValue("emulatedOr", expectedTestValues.emulatedOr, testValues.emulatedOr, testType); - verifyTestValue("emulatedXor", expectedTestValues.emulatedXor, testValues.emulatedXor, testType); - verifyTestValue("emulatedNot", expectedTestValues.emulatedNot, testValues.emulatedNot, testType); - verifyTestValue("emulatedPlus", expectedTestValues.emulatedPlus, testValues.emulatedPlus, testType); - verifyTestValue("emulatedMinus", expectedTestValues.emulatedMinus, 
testValues.emulatedMinus, testType); - verifyTestValue("emulatedLess", expectedTestValues.emulatedLess, testValues.emulatedLess, testType); - verifyTestValue("emulatedLessEqual", expectedTestValues.emulatedLessEqual, testValues.emulatedLessEqual, testType); - verifyTestValue("emulatedGreater", expectedTestValues.emulatedGreater, testValues.emulatedGreater, testType); - verifyTestValue("emulatedGreaterEqual", expectedTestValues.emulatedGreaterEqual, testValues.emulatedGreaterEqual, testType); - verifyTestValue("emulatedLeftShifted", expectedTestValues.emulatedLeftShifted, testValues.emulatedLeftShifted, testType); - verifyTestValue("emulatedUnsignedRightShifted", expectedTestValues.emulatedUnsignedRightShifted, testValues.emulatedUnsignedRightShifted, testType); - verifyTestValue("emulatedSignedRightShifted", expectedTestValues.emulatedSignedRightShifted, testValues.emulatedSignedRightShifted, testType); - - //verifyTestVector3dValue("normalize", expectedTestValues.normalize, testValues.normalize, testType); - } }; #endif \ No newline at end of file diff --git a/12_Mortons/app_resources/common.hlsl b/12_Mortons/app_resources/common.hlsl index be6a2f4a0..b058ad821 100644 --- a/12_Mortons/app_resources/common.hlsl +++ b/12_Mortons/app_resources/common.hlsl @@ -5,10 +5,6 @@ #ifndef _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_ #define _NBL_EXAMPLES_TESTS_12_MORTON_COMMON_INCLUDED_ -// because DXC doesn't properly support `_Static_assert` -// TODO: add a message, and move to macros.h or cpp_compat -#define STATIC_ASSERT(...) 
{ nbl::hlsl::conditional<__VA_ARGS__, int, void>::type a = 0; } - #include #include @@ -23,6 +19,22 @@ NBL_CONSTEXPR uint16_t smallBits_4 = 4; NBL_CONSTEXPR uint16_t mediumBits_4 = 8; NBL_CONSTEXPR uint16_t fullBits_4 = 16; +#ifndef __HLSL_VERSION + +constexpr uint64_t smallBitsMask_2 = (uint64_t(1) << smallBits_2) - 1; +constexpr uint64_t mediumBitsMask_2 = (uint64_t(1) << mediumBits_2) - 1; +constexpr uint64_t fullBitsMask_2 = (uint64_t(1) << fullBits_2) - 1; + +constexpr uint64_t smallBitsMask_3 = (uint64_t(1) << smallBits_3) - 1; +constexpr uint64_t mediumBitsMask_3 = (uint64_t(1) << mediumBits_3) - 1; +constexpr uint64_t fullBitsMask_3 = (uint64_t(1) << fullBits_3) - 1; + +constexpr uint64_t smallBitsMask_4 = (uint64_t(1) << smallBits_4) - 1; +constexpr uint64_t mediumBitsMask_4 = (uint64_t(1) << mediumBits_4) - 1; +constexpr uint64_t fullBitsMask_4 = (uint64_t(1) << fullBits_4) - 1; + +#endif + using namespace nbl::hlsl; struct InputTestValues { @@ -190,33 +202,9 @@ struct TestValues morton::code mortonSignedRightShift_full_4; morton::code mortonSignedRightShift_emulated_4; - void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input) + /* + void fillSecondTestValues(NBL_CONST_REF_ARG(InputTestValues) input) { - emulated_uint64_t emulatedA = _static_cast(input.generatedA); - emulated_uint64_t emulatedB = _static_cast(input.generatedB); - - // Emulated int tests - emulatedAnd = emulatedA & emulatedB; - emulatedOr = emulatedA | emulatedB; - emulatedXor = emulatedA ^ emulatedB; - emulatedNot = emulatedA.operator~(); - emulatedPlus = emulatedA + emulatedB; - emulatedMinus = emulatedA - emulatedB; - emulatedLess = uint32_t(emulatedA < emulatedB); - emulatedLessEqual = uint32_t(emulatedA <= emulatedB); - emulatedGreater = uint32_t(emulatedA > emulatedB); - emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB); - - left_shift_operator leftShift; - emulatedLeftShifted = leftShift(emulatedA, input.shift); - - arithmetic_right_shift_operator unsignedRightShift; - 
emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift); - - arithmetic_right_shift_operator signedRightShift; - emulatedSignedRightShifted = signedRightShift(_static_cast(emulatedA), input.shift); - - // Morton tests uint64_t2 Vec2A = { input.coordX, input.coordY }; uint64_t2 Vec2B = { input.coordZ, input.coordW }; @@ -235,250 +223,29 @@ struct TestValues int64_t4 Vec4ASigned = int64_t4(Vec4A); int64_t4 Vec4BSigned = int64_t4(Vec4B); - morton::code morton_small_2A = morton::code::create(Vec2A); - morton::code morton_medium_2A = morton::code::create(Vec2A); - morton::code morton_full_2A = morton::code::create(Vec2A); - morton::code morton_emulated_2A = morton::code::create(Vec2A); - morton::code morton_small_2B = morton::code::create(Vec2B); - morton::code morton_medium_2B = morton::code::create(Vec2B); - morton::code morton_full_2B = morton::code::create(Vec2B); - morton::code morton_emulated_2B = morton::code::create(Vec2B); - - morton::code morton_small_3A = morton::code::create(Vec3A); - morton::code morton_medium_3A = morton::code::create(Vec3A); - morton::code morton_full_3A = morton::code::create(Vec3A); - morton::code morton_emulated_3A = morton::code::create(Vec3A); - morton::code morton_small_3B = morton::code::create(Vec3B); - morton::code morton_medium_3B = morton::code::create(Vec3B); - morton::code morton_full_3B = morton::code::create(Vec3B); - morton::code morton_emulated_3B = morton::code::create(Vec3B); - - morton::code morton_small_4A = morton::code::create(Vec4A); - morton::code morton_medium_4A = morton::code::create(Vec4A); - morton::code morton_full_4A = morton::code::create(Vec4A); morton::code morton_emulated_4A = morton::code::create(Vec4A); - morton::code morton_small_4B = morton::code::create(Vec4B); - morton::code morton_medium_4B = morton::code::create(Vec4B); - morton::code morton_full_4B = morton::code::create(Vec4B); - morton::code morton_emulated_4B = morton::code::create(Vec4B); - - morton::code 
morton_small_2ASigned = morton::code::create(Vec2ASigned); - morton::code morton_medium_2ASigned = morton::code::create(Vec2ASigned); - morton::code morton_full_2ASigned = morton::code::create(Vec2ASigned); - morton::code morton_emulated_2ASigned = morton::code::create(Vec2ASigned); - morton::code morton_small_2BSigned = morton::code::create(Vec2BSigned); - morton::code morton_medium_2BSigned = morton::code::create(Vec2BSigned); - morton::code morton_full_2BSigned = morton::code::create(Vec2BSigned); - morton::code morton_emulated_2BSigned = morton::code::create(Vec2BSigned); - - morton::code morton_small_3ASigned = morton::code::create(Vec3ASigned); - morton::code morton_medium_3ASigned = morton::code::create(Vec3ASigned); - morton::code morton_full_3ASigned = morton::code::create(Vec3ASigned); - morton::code morton_emulated_3ASigned = morton::code::create(Vec3ASigned); - morton::code morton_small_3BSigned = morton::code::create(Vec3BSigned); - morton::code morton_medium_3BSigned = morton::code::create(Vec3BSigned); - morton::code morton_full_3BSigned = morton::code::create(Vec3BSigned); - morton::code morton_emulated_3BSigned = morton::code::create(Vec3BSigned); - - morton::code morton_small_4ASigned = morton::code::create(Vec4ASigned); - morton::code morton_medium_4ASigned = morton::code::create(Vec4ASigned); - morton::code morton_full_4ASigned = morton::code::create(Vec4ASigned); - morton::code morton_emulated_4ASigned = morton::code::create(Vec4ASigned); - morton::code morton_small_4BSigned = morton::code::create(Vec4BSigned); - morton::code morton_medium_4BSigned = morton::code::create(Vec4BSigned); - morton::code morton_full_4BSigned = morton::code::create(Vec4BSigned); - morton::code morton_emulated_4BSigned = morton::code::create(Vec4BSigned); - - /* - left_shift_operator > leftShiftTemp; - portable_vector_t interleaved = _static_cast >(uint16_t4(Vec4B)) & morton::impl::coding_mask_v<4, fullBits_4, morton::impl::CodingStages, emulated_uint64_t>; - - 
#define ENCODE_LOOP_ITERATION(I) NBL_IF_CONSTEXPR(fullBits_4 > (uint16_t(1) << I))\ - {\ - interleaved = interleaved | leftShiftTemp(interleaved, (uint16_t(1) << I) * (4 - 1));\ - interleaved = interleaved & _static_cast(morton::impl::coding_mask<4, fullBits_4, I>::value);\ - } - - ENCODE_LOOP_ITERATION(4) - ENCODE_LOOP_ITERATION(3) - ENCODE_LOOP_ITERATION(2) - ENCODE_LOOP_ITERATION(1) - ENCODE_LOOP_ITERATION(0) - - #undef ENCODE_LOOP_ITERATION - // After interleaving, shift each coordinate left by their index - return leftShiftTemp(interleaved, truncate >(vector(0, 1, 2, 3))); - - - array_get, emulated_uint64_t> getter; - emulatedAnd = getter(interleaved, 0); - */ - - // Plus - mortonPlus_small_2 = morton_small_2A + morton_small_2B; - mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B; - mortonPlus_full_2 = morton_full_2A + morton_full_2B; - mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B; - - mortonPlus_small_3 = morton_small_3A + morton_small_3B; - mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B; - mortonPlus_full_3 = morton_full_3A + morton_full_3B; - mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B; - - mortonPlus_small_4 = morton_small_4A + morton_small_4B; - mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B; - mortonPlus_full_4 = morton_full_4A + morton_full_4B; - mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B; - - // Minus - mortonMinus_small_2 = morton_small_2A - morton_small_2B; - mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B; - mortonMinus_full_2 = morton_full_2A - morton_full_2B; - mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B; - - mortonMinus_small_3 = morton_small_3A - morton_small_3B; - mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B; - mortonMinus_full_3 = morton_full_3A - morton_full_3B; - mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B; - - mortonMinus_small_4 = morton_small_4A - morton_small_4B; - 
mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B; - mortonMinus_full_4 = morton_full_4A - morton_full_4B; - mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B; - - // Coordinate-wise equality - mortonEqual_small_2 = uint32_t2(morton_small_2A.equal(uint16_t2(Vec2B))); - mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal(uint16_t2(Vec2B))); - mortonEqual_full_2 = uint32_t2(morton_full_2A.equal(uint32_t2(Vec2B))); - mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal(uint32_t2(Vec2B))); - - mortonEqual_small_3 = uint32_t3(morton_small_3A.equal(uint16_t3(Vec3B))); - mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal(uint16_t3(Vec3B))); - mortonEqual_full_3 = uint32_t3(morton_full_3A.equal(uint32_t3(Vec3B))); - mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal(uint32_t3(Vec3B))); - - mortonEqual_small_4 = uint32_t4(morton_small_4A.equal(uint16_t4(Vec4B))); - mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal(uint16_t4(Vec4B))); - mortonEqual_full_4 = uint32_t4(morton_full_4A.equal(uint16_t4(Vec4B))); - mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal(uint16_t4(Vec4B))); - - // Coordinate-wise unsigned inequality (just testing with less) - mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan(uint16_t2(Vec2B))); - mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan(uint16_t2(Vec2B))); - mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan(uint32_t2(Vec2B))); - mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan(uint32_t2(Vec2B))); - - mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan(uint16_t3(Vec3B))); - mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan(uint16_t3(Vec3B))); - mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan(uint32_t3(Vec3B))); - mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan(uint32_t3(Vec3B))); - - mortonUnsignedLess_small_4 = 
uint32_t4(morton_small_4A.lessThan(uint16_t4(Vec4B))); - mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan(uint16_t4(Vec4B))); - mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan(uint16_t4(Vec4B))); - mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan(uint16_t4(Vec4B))); + morton::code morton_emulated_2_signed = morton::code::create(Vec2ASigned); + morton::code morton_emulated_3_signed = morton::code::create(Vec3ASigned); + morton::code morton_emulated_4_signed = morton::code::create(Vec4ASigned); + + output.mortonEqual_emulated_4 = uint32_t4(morton_emulated_4A.equal(uint16_t4(Vec4B))); - // Coordinate-wise signed inequality - mortonSignedLess_small_2 = uint32_t2(morton_small_2ASigned.lessThan(int16_t2(Vec2BSigned))); - mortonSignedLess_medium_2 = uint32_t2(morton_medium_2ASigned.lessThan(int16_t2(Vec2BSigned))); - mortonSignedLess_full_2 = uint32_t2(morton_full_2ASigned.lessThan(int32_t2(Vec2BSigned))); - //mortonSignedLess_emulated_2 = uint32_t2(morton_emulated_2ASigned.lessThan(int32_t2(Vec2BSigned))); - - mortonSignedLess_small_3 = uint32_t3(morton_small_3ASigned.lessThan(int16_t3(Vec3BSigned))); - mortonSignedLess_medium_3 = uint32_t3(morton_medium_3ASigned.lessThan(int16_t3(Vec3BSigned))); - mortonSignedLess_full_3 = uint32_t3(morton_full_3ASigned.lessThan(int32_t3(Vec3BSigned))); - //mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3ASigned.lessThan(int32_t3(Vec3BSigned))); - - mortonSignedLess_small_4 = uint32_t4(morton_small_4ASigned.lessThan(int16_t4(Vec4BSigned))); - mortonSignedLess_medium_4 = uint32_t4(morton_medium_4ASigned.lessThan(int16_t4(Vec4BSigned))); - mortonSignedLess_full_4 = uint32_t4(morton_full_4ASigned.lessThan(int16_t4(Vec4BSigned))); - //mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4ASigned.lessThan(int16_t4(Vec4BSigned))); + output.mortonUnsignedLess_emulated_4 = uint32_t4(morton_emulated_4A.lessThan(uint16_t4(Vec4B))); - // Left-shift + mortonSignedLess_emulated_2 = 
uint32_t2(morton_emulated_2_signed.lessThan(int32_t2(Vec2BSigned))); + mortonSignedLess_emulated_3 = uint32_t3(morton_emulated_3_signed.lessThan(int32_t3(Vec3BSigned))); + mortonSignedLess_emulated_4 = uint32_t4(morton_emulated_4_signed.lessThan(int16_t4(Vec4BSigned))); + uint16_t castedShift = uint16_t(input.shift); - left_shift_operator > leftShiftSmall2; - mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift); - left_shift_operator > leftShiftMedium2; - mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift); - left_shift_operator > leftShiftFull2; - mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift); - left_shift_operator > leftShiftEmulated2; - mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift); - - left_shift_operator > leftShiftSmall3; - mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift); - left_shift_operator > leftShiftMedium3; - mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift); - left_shift_operator > leftShiftFull3; - mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift); - left_shift_operator > leftShiftEmulated3; - mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift); - - left_shift_operator > leftShiftSmall4; - mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift); - left_shift_operator > leftShiftMedium4; - mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift); - left_shift_operator > leftShiftFull4; - mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift); - left_shift_operator > leftShiftEmulated4; - mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift); - - // Unsigned right-shift - arithmetic_right_shift_operator > rightShiftSmall2; - mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift); - arithmetic_right_shift_operator > rightShiftMedium2; - 
mortonUnsignedRightShift_medium_2 = rightShiftMedium2(morton_medium_2A, castedShift); - arithmetic_right_shift_operator > rightShiftFull2; - mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift); - arithmetic_right_shift_operator > rightShiftEmulated2; - mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift); - - arithmetic_right_shift_operator > rightShiftSmall3; - mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift); - arithmetic_right_shift_operator > rightShiftMedium3; - mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift); - arithmetic_right_shift_operator > rightShiftFull3; - mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift); - arithmetic_right_shift_operator > rightShiftEmulated3; - mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift); - - arithmetic_right_shift_operator > rightShiftSmall4; - mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift); - arithmetic_right_shift_operator > rightShiftMedium4; - mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift); - arithmetic_right_shift_operator > rightShiftFull4; - mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift); - arithmetic_right_shift_operator > rightShiftEmulated4; - mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift); - - // Signed right-shift - arithmetic_right_shift_operator > rightShiftSignedSmall2; - mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedMedium2; - mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedFull2; - mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2ASigned, 
castedShift); + arithmetic_right_shift_operator > rightShiftSignedEmulated2; - //mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2ASigned, castedShift); - - arithmetic_right_shift_operator > rightShiftSignedSmall3; - mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedMedium3; - mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedFull3; - mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3ASigned, castedShift); + mortonSignedRightShift_emulated_2 = rightShiftSignedEmulated2(morton_emulated_2_signed, castedShift); arithmetic_right_shift_operator > rightShiftSignedEmulated3; - //mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3ASigned, castedShift); - - arithmetic_right_shift_operator > rightShiftSignedSmall4; - mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedMedium4; - mortonSignedRightShift_medium_4 = rightShiftSignedMedium4(morton_medium_4ASigned, castedShift); - arithmetic_right_shift_operator > rightShiftSignedFull4; - mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4ASigned, castedShift); + mortonSignedRightShift_emulated_3 = rightShiftSignedEmulated3(morton_emulated_3_signed, castedShift); arithmetic_right_shift_operator > rightShiftSignedEmulated4; - //mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4ASigned, castedShift); + mortonSignedRightShift_emulated_4 = rightShiftSignedEmulated4(morton_emulated_4_signed, castedShift); } + */ }; #endif diff --git a/12_Mortons/app_resources/mortonTest.comp.hlsl b/12_Mortons/app_resources/test.comp.hlsl similarity index 79% rename from 12_Mortons/app_resources/mortonTest.comp.hlsl rename to 
12_Mortons/app_resources/test.comp.hlsl index 7041568b8..243983d5a 100644 --- a/12_Mortons/app_resources/mortonTest.comp.hlsl +++ b/12_Mortons/app_resources/test.comp.hlsl @@ -1,9 +1,8 @@ //// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". //// For conditions of distribution and use, see copyright notice in nabla.h -#pragma shader_stage(compute) -#include "common.hlsl" +#include "testCommon.hlsl" [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -12,5 +11,5 @@ void main(uint3 invocationID : SV_DispatchThreadID) { if (invocationID.x == 0) - outputTestValues[0].fillTestValues(inputTestValues[0]); + fillTestValues(inputTestValues[0], outputTestValues[0]); } diff --git a/12_Mortons/app_resources/testCommon.hlsl b/12_Mortons/app_resources/testCommon.hlsl new file mode 100644 index 000000000..9ff9a4fa8 --- /dev/null +++ b/12_Mortons/app_resources/testCommon.hlsl @@ -0,0 +1,242 @@ +#include "common.hlsl" + +void fillTestValues(NBL_CONST_REF_ARG(InputTestValues) input, NBL_REF_ARG(TestValues) output) +{ + emulated_uint64_t emulatedA = _static_cast(input.generatedA); + emulated_uint64_t emulatedB = _static_cast(input.generatedB); + + // Emulated int tests + output.emulatedAnd = emulatedA & emulatedB; + output.emulatedOr = emulatedA | emulatedB; + output.emulatedXor = emulatedA ^ emulatedB; + output.emulatedNot = emulatedA.operator~(); + output.emulatedPlus = emulatedA + emulatedB; + output.emulatedMinus = emulatedA - emulatedB; + output.emulatedLess = uint32_t(emulatedA < emulatedB); + output.emulatedLessEqual = uint32_t(emulatedA <= emulatedB); + output.emulatedGreater = uint32_t(emulatedA > emulatedB); + output.emulatedGreaterEqual = uint32_t(emulatedA >= emulatedB); + + left_shift_operator leftShift; + output.emulatedLeftShifted = leftShift(emulatedA, input.shift); + + arithmetic_right_shift_operator unsignedRightShift; + 
output.emulatedUnsignedRightShifted = unsignedRightShift(emulatedA, input.shift); + + arithmetic_right_shift_operator signedRightShift; + output.emulatedSignedRightShifted = signedRightShift(_static_cast(emulatedA), input.shift); + + // Morton tests + uint64_t2 Vec2A = { input.coordX, input.coordY }; + uint64_t2 Vec2B = { input.coordZ, input.coordW }; + + uint64_t3 Vec3A = { input.coordX, input.coordY, input.coordZ }; + uint64_t3 Vec3B = { input.coordY, input.coordZ, input.coordW }; + + uint64_t4 Vec4A = { input.coordX, input.coordY, input.coordZ, input.coordW }; + uint64_t4 Vec4B = { input.coordY, input.coordZ, input.coordW, input.coordX }; + + int64_t2 Vec2ASigned = int64_t2(Vec2A); + int64_t2 Vec2BSigned = int64_t2(Vec2B); + + int64_t3 Vec3ASigned = int64_t3(Vec3A); + int64_t3 Vec3BSigned = int64_t3(Vec3B); + + int64_t4 Vec4ASigned = int64_t4(Vec4A); + int64_t4 Vec4BSigned = int64_t4(Vec4B); + + morton::code morton_small_2A = morton::code::create(Vec2A); + morton::code morton_medium_2A = morton::code::create(Vec2A); + morton::code morton_full_2A = morton::code::create(Vec2A); + morton::code morton_emulated_2A = morton::code::create(Vec2A); + morton::code morton_small_2B = morton::code::create(Vec2B); + morton::code morton_medium_2B = morton::code::create(Vec2B); + morton::code morton_full_2B = morton::code::create(Vec2B); + morton::code morton_emulated_2B = morton::code::create(Vec2B); + + morton::code morton_small_3A = morton::code::create(Vec3A); + morton::code morton_medium_3A = morton::code::create(Vec3A); + morton::code morton_full_3A = morton::code::create(Vec3A); + morton::code morton_emulated_3A = morton::code::create(Vec3A); + morton::code morton_small_3B = morton::code::create(Vec3B); + morton::code morton_medium_3B = morton::code::create(Vec3B); + morton::code morton_full_3B = morton::code::create(Vec3B); + morton::code morton_emulated_3B = morton::code::create(Vec3B); + + morton::code morton_small_4A = morton::code::create(Vec4A); + morton::code 
morton_medium_4A = morton::code::create(Vec4A); + morton::code morton_full_4A = morton::code::create(Vec4A); + morton::code morton_emulated_4A = morton::code::create(Vec4A); + morton::code morton_small_4B = morton::code::create(Vec4B); + morton::code morton_medium_4B = morton::code::create(Vec4B); + morton::code morton_full_4B = morton::code::create(Vec4B); + morton::code morton_emulated_4B = morton::code::create(Vec4B); + + morton::code morton_small_2_signed = morton::code::create(Vec2ASigned); + morton::code morton_medium_2_signed = morton::code::create(Vec2ASigned); + morton::code morton_full_2_signed = morton::code::create(Vec2ASigned); + + morton::code morton_small_3_signed = morton::code::create(Vec3ASigned); + morton::code morton_medium_3_signed = morton::code::create(Vec3ASigned); + morton::code morton_full_3_signed = morton::code::create(Vec3ASigned); + + morton::code morton_small_4_signed = morton::code::create(Vec4ASigned); + morton::code morton_medium_4_signed = morton::code::create(Vec4ASigned); + morton::code morton_full_4_signed = morton::code::create(Vec4ASigned); + + // Plus + output.mortonPlus_small_2 = morton_small_2A + morton_small_2B; + output.mortonPlus_medium_2 = morton_medium_2A + morton_medium_2B; + output.mortonPlus_full_2 = morton_full_2A + morton_full_2B; + output.mortonPlus_emulated_2 = morton_emulated_2A + morton_emulated_2B; + + output.mortonPlus_small_3 = morton_small_3A + morton_small_3B; + output.mortonPlus_medium_3 = morton_medium_3A + morton_medium_3B; + output.mortonPlus_full_3 = morton_full_3A + morton_full_3B; + output.mortonPlus_emulated_3 = morton_emulated_3A + morton_emulated_3B; + + output.mortonPlus_small_4 = morton_small_4A + morton_small_4B; + output.mortonPlus_medium_4 = morton_medium_4A + morton_medium_4B; + output.mortonPlus_full_4 = morton_full_4A + morton_full_4B; + output.mortonPlus_emulated_4 = morton_emulated_4A + morton_emulated_4B; + + // Minus + output.mortonMinus_small_2 = morton_small_2A - morton_small_2B; 
+ output.mortonMinus_medium_2 = morton_medium_2A - morton_medium_2B; + output.mortonMinus_full_2 = morton_full_2A - morton_full_2B; + output.mortonMinus_emulated_2 = morton_emulated_2A - morton_emulated_2B; + + output.mortonMinus_small_3 = morton_small_3A - morton_small_3B; + output.mortonMinus_medium_3 = morton_medium_3A - morton_medium_3B; + output.mortonMinus_full_3 = morton_full_3A - morton_full_3B; + output.mortonMinus_emulated_3 = morton_emulated_3A - morton_emulated_3B; + + output.mortonMinus_small_4 = morton_small_4A - morton_small_4B; + output.mortonMinus_medium_4 = morton_medium_4A - morton_medium_4B; + output.mortonMinus_full_4 = morton_full_4A - morton_full_4B; + output.mortonMinus_emulated_4 = morton_emulated_4A - morton_emulated_4B; + + // Coordinate-wise equality + output.mortonEqual_small_2 = uint32_t2(morton_small_2A.equal(uint16_t2(Vec2B))); + output.mortonEqual_medium_2 = uint32_t2(morton_medium_2A.equal(uint16_t2(Vec2B))); + output.mortonEqual_full_2 = uint32_t2(morton_full_2A.equal(uint32_t2(Vec2B))); + output.mortonEqual_emulated_2 = uint32_t2(morton_emulated_2A.equal(uint32_t2(Vec2B))); + + output.mortonEqual_small_3 = uint32_t3(morton_small_3A.equal(uint16_t3(Vec3B))); + output.mortonEqual_medium_3 = uint32_t3(morton_medium_3A.equal(uint16_t3(Vec3B))); + output.mortonEqual_full_3 = uint32_t3(morton_full_3A.equal(uint32_t3(Vec3B))); + output.mortonEqual_emulated_3 = uint32_t3(morton_emulated_3A.equal(uint32_t3(Vec3B))); + + output.mortonEqual_small_4 = uint32_t4(morton_small_4A.equal(uint16_t4(Vec4B))); + output.mortonEqual_medium_4 = uint32_t4(morton_medium_4A.equal(uint16_t4(Vec4B))); + output.mortonEqual_full_4 = uint32_t4(morton_full_4A.equal(uint16_t4(Vec4B))); + + // Coordinate-wise unsigned inequality (just testing with less) + output.mortonUnsignedLess_small_2 = uint32_t2(morton_small_2A.lessThan(uint16_t2(Vec2B))); + output.mortonUnsignedLess_medium_2 = uint32_t2(morton_medium_2A.lessThan(uint16_t2(Vec2B))); + 
output.mortonUnsignedLess_full_2 = uint32_t2(morton_full_2A.lessThan(uint32_t2(Vec2B))); + output.mortonUnsignedLess_emulated_2 = uint32_t2(morton_emulated_2A.lessThan(uint32_t2(Vec2B))); + + output.mortonUnsignedLess_small_3 = uint32_t3(morton_small_3A.lessThan(uint16_t3(Vec3B))); + output.mortonUnsignedLess_medium_3 = uint32_t3(morton_medium_3A.lessThan(uint16_t3(Vec3B))); + output.mortonUnsignedLess_full_3 = uint32_t3(morton_full_3A.lessThan(uint32_t3(Vec3B))); + output.mortonUnsignedLess_emulated_3 = uint32_t3(morton_emulated_3A.lessThan(uint32_t3(Vec3B))); + + output.mortonUnsignedLess_small_4 = uint32_t4(morton_small_4A.lessThan(uint16_t4(Vec4B))); + output.mortonUnsignedLess_medium_4 = uint32_t4(morton_medium_4A.lessThan(uint16_t4(Vec4B))); + output.mortonUnsignedLess_full_4 = uint32_t4(morton_full_4A.lessThan(uint16_t4(Vec4B))); + + // Coordinate-wise signed inequality + output.mortonSignedLess_small_2 = uint32_t2(morton_small_2_signed.lessThan(int16_t2(Vec2BSigned))); + output.mortonSignedLess_medium_2 = uint32_t2(morton_medium_2_signed.lessThan(int16_t2(Vec2BSigned))); + output.mortonSignedLess_full_2 = uint32_t2(morton_full_2_signed.lessThan(int32_t2(Vec2BSigned))); + + output.mortonSignedLess_small_3 = uint32_t3(morton_small_3_signed.lessThan(int16_t3(Vec3BSigned))); + output.mortonSignedLess_medium_3 = uint32_t3(morton_medium_3_signed.lessThan(int16_t3(Vec3BSigned))); + output.mortonSignedLess_full_3 = uint32_t3(morton_full_3_signed.lessThan(int32_t3(Vec3BSigned))); + + output.mortonSignedLess_small_4 = uint32_t4(morton_small_4_signed.lessThan(int16_t4(Vec4BSigned))); + output.mortonSignedLess_medium_4 = uint32_t4(morton_medium_4_signed.lessThan(int16_t4(Vec4BSigned))); + output.mortonSignedLess_full_4 = uint32_t4(morton_full_4_signed.lessThan(int16_t4(Vec4BSigned))); + + // Cast to uint16_t which is what left shift for Mortons expect + uint16_t castedShift = uint16_t(input.shift); + // Each left shift clamps to correct bits so the result kinda makes 
sense + // Left-shift + left_shift_operator > leftShiftSmall2; + output.mortonLeftShift_small_2 = leftShiftSmall2(morton_small_2A, castedShift % smallBits_2); + left_shift_operator > leftShiftMedium2; + output.mortonLeftShift_medium_2 = leftShiftMedium2(morton_medium_2A, castedShift % mediumBits_2); + left_shift_operator > leftShiftFull2; + output.mortonLeftShift_full_2 = leftShiftFull2(morton_full_2A, castedShift % fullBits_2); + left_shift_operator > leftShiftEmulated2; + output.mortonLeftShift_emulated_2 = leftShiftEmulated2(morton_emulated_2A, castedShift % fullBits_2); + + left_shift_operator > leftShiftSmall3; + output.mortonLeftShift_small_3 = leftShiftSmall3(morton_small_3A, castedShift % smallBits_3); + left_shift_operator > leftShiftMedium3; + output.mortonLeftShift_medium_3 = leftShiftMedium3(morton_medium_3A, castedShift % mediumBits_3); + left_shift_operator > leftShiftFull3; + output.mortonLeftShift_full_3 = leftShiftFull3(morton_full_3A, castedShift % fullBits_3); + left_shift_operator > leftShiftEmulated3; + output.mortonLeftShift_emulated_3 = leftShiftEmulated3(morton_emulated_3A, castedShift % fullBits_3); + + left_shift_operator > leftShiftSmall4; + output.mortonLeftShift_small_4 = leftShiftSmall4(morton_small_4A, castedShift % smallBits_4); + left_shift_operator > leftShiftMedium4; + output.mortonLeftShift_medium_4 = leftShiftMedium4(morton_medium_4A, castedShift % mediumBits_4); + left_shift_operator > leftShiftFull4; + output.mortonLeftShift_full_4 = leftShiftFull4(morton_full_4A, castedShift % fullBits_4); + left_shift_operator > leftShiftEmulated4; + output.mortonLeftShift_emulated_4 = leftShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4); + + // Unsigned right-shift + arithmetic_right_shift_operator > rightShiftSmall2; + output.mortonUnsignedRightShift_small_2 = rightShiftSmall2(morton_small_2A, castedShift % smallBits_2); + arithmetic_right_shift_operator > rightShiftMedium2; + output.mortonUnsignedRightShift_medium_2 = 
rightShiftMedium2(morton_medium_2A, castedShift % mediumBits_2); + arithmetic_right_shift_operator > rightShiftFull2; + output.mortonUnsignedRightShift_full_2 = rightShiftFull2(morton_full_2A, castedShift % fullBits_2); + arithmetic_right_shift_operator > rightShiftEmulated2; + output.mortonUnsignedRightShift_emulated_2 = rightShiftEmulated2(morton_emulated_2A, castedShift % fullBits_2); + + arithmetic_right_shift_operator > rightShiftSmall3; + output.mortonUnsignedRightShift_small_3 = rightShiftSmall3(morton_small_3A, castedShift % smallBits_3); + arithmetic_right_shift_operator > rightShiftMedium3; + output.mortonUnsignedRightShift_medium_3 = rightShiftMedium3(morton_medium_3A, castedShift % mediumBits_3); + arithmetic_right_shift_operator > rightShiftFull3; + output.mortonUnsignedRightShift_full_3 = rightShiftFull3(morton_full_3A, castedShift % fullBits_3); + arithmetic_right_shift_operator > rightShiftEmulated3; + output.mortonUnsignedRightShift_emulated_3 = rightShiftEmulated3(morton_emulated_3A, castedShift % fullBits_3); + + arithmetic_right_shift_operator > rightShiftSmall4; + output.mortonUnsignedRightShift_small_4 = rightShiftSmall4(morton_small_4A, castedShift % smallBits_4); + arithmetic_right_shift_operator > rightShiftMedium4; + output.mortonUnsignedRightShift_medium_4 = rightShiftMedium4(morton_medium_4A, castedShift % mediumBits_4); + arithmetic_right_shift_operator > rightShiftFull4; + output.mortonUnsignedRightShift_full_4 = rightShiftFull4(morton_full_4A, castedShift % fullBits_4); + arithmetic_right_shift_operator > rightShiftEmulated4; + output.mortonUnsignedRightShift_emulated_4 = rightShiftEmulated4(morton_emulated_4A, castedShift % fullBits_4); + + // Signed right-shift + arithmetic_right_shift_operator > rightShiftSignedSmall2; + output.mortonSignedRightShift_small_2 = rightShiftSignedSmall2(morton_small_2_signed, castedShift % smallBits_2); + arithmetic_right_shift_operator > rightShiftSignedMedium2; + 
output.mortonSignedRightShift_medium_2 = rightShiftSignedMedium2(morton_medium_2_signed, castedShift % mediumBits_2); + arithmetic_right_shift_operator > rightShiftSignedFull2; + output.mortonSignedRightShift_full_2 = rightShiftSignedFull2(morton_full_2_signed, castedShift % fullBits_2); + + arithmetic_right_shift_operator > rightShiftSignedSmall3; + output.mortonSignedRightShift_small_3 = rightShiftSignedSmall3(morton_small_3_signed, castedShift % smallBits_3); + arithmetic_right_shift_operator > rightShiftSignedMedium3; + output.mortonSignedRightShift_medium_3 = rightShiftSignedMedium3(morton_medium_3_signed, castedShift % mediumBits_3); + arithmetic_right_shift_operator > rightShiftSignedFull3; + output.mortonSignedRightShift_full_3 = rightShiftSignedFull3(morton_full_3_signed, castedShift % fullBits_3); + + arithmetic_right_shift_operator > rightShiftSignedSmall4; + output.mortonSignedRightShift_small_4 = rightShiftSignedSmall4(morton_small_4_signed, castedShift % smallBits_4); + arithmetic_right_shift_operator > rightShiftSignedMedium4; + output.mortonSignedRightShift_medium_4 = rightShiftSignedMedium4(morton_medium_4_signed, castedShift % mediumBits_4); + arithmetic_right_shift_operator > rightShiftSignedFull4; + output.mortonSignedRightShift_full_4 = rightShiftSignedFull4(morton_full_4_signed, castedShift % fullBits_4); +} \ No newline at end of file diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp index f83c49b9e..18fd067ec 100644 --- a/12_Mortons/main.cpp +++ b/12_Mortons/main.cpp @@ -10,7 +10,7 @@ #include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" #include "app_resources/common.hlsl" -#include "Tester.h" +#include "CTester.h" using namespace nbl::core; using namespace nbl::hlsl; @@ -35,24 +35,21 @@ class MortonTest final : public MonoDeviceApplication, public MonoAssetManagerAn return false; if (!asset_base_t::onAppInitialized(std::move(system))) return false; - { - - } - Tester::PipelineSetupData pplnSetupData; + 
CTester::PipelineSetupData pplnSetupData; pplnSetupData.device = m_device; pplnSetupData.api = m_api; pplnSetupData.assetMgr = m_assetMgr; pplnSetupData.logger = m_logger; pplnSetupData.physicalDevice = m_physicalDevice; pplnSetupData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); + // Some tests with mortons with emulated uint storage were cut off, it should be fine since each tested on their own produces correct results for each operator { - Tester mortonTester; - pplnSetupData.testShaderPath = "app_resources/mortonTest.comp.hlsl"; + CTester mortonTester; + pplnSetupData.testShaderPath = "app_resources/test.comp.hlsl"; mortonTester.setupPipeline(pplnSetupData); mortonTester.performTests(); } - return true; } From f05dec4652d1af3fa1a4664760efb1f3e934134a Mon Sep 17 00:00:00 2001 From: Fletterio Date: Mon, 28 Apr 2025 15:29:40 -0300 Subject: [PATCH 9/9] Clarifying comment for blocker issue --- 12_Mortons/main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/12_Mortons/main.cpp b/12_Mortons/main.cpp index 18fd067ec..a05e61842 100644 --- a/12_Mortons/main.cpp +++ b/12_Mortons/main.cpp @@ -44,6 +44,7 @@ class MortonTest final : public MonoDeviceApplication, public MonoAssetManagerAn pplnSetupData.physicalDevice = m_physicalDevice; pplnSetupData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); // Some tests with mortons with emulated uint storage were cut off, it should be fine since each tested on their own produces correct results for each operator + // Blocked by https://github.com/KhronosGroup/SPIRV-Tools/issues/6104 { CTester mortonTester; pplnSetupData.testShaderPath = "app_resources/test.comp.hlsl";