From c23050c1f3d7ffb9c7a9d351b4d47199580d71dd Mon Sep 17 00:00:00 2001
From: keptsecret
Date: Mon, 26 May 2025 11:42:58 +0700
Subject: [PATCH 01/14] removed redundant includes

---
 include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
index e4a71bdffc..9f62743c1a 100644
--- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
+++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
@@ -4,14 +4,10 @@
 #ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_
 #define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_
 
-
 #include "nbl/builtin/hlsl/functional.hlsl"
-#include "nbl/builtin/hlsl/workgroup/ballot.hlsl"
-#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
 #include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl"
 
-
 namespace nbl
 {
 namespace hlsl

From 0ccd13f00be22abd846c05a427f2a38bfa02c5e3 Mon Sep 17 00:00:00 2001
From: keptsecret
Date: Mon, 26 May 2025 15:06:45 +0700
Subject: [PATCH 02/14] added atomic store, load; int64 specializations for
 the others

---
 .../builtin/hlsl/spirv_intrinsics/core.hlsl | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
index 4885fc11f8..167c2fe5c7 100644
--- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
+++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
@@ -180,6 +180,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicAnd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicAnd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicAnd)]]
+enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicAnd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicAnd)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicAnd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicOr)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicOr([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -188,6 +198,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicOr)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicOr(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicOr)]]
+enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicOr([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicOr)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicOr(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicXor)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicXor([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -196,6 +216,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicXor)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicXor(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicXor)]]
+enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicXor([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicXor)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicXor(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename Signed>
 [[vk::ext_instruction( spv::OpAtomicSMin )]]
 enable_if_t<is_same_v<Signed,int32_t>, Signed> atomicSMin([[vk::ext_reference]] int32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
@@ -204,6 +234,16 @@ template<typename Signed, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicSMin)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed,int32_t>, Signed> atomicSMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
 
+template<typename Signed>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicSMin )]]
+enable_if_t<is_same_v<Signed,int64_t>, Signed> atomicSMin([[vk::ext_reference]] Signed ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
+template<typename Signed, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicSMin)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed,int64_t>, Signed> atomicSMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
 template<typename Unsigned>
 [[vk::ext_instruction( spv::OpAtomicUMin )]]
 enable_if_t<is_same_v<Unsigned,uint32_t>, Unsigned> atomicUMin([[vk::ext_reference]] Unsigned ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
@@ -212,6 +252,16 @@ template<typename Unsigned, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicUMin)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned,uint32_t>, Unsigned> atomicUMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
 
+template<typename Unsigned>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicUMin )]]
+enable_if_t<is_same_v<Unsigned,uint64_t>, Unsigned> atomicUMin([[vk::ext_reference]] Unsigned ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
+template<typename Unsigned, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicUMin)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned,uint64_t>, Unsigned> atomicUMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
 template<typename Signed>
 [[vk::ext_instruction( spv::OpAtomicSMax )]]
 enable_if_t<is_same_v<Signed,int32_t>, Signed> atomicSMax([[vk::ext_reference]] Signed ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
@@ -220,6 +270,16 @@ template<typename Signed, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicSMax)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed,int32_t>, Signed> atomicSMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
 
+template<typename Signed>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicSMax )]]
+enable_if_t<is_same_v<Signed,int64_t>, Signed> atomicSMax([[vk::ext_reference]] Signed ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
+template<typename Signed, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicSMax)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed,int64_t>, Signed> atomicSMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
 template<typename Unsigned>
 [[vk::ext_instruction( spv::OpAtomicUMax )]]
 enable_if_t<is_same_v<Unsigned,uint32_t>, Unsigned> atomicUMax([[vk::ext_reference]] uint32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
@@ -228,6 +288,16 @@ template<typename Unsigned, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicUMax)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned,uint32_t>, Unsigned> atomicUMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
 
+template<typename Unsigned>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicUMax )]]
+enable_if_t<is_same_v<Unsigned,uint64_t>, Unsigned> atomicUMax([[vk::ext_reference]] Unsigned ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
+template<typename Unsigned, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicUMax)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned,uint64_t>, Unsigned> atomicUMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicExchange)]]
 T atomicExchange([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -244,6 +314,22 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicCompareExchange)]]
 enable_if_t<is_spirv_type_v<Ptr_T>, T> atomicCompareExchange(Ptr_T ptr, uint32_t memoryScope, uint32_t memSemanticsEqual, uint32_t memSemanticsUnequal, T value, T comparator);
 
+template<typename T>
+[[vk::ext_instruction(spv::OpAtomicLoad)]]
+T atomicLoad([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_instruction(spv::OpAtomicLoad)]]
+enable_if_t<is_spirv_type_v<Ptr_T>, T> atomicLoad(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics);
+
+template<typename T>
+[[vk::ext_instruction(spv::OpAtomicStore)]]
+void atomicStore([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_instruction(spv::OpAtomicStore)]]
+enable_if_t<is_spirv_type_v<Ptr_T>, void> atomicStore(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
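+// Illustrative usage sketch only (`slot` is a placeholder for any SPIR-V pointer to a
+// device-visible uint32_t, e.g. obtained via bda::__ptr<uint32_t>::deref().__get_spv_ptr()):
+//   uint32_t seen = spirv::atomicLoad(slot, spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
+//   spirv::atomicStore(slot, spv::ScopeDevice, spv::MemorySemanticsReleaseMask, seen+1u);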
+
 template
 __NBL_CAPABILITY_PhysicalStorageBufferAddresses

From 7e1b0c31b96b01527563fe5b3bf6cdf85fa72bf4 Mon Sep 17 00:00:00 2001
From: keptsecret
Date: Mon, 26 May 2025 16:39:18 +0700
Subject: [PATCH 03/14] removed unused files in hlsl/scan

---
 .../nbl/builtin/hlsl/scan/declarations.hlsl   |  66 ------
 .../builtin/hlsl/scan/default_scheduler.hlsl  | 221 ------------------
 .../nbl/builtin/hlsl/scan/descriptors.hlsl    |   3 -
 include/nbl/builtin/hlsl/scan/direct.hlsl     |  50 ----
 include/nbl/builtin/hlsl/scan/indirect.hlsl   |  48 ----
 .../builtin/hlsl/scan/parameters_struct.hlsl  |  30 ---
 .../builtin/hlsl/scan/virtual_workgroup.hlsl  |  92 --------
 7 files changed, 510 deletions(-)
 delete mode 100644 include/nbl/builtin/hlsl/scan/declarations.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/default_scheduler.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/descriptors.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/direct.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/indirect.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/parameters_struct.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl

diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl
deleted file mode 100644
index 2d2e66e66d..0000000000
--- a/include/nbl/builtin/hlsl/scan/declarations.hlsl
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_
-#define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_
-
-// REVIEW: Not sure if this file is needed in HLSL implementation
-
-#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl"
-
-
-#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-	Parameters_t getParameters();
-}
-}
-}
-#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-	template<class Storage_t>
-	void getData(
-		inout Storage_t data,
-		in uint
levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel - ); -} -} -} -#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -#endif - -#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - template - void setData( - in Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel, - in bool inRange - ); -} -} -} -#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -#endif - -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl deleted file mode 100644 index 450368475d..0000000000 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ /dev/null @@ -1,221 +0,0 @@ -#ifndef _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ -#define _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ - -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - struct DefaultSchedulerParameters_t - { - uint finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; - uint cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; - - }; -} -} -} - -#ifdef __cplusplus -#undef uint -#else - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ -namespace scheduler -{ - /** - * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation - * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions - * that are used for synchronization etc. - * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is - * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Stone-Kogge adder). The result of - * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and - * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. The next level of the Blelloch Scan will - * have an array of size 40. Only a single workgroup will be needed to work on that. After that array is scanned, we use the results - * in the downsweep phase of Blelloch Scan. - * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. - * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. - * The intermediate results are stored in a scratch buffer. That buffer's size is is the sum of the element-array size for all the - * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40. - * - * Parameter meaning: - * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer - * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize) - * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. 
It is used when storing the REDUCTION result of each workgroup scan - * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep) - * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps - * synchronizing between workgroups with while-loop spinning. - */ - void computeParameters(in uint elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) - { -#define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u) - _scanParams.lastElement[0] = elementCount-1u; - _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0])/_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - // REVIEW: _NBL_HLSL_WORKGROUP_SIZE_LOG2_ is defined in files that include THIS file. Why not query the API for workgroup size at runtime? - - for (uint i=0; i>_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - i = next; - } - _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); - _schedulerParams.finishedFlagOffset[0] = 0u; - switch(_scanParams.topLevel) - { - case 1u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u; - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = 1u; - - _scanParams.temporaryStorageOffset[0] = 2u; - break; - case 2u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; - _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; - // climb down - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - break; - case 3u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u; - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2); - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u; - // climb down - _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u; - _schedulerParams.finishedFlagOffset[5] = 
_schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2); - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1); - break; - default: - break; -#if NBL_BUILTIN_MAX_SCAN_LEVELS>7 -#error "Switch needs more cases" -#endif - } -#undef WorkgroupCount - } - - /** - * treeLevel - the current level in the Blelloch Scan - * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. - * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. - */ - template - bool getWork(in DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex) - { - ScratchAccessor sharedScratch; - if(SubgroupContiguousIndex() == 0u) - { - uint64_t original; - InterlockedAdd(scanScratch.workgroupsStarted, 1u, original); // REVIEW: Refactor InterlockedAdd with GLSL terminology? // TODO (PentaKon): Refactor this when the ScanScratch descriptor set is declared - sharedScratch.set(SubgroupContiguousIndex(), original); - } - else if (SubgroupContiguousIndex() == 1u) - { - sharedScratch.set(SubgroupContiguousIndex(), 0u); - } - GroupMemoryBarrierWithGroupSync(); // REVIEW: refactor this somewhere with GLSL terminology? - - const uint globalWorkgroupIndex; // does every thread need to know? - sharedScratch.get(0u, globalWorkgroupIndex); - const uint lastLevel = topLevel<<1u; - if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()]) - { - InterlockedAdd(sharedScratch.get(1u, ?), 1u); // REVIEW: The way scratchaccessoradaptor is implemented (e.g. under subgroup/arithmetic_portability) doesn't allow for atomic ops on the scratch buffer. Should we ask for another implementation that overrides the [] operator ? - } - GroupMemoryBarrierWithGroupSync(); // TODO (PentaKon): Possibly refactor? - - sharedScratch.get(1u, treeLevel); - if(treeLevel>lastLevel) - return true; - - localWorkgroupIndex = globalWorkgroupIndex; - const bool dependentLevel = treeLevel != 0u; - if(dependentLevel) - { - const uint prevLevel = treeLevel - 1u; - localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; - if(SubgroupContiguousIndex() == 0u) - { - uint dependentsCount = 1u; - if(treeLevel <= topLevel) - { - dependentsCount = _NBL_HLSL_WORKGROUP_SIZE_; // REVIEW: Defined in the files that include this file? 
- const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; - if (lastWorkgroup) - { - const Parameters_t scanParams = getParameters(); // TODO (PentaKon): Undeclared as of now, this should return the Parameters_t from the push constants of (in)direct shader - dependentsCount = scanParams.lastElement[treeLevel]+1u; - if (treeLeveltopLevel) // !(prevLevel globallycoherent \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl deleted file mode 100644 index 325a08e3f0..0000000000 --- a/include/nbl/builtin/hlsl/scan/direct.hlsl +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef _NBL_HLSL_WORKGROUP_SIZE_ -#define _NBL_HLSL_WORKGROUP_SIZE_ 256 -#endif - -#include "nbl/builtin/hlsl/scan/descriptors.hlsl" -#include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl" -#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ -#ifndef _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_ - cbuffer PC // REVIEW: register and packoffset selection - { - Parameters_t scanParams; - DefaultSchedulerParameters_t schedulerParams; - }; -#define _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_ -#endif - -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -Parameters_t getParameters() -{ - return pc.scanParams; -} -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -#endif - -#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -DefaultSchedulerParameters_t getSchedulerParameters() -{ - return pc.schedulerParams; -} -#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -#endif -} -} -} - -#ifndef _NBL_HLSL_MAIN_DEFINED_ -[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)] -void CSMain() -{ - nbl::hlsl::scan::main(); -} -#define _NBL_HLSL_MAIN_DEFINED_ -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/indirect.hlsl b/include/nbl/builtin/hlsl/scan/indirect.hlsl deleted file mode 100644 index 1191731f65..0000000000 --- a/include/nbl/builtin/hlsl/scan/indirect.hlsl +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef _NBL_HLSL_WORKGROUP_SIZE_ -#define _NBL_HLSL_WORKGROUP_SIZE_ 256 -#define _NBL_HLSL_WORKGROUP_SIZE_LOG2_ 8 -#endif - -#include "nbl/builtin/hlsl/scan/descriptors.hlsl" -#include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl" -#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -Parameters_t scanParams; -Parameters_t getParameters() -{ - return scanParams; -} -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -#endif - -uint getIndirectElementCount(); - -#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -DefaultSchedulerParameters_t schedulerParams; -DefaultSchedulerParameters_t getSchedulerParameters() -{ - scheduler::computeParameters(getIndirectElementCount(),scanParams,schedulerParams); - return schedulerParams; -} -#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -#endif -} -} -} - -#ifndef _NBL_HLSL_MAIN_DEFINED_ -[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)] -void CSMain() -{ - if (bool(nbl::hlsl::scan::getIndirectElementCount())) - nbl::hlsl::scan::main(); -} -#define _NBL_HLSL_MAIN_DEFINED_ -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl b/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl deleted file mode 100644 index bfeba13be2..0000000000 --- a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_ 
-#define _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_ - -#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) - struct Parameters_t { - uint lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; - uint topLevel; - uint temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; - } -} -} -} - -#ifdef __cplusplus -#undef uint -#endif - -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl deleted file mode 100644 index 488bf29012..0000000000 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ /dev/null @@ -1,92 +0,0 @@ -#ifndef _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_ -#define _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_ - -// TODO (PentaKon): Decide if these are needed once we have a clearer picture of the refactor -#include "nbl/builtin/hlsl/limits/numeric.hlsl" -#include "nbl/builtin/hlsl/math/typeless_arithmetic.hlsl" -#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined -#include "nbl/builtin/hlsl/scan/declarations.hlsl" - -#include "nbl/builtin/hlsl/binops.hlsl" - -#if 0 -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - template - void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex) - { - const Parameters_t params = getParameters(); - const uint levelInvocationIndex = localWorkgroupIndex * _NBL_HLSL_WORKGROUP_SIZE_ + SubgroupContiguousIndex(); - const bool lastInvocationInGroup = SubgroupContiguousIndex() == (_NBL_HLSL_WORKGROUP_SIZE_ - 1); - - const uint lastLevel = params.topLevel << 1u; - const uint pseudoLevel = levelInvocationIndex <= params.lastElement[pseudoLevel]; - - const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; - - Storage_t data = Binop::identity(); - if(inRange) - { - getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); - } - - if(treeLevel < params.topLevel) - { - #error "Must also define some scratch accessor when calling operation()" - data = workgroup::reduction()(data); - } - // REVIEW: missing _TYPE_ check and extra case here - else if (treeLevel != params.topLevel) - { - data = workgroup::inclusive_scan()(data); - } - else - { - data = workgroup::exclusive_scan()(data); - } - setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); - } -} -} -} - -#ifndef _NBL_HLSL_SCAN_MAIN_DEFINED_ // TODO REVIEW: Are these needed, can this logic be refactored? 
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - void main() - { - const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint topLevel = getParameters().topLevel; - // persistent workgroups - while (true) - { - uint treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) - { - return; - } - - virtualWorkgroup(treeLevel,localWorkgroupIndex); - - scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); - } - } -} -} -} -#endif - -#define _NBL_HLSL_SCAN_MAIN_DEFINED_ -#endif - -#endif \ No newline at end of file From 9666ce474a71ae1deea9ac5e9193aa816de2ff56 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 26 May 2025 16:53:49 +0700 Subject: [PATCH 04/14] initial global reduce impl --- examples_tests | 2 +- include/nbl/builtin/hlsl/scan/arithmetic.hlsl | 34 +++++ .../builtin/hlsl/scan/arithmetic_impl.hlsl | 118 ++++++++++++++++++ 3 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 include/nbl/builtin/hlsl/scan/arithmetic.hlsl create mode 100644 include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl diff --git a/examples_tests b/examples_tests index bb3a901b5d..50647e4803 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit bb3a901b5de72b78246af20072f4489960287204 +Subproject commit 50647e4803afbc2f0ddfd1bed9ba6d5e4e180355 diff --git a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl new file mode 100644 index 0000000000..335271f908 --- /dev/null +++ b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl @@ -0,0 +1,34 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_ + +#include "nbl/builtin/hlsl/scan/arithmetic_impl.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct reduction +{ + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) + static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) // scratch bda? + { + impl::reduce fn; + scalar_t value = fn.template __call(dataAccessor, sharedMemScratchAccessor); + return value; + } +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl new file mode 100644 index 0000000000..949ded773e --- /dev/null +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -0,0 +1,118 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
+
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace scan
+{
+
+template<uint16_t _WorkgroupSizeLog2, uint16_t _SubgroupSizeLog2, uint16_t _ItemsPerInvocation>
+struct ScanConfiguration
+{
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation;
+};
+
+namespace impl
+{
+
+template<typename T> // only uint32_t or uint64_t for now?
+struct Constants
+{
+    NBL_CONSTEXPR_STATIC_INLINE T NOT_READY = 0;
+    NBL_CONSTEXPR_STATIC_INLINE T LOCAL_COUNT = T(0x1u) << (sizeof(T)*8-2);
+    NBL_CONSTEXPR_STATIC_INLINE T GLOBAL_COUNT = T(0x1u) << (sizeof(T)*8-1);
+    NBL_CONSTEXPR_STATIC_INLINE T STATUS_MASK = LOCAL_COUNT | GLOBAL_COUNT;
+};
+
+template<class Config, class BinOp, typename T, bool ForwardProgressGuarantees, class device_capabilities>
+struct reduce
+{
+    using constants_t = Constants<T>;
+    using scalar_t = T;
+    using config_t = Config;
+    using arith_config_t = workgroup2::ArithmeticConfiguration<Config::WorkgroupSizeLog2, Config::SubgroupSizeLog2, Config::ItemsPerInvocation>;
+    using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BinOp, device_capabilities>;
+    using binop_t = BinOp;
+
+    template<class DataAccessor, class ScratchAccessor>
+    scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+    {
+        const scalar_t localReduction = workgroup_reduce_t::__call(dataAccessor, sharedMemScratchAccessor);
+
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == WorkgroupSize-1);
+        if (lastInvocation)
+        {
+            bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkgroupID()).deref();
+            spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
+        }
+
+        scalar_t prefix = scalar_t(0);
+        // decoupled lookback
+        if (ForwardProgressGuarantees)
+        {
+            if (lastInvocation) // don't make whole block work and do busy stuff
+            {
+                for (uint32_t prevID = glsl::gl_WorkgroupID()-1; prevID > 0u; prevID--)
+                {
+                    scalar_t value = scalar_t(0);
+                    {
+                        // spin until something is ready
+                        while (value == constants_t::NOT_READY)
+                        {
+                            bda::__ref<scalar_t> scratchPrev = (scratch-1).deref();
+                            value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
+                        }
+                    }
+                    prefix += value & (~constants_t::STATUS_MASK);
+
+                    // last was actually a global sum, we have the prefix, we can quit
+                    if (value & constants_t::GLOBAL_COUNT)
+                        break;
+                }
+            }
+            prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, WorkgroupSize-1);
+        }
+
+        binop_t binop;
+        scalar_t globalReduction = binop(prefix,localReduction);
+        if (lastInvocation)
+        {
+            bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkgroupID()).deref();
+            spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
+        }
+
+        // get last item from scratch
+        uint32_t lastWorkgroup = glsl::gl_NumWorkgroups() - 1;
+        bda::__ref<scalar_t> scratchLast = (scratch + lastWorkgroup).deref();
+        uint32_t value;
+        {
+            // wait until last workgroup does reduction
+            while (value & constants_t::GLOBAL_COUNT)
+            {
+                value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
+            }
+        }
+        return value & (~constants_t::STATUS_MASK);
+    }
+
+    // bda::_ptr scratch ??
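+    // (Summary of the decoupled-lookback protocol above: every workgroup atomically publishes
+    // its local reduction tagged with LOCAL_COUNT into its scratch slot; the last invocation then
+    // walks the predecessors' slots - spinning while NOT_READY - to accumulate a prefix,
+    // re-publishes the running total tagged with GLOBAL_COUNT, and finally all workgroups wait on
+    // the last slot. The top two bits of T carry the status flags, so reduced values must fit in
+    // the remaining bits.)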
+} + +} + +} +} +} + +#endif From fa7151e96e0907e88463bd14a0ce945cfbcb2165 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 2 Jun 2025 13:53:59 +0700 Subject: [PATCH 05/14] get example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 6581ed496d..5e971c8a18 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 6581ed496d2fc41cae1dc5c9ceba10f3bdfc5135 +Subproject commit 5e971c8a1812922bbf36ecd969fdfb56a0d7d880 From 20d56d87b7d4d377584de87f0cbdce98de212e71 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Jun 2025 10:18:53 +0700 Subject: [PATCH 06/14] fix missing bits in reduce --- examples_tests | 2 +- include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples_tests b/examples_tests index 5e971c8a18..ccb6385c5b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 5e971c8a1812922bbf36ecd969fdfb56a0d7d880 +Subproject commit ccb6385c5b40c87842b8a950497d065262a91288 diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index 949ded773e..e67a4b023c 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -21,6 +21,9 @@ struct ScanConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation; + + using arith_config_t = workgroup2::ArithmeticConfiguration; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = arith_config_t::SharedScratchElementCount; }; namespace impl @@ -38,10 +41,10 @@ struct Constants template struct reduce { - using constants_t = Constants; - using scalar_t = T; + using scalar_t = typename BinOp::type_t; + using constants_t = Constants; using config_t = Config; - using arith_config_t = workgroup2::ArithmeticConfiguration; + using arith_config_t = typename Config::arith_config_t; using workgroup_reduce_t = workgroup2::reduction; using binop_t = BinOp; @@ -49,11 +52,12 @@ struct reduce scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) { const scalar_t localReduction = workgroup_reduce_t::__call(dataAccessor, sharedMemScratchAccessor); + bda::__ptr scratch = dataAccessor.getScratchPtr(); // scratch data should be at least T[NumWorkgroups] const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == WorkgroupSize-1); if (lastInvocation) { - bda::__ref scratchId = (scratch + glsl::gl_WorkgroupID()).deref(); + bda::__ref scratchId = (scratch + glsl::gl_WorkgroupID()).deref(); spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT); } @@ -105,8 +109,6 @@ struct reduce } return value & (~constants_t::STATUS_MASK); } - - // bda::_ptr scratch ?? 
} } From 752d943fe04f0a9282009c9c7705c1ac5101f0e1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Jun 2025 16:38:53 +0700 Subject: [PATCH 07/14] bug fixes so shader compiles now, but infinite loop suspected --- examples_tests | 2 +- include/nbl/builtin/hlsl/scan/arithmetic.hlsl | 4 +-- .../builtin/hlsl/scan/arithmetic_impl.hlsl | 33 ++++++++++--------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/examples_tests b/examples_tests index 1710b69862..39d7859d28 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1710b698621796aa767edf7bc940e55e6758c2a8 +Subproject commit 39d7859d2848468f49aef5627bd3f814502a74b5 diff --git a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl index 335271f908..95d7a4c045 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl @@ -18,8 +18,8 @@ struct reduction { using scalar_t = typename BinOp::type_t; - template && ArithmeticSharedMemoryAccessor) - static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) // scratch bda? + template && workgroup2::ArithmeticSharedMemoryAccessor) + static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) { impl::reduce fn; scalar_t value = fn.template __call(dataAccessor, sharedMemScratchAccessor); diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index e67a4b023c..3789d2f35a 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -4,6 +4,7 @@ #ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_ #define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_ +#include "nbl/builtin/hlsl/bda/__ptr.hlsl" #include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" namespace nbl @@ -22,7 +23,7 @@ struct ScanConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation; - using arith_config_t = workgroup2::ArithmeticConfiguration; + using arith_config_t = workgroup2::ArithmeticConfiguration; NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = arith_config_t::SharedScratchElementCount; }; @@ -51,13 +52,13 @@ struct reduce template scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) { - const scalar_t localReduction = workgroup_reduce_t::__call(dataAccessor, sharedMemScratchAccessor); - bda::__ptr scratch = dataAccessor.getScratchPtr(); // scratch data should be at least T[NumWorkgroups] + const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor); + bda::__ptr scratch = dataAccessor.getScratchPtr(); // scratch data should be at least T[NumWorkgroups] - const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == WorkgroupSize-1); + const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1); if (lastInvocation) { - bda::__ref scratchId = (scratch + glsl::gl_WorkgroupID()).deref(); + bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref(); spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT); } @@ -67,15 +68,16 @@ struct reduce { if (lastInvocation) // don't 
make whole block work and do busy stuff { - for (uint32_t prevID = glsl::gl_WorkgroupID()-1; prevID > 0u; prevID--) + for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID > 0u; prevID--) { scalar_t value = scalar_t(0); { // spin until something is ready while (value == constants_t::NOT_READY) { - bda::__ref scratchPrev = (scratch-1).deref(); - value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); + bda::__ref scratchPrev = (scratch-1).deref(); + // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); + value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } } prefix += value & (~constants_t::STATUS_MASK); @@ -85,31 +87,32 @@ struct reduce break; } } - prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, WorkgroupSize-1); + prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, Config::WorkgroupSize-1); } binop_t binop; scalar_t globalReduction = binop(prefix,localReduction); if (lastInvocation) { - bda::__ref scratchId = (scratch + glsl::gl_WorkgroupID()).deref(); + bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref(); spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT); } // get last item from scratch - uint32_t lastWorkgroup = glsl::gl_NumWorkgroups() - 1; - bda::__ref scratchLast = (scratch + lastWorkgroup).deref(); - uint32_t value; + uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1; + bda::__ref scratchLast = (scratch + lastWorkgroup).deref(); + scalar_t value; { // wait until last workgroup does reduction while (value & constants_t::GLOBAL_COUNT) { - value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); + // value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); + value = spirv::atomicIAdd(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } } return value & (~constants_t::STATUS_MASK); } -} +}; } From 6461b360747a2d4378b26a2c0c15fbca1780ebde Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 10 Jun 2025 11:37:10 +0700 Subject: [PATCH 08/14] added branch for no forward progress guarantee (no spin wait) --- examples_tests | 2 +- .../builtin/hlsl/scan/arithmetic_impl.hlsl | 48 +++++++++++++++++-- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/examples_tests b/examples_tests index 39d7859d28..de60cc1137 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 39d7859d2848468f49aef5627bd3f814502a74b5 +Subproject commit de60cc1137b3850ca7c7590123467e18898c5e98 diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index 3789d2f35a..00537015c7 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -68,14 +68,16 @@ struct reduce { if (lastInvocation) // don't make whole block work and do busy stuff { - for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID > 0u; prevID--) + bda::__ptr scratchIter = scratch; + for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) { scalar_t value = scalar_t(0); + scratchIter = scratchIter-1; { // spin until something is ready while (value == constants_t::NOT_READY) { - bda::__ref scratchPrev = (scratch-1).deref(); + 
bda::__ref scratchPrev = scratchIter.deref(); // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } @@ -89,6 +91,44 @@ struct reduce } prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, Config::WorkgroupSize-1); } + else + { + bda::__ptr scratchIter = scratch; + for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) + { + scalar_t value = scalar_t(0); + scratchIter = scratchIter-1; + if (lastInvocation) + { + bda::__ref scratchPrev = scratchIter.deref(); + // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); + value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); + } + value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1); + + if (value & constants_t::STATUS_MASK) + { + prefix += value & (~constants_t::STATUS_MASK); + + if (value & constants_t::GLOBAL_COUNT) + break; + } + else // can't wait/spin, have to do it ourselves + { + sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier(); + + DataAccessor prevDataAccessor = DataAccessor::create(prevID); + const scalar_t prevReduction = workgroup_reduce_t::template __call(prevDataAccessor, sharedMemScratchAccessor); + + // if DoAndRaceStore, stores in place of prev workgroup id as well + // bda::__ref scratchPrev = scratchIter.deref(); + // if (lastInvocation) + // spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT); + + prefix += prevReduction; + } + } + } binop_t binop; scalar_t globalReduction = binop(prefix,localReduction); @@ -101,10 +141,10 @@ struct reduce // get last item from scratch uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1; bda::__ref scratchLast = (scratch + lastWorkgroup).deref(); - scalar_t value; + scalar_t value = scalar_t(0); { // wait until last workgroup does reduction - while (value & constants_t::GLOBAL_COUNT) + while (!(value & constants_t::GLOBAL_COUNT)) { // value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); value = spirv::atomicIAdd(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); From e291940c42bd4ec90b10f9180b6f5954001a6389 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 10 Jun 2025 16:31:51 +0700 Subject: [PATCH 09/14] bug fixes to indexing, forward progress guarantee works now --- .../builtin/hlsl/scan/arithmetic_impl.hlsl | 48 ++++++++++++------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index 00537015c7..aae1185a53 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -62,27 +62,39 @@ struct reduce spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT); } + // NOTE: just for testing, remove when done + // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier(); + // uint32_t prev = glsl::gl_WorkGroupID().x==0 ? 
0 : glsl::gl_WorkGroupID().x-1; + // scalar_t testVal = constants_t::NOT_READY; + // if (lastInvocation) + // while (testVal == constants_t::NOT_READY) + // testVal = spirv::atomicIAdd((scratch + prev).deref().__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); + // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier(); + // testVal = workgroup::Broadcast(testVal, sharedMemScratchAccessor, Config::WorkgroupSize-1); + // return testVal; + + binop_t binop; scalar_t prefix = scalar_t(0); // decoupled lookback if (ForwardProgressGuarantees) { if (lastInvocation) // don't make whole block work and do busy stuff { - bda::__ptr scratchIter = scratch; - for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) + // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) // won't run properly this way for some reason, results in device lost + for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++) { - scalar_t value = scalar_t(0); - scratchIter = scratchIter-1; + const uint32_t prevID = glsl::gl_WorkGroupID().x-i; + scalar_t value = constants_t::NOT_READY; { // spin until something is ready while (value == constants_t::NOT_READY) { - bda::__ref scratchPrev = scratchIter.deref(); + bda::__ref scratchPrev = (scratch + prevID).deref(); // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } } - prefix += value & (~constants_t::STATUS_MASK); + prefix = binop(value & (~constants_t::STATUS_MASK), prefix); // last was actually a global sum, we have the prefix, we can quit if (value & constants_t::GLOBAL_COUNT) @@ -93,14 +105,15 @@ struct reduce } else { - bda::__ptr scratchIter = scratch; - for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) + bda::__ptr scratchIter = scratch + glsl::gl_WorkGroupID().x; + // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) + for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++) { + const uint32_t prevID = glsl::gl_WorkGroupID().x-i; scalar_t value = scalar_t(0); - scratchIter = scratchIter-1; if (lastInvocation) { - bda::__ref scratchPrev = scratchIter.deref(); + bda::__ref scratchPrev = (scratch + prevID).deref(); // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } @@ -108,7 +121,7 @@ struct reduce if (value & constants_t::STATUS_MASK) { - prefix += value & (~constants_t::STATUS_MASK); + prefix = binop(value & (~constants_t::STATUS_MASK), prefix); if (value & constants_t::GLOBAL_COUNT) break; @@ -125,23 +138,23 @@ struct reduce // if (lastInvocation) // spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT); - prefix += prevReduction; + prefix = binop(prevReduction, prefix); } } } - binop_t binop; scalar_t globalReduction = binop(prefix,localReduction); if (lastInvocation) { - bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref(); + bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref(); spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT); } // get last item from scratch - uint32_t lastWorkgroup = 
glsl::gl_NumWorkGroups().x - 1; - bda::__ref scratchLast = (scratch + lastWorkgroup).deref(); - scalar_t value = scalar_t(0); + const uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1; + bda::__ref scratchLast = (scratch + lastWorkgroup).deref(); + scalar_t value = constants_t::NOT_READY; + if (lastInvocation) { // wait until last workgroup does reduction while (!(value & constants_t::GLOBAL_COUNT)) @@ -150,6 +163,7 @@ struct reduce value = spirv::atomicIAdd(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } } + value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1); return value & (~constants_t::STATUS_MASK); } }; From 8665fcc1c96f3a3b74dd14e9daff63650ae5efc2 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 11 Jun 2025 11:35:37 +0700 Subject: [PATCH 10/14] fix to without forward progress guarantee, >2 workgroups broken somehow --- examples_tests | 2 +- include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples_tests b/examples_tests index de60cc1137..b9f515c207 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit de60cc1137b3850ca7c7590123467e18898c5e98 +Subproject commit b9f515c20721e57180a584072be56e6d3b6a1301 diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index aae1185a53..d7591da8d6 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -105,7 +105,6 @@ struct reduce } else { - bda::__ptr scratchIter = scratch + glsl::gl_WorkGroupID().x; // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++) { @@ -113,7 +112,7 @@ struct reduce scalar_t value = scalar_t(0); if (lastInvocation) { - bda::__ref scratchPrev = (scratch + prevID).deref(); + bda::__ref scratchPrev = (scratch + prevID).deref(); // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } @@ -131,10 +130,11 @@ struct reduce sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier(); DataAccessor prevDataAccessor = DataAccessor::create(prevID); + prevDataAccessor.begin(); // prepare data accessor if needed (e.g. 
preload) const scalar_t prevReduction = workgroup_reduce_t::template __call(prevDataAccessor, sharedMemScratchAccessor); // if DoAndRaceStore, stores in place of prev workgroup id as well - // bda::__ref scratchPrev = scratchIter.deref(); + // bda::__ref scratchPrev = (scratch + prevID).deref(); // if (lastInvocation) // spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT); From 7cde6200f295fd2aa8b50a07477109c1ea35fd5b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 11 Jun 2025 16:20:29 +0700 Subject: [PATCH 11/14] fix to atomic load/store intrinsics --- include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl index 167c2fe5c7..c7a3694d3e 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl @@ -316,7 +316,7 @@ enable_if_t, T> atomicCompareExchange(Ptr_T ptr, uint32_t me template [[vk::ext_instruction(spv::OpAtomicLoad)]] -T atomicLoad([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics); +enable_if_t, T> atomicLoad([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics); template // DXC Workaround [[vk::ext_instruction(spv::OpAtomicLoad)]] @@ -324,7 +324,7 @@ enable_if_t, T> atomicLoad(Ptr_T ptr, uint32_t memoryScope, template [[vk::ext_instruction(spv::OpAtomicStore)]] -void atomicStore([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); +enable_if_t, void> atomicStore([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // DXC Workaround [[vk::ext_instruction(spv::OpAtomicStore)]] From e4a8ac26b9cccc9920054a8a7d4c000ec0605000 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 12 Jun 2025 16:40:46 +0700 Subject: [PATCH 12/14] fix global reduction (only plus atm), moved existing to temp scan --- examples_tests | 2 +- include/nbl/builtin/hlsl/scan/arithmetic.hlsl | 7 ++- .../builtin/hlsl/scan/arithmetic_impl.hlsl | 53 +++++++++++-------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/examples_tests b/examples_tests index b9f515c207..794b06704b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit b9f515c20721e57180a584072be56e6d3b6a1301 +Subproject commit 794b06704b611990cc7a6c2dc81d8912db4c747d diff --git a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl index 95d7a4c045..31c596a077 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl @@ -18,12 +18,11 @@ struct reduction { using scalar_t = typename BinOp::type_t; - template && workgroup2::ArithmeticSharedMemoryAccessor) - static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) + template + static void __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) { impl::reduce fn; - scalar_t value = fn.template __call(dataAccessor, sharedMemScratchAccessor); - return value; + fn.template __call(dataAccessor, outputAccessor, statusAccessor, sharedMemScratchAccessor); } }; diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl 
b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index d7591da8d6..1be2b11b8e 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -41,6 +41,30 @@ struct Constants template struct reduce +{ + using scalar_t = typename BinOp::type_t; + using arith_config_t = typename Config::arith_config_t; + using workgroup_reduce_t = workgroup2::reduction; + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) + { + const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor); + + const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1); + if (lastInvocation) + { + // NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180 + // MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently + spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction); + spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u); + } + } +}; + +// TODO: change this to scan, it totally won't work for reduce anyways +template +struct scan { using scalar_t = typename BinOp::type_t; using constants_t = Constants; @@ -59,20 +83,9 @@ struct reduce if (lastInvocation) { bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref(); - spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT); + spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT); } - // NOTE: just for testing, remove when done - // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier(); - // uint32_t prev = glsl::gl_WorkGroupID().x==0 ? 
diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
index d7591da8d6..1be2b11b8e 100644
--- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
+++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
@@ -41,6 +41,30 @@ struct Constants
 
 template
 struct reduce
+{
+    using scalar_t = typename BinOp::type_t;
+    using arith_config_t = typename Config::arith_config_t;
+    using workgroup_reduce_t = workgroup2::reduction;
+
+    template
+    void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+    {
+        const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor);
+
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
+        if (lastInvocation)
+        {
+            // NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180
+            // MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently
+            spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);
+            spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
+        }
+    }
+};
+
+// TODO: change this to scan, it totally won't work for reduce anyways
+template
+struct scan
 {
     using scalar_t = typename BinOp::type_t;
     using constants_t = Constants;
@@ -59,20 +83,9 @@ struct reduce
         if (lastInvocation)
         {
            bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
-            spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
+            spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
         }
 
-        // NOTE: just for testing, remove when done
-        // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();
-        // uint32_t prev = glsl::gl_WorkGroupID().x==0 ? 0 : glsl::gl_WorkGroupID().x-1;
-        // scalar_t testVal = constants_t::NOT_READY;
-        // if (lastInvocation)
-        //     while (testVal == constants_t::NOT_READY)
-        //         testVal = spirv::atomicIAdd((scratch + prev).deref().__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
-        // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();
-        // testVal = workgroup::Broadcast(testVal, sharedMemScratchAccessor, Config::WorkgroupSize-1);
-        // return testVal;
-
         binop_t binop;
         scalar_t prefix = scalar_t(0);
         // decoupled lookback
@@ -90,8 +103,7 @@ struct reduce
                 while (value == constants_t::NOT_READY)
                 {
                     bda::__ref scratchPrev = (scratch + prevID).deref();
-                    // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
-                    value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
+                    value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
                 }
             }
             prefix = binop(value & (~constants_t::STATUS_MASK), prefix);
@@ -113,8 +125,7 @@ struct reduce
             if (lastInvocation)
             {
                 bda::__ref scratchPrev = (scratch + prevID).deref();
-                // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
-                value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
+                value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
             }
             value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
 
@@ -136,18 +147,19 @@ struct reduce
                 // if DoAndRaceStore, stores in place of prev workgroup id as well
                 // bda::__ref scratchPrev = (scratch + prevID).deref();
                 // if (lastInvocation)
-                //     spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT);
+                //     spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT);
                 prefix = binop(prevReduction, prefix);
             }
         }
     }
 
-    scalar_t globalReduction = binop(prefix,localReduction);
+    const scalar_t globalReduction = binop(prefix,localReduction);
+    // TODO globalReduction value changing in following block somehow, double check
     if (lastInvocation)
     {
         bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
-        spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
+        spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
     }
 
     // get last item from scratch
@@ -159,8 +171,7 @@ struct reduce
         // wait until last workgroup does reduction
         while (!(value & constants_t::GLOBAL_COUNT))
         {
-            // value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
-            value = spirv::atomicIAdd(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
+            value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
         }
     }
     value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
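Note: the renamed scan struct keeps the decoupled-lookback scheme (after Merrill and Garland's single-pass prefix scan): every workgroup publishes its partial tagged with LOCAL_COUNT via a Release atomicStore, then the last invocation walks backwards over the predecessors' scratch slots, spinning on Acquire atomicLoads until each slot leaves NOT_READY, folding payloads into a running prefix, and stopping early at any slot tagged GLOBAL_COUNT because that slot already carries a full inclusive prefix. Condensed from the hunks above into one fragment (it omits the workgroup broadcast and the ForwardProgressGuarantees split):

    scalar_t prefix = scalar_t(0);
    uint32_t prevID = glsl::gl_WorkGroupID().x;
    while (prevID != 0u)
    {
        prevID--;
        scalar_t value = constants_t::NOT_READY;
        while (value == constants_t::NOT_READY) // spin until the predecessor publishes
            value = spirv::atomicLoad((scratch + prevID).deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
        prefix = binop(value & (~constants_t::STATUS_MASK), prefix); // strip the status bits
        if (value & constants_t::GLOBAL_COUNT) // inclusive prefix reached, stop the walk
            break;
    }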
From 426fa6b65265086480fc307ea6a391cfdfed1bbf Mon Sep 17 00:00:00 2001
From: keptsecret
Date: Thu, 12 Jun 2025 17:14:54 +0700
Subject: [PATCH 13/14] reduction specializations for other arithmetic ops

---
 examples_tests                             |  2 +-
 .../builtin/hlsl/scan/arithmetic_impl.hlsl | 78 +++++++++++++++++--
 2 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/examples_tests b/examples_tests
index 794b06704b..86c198e67b 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 794b06704b611990cc7a6c2dc81d8912db4c747d
+Subproject commit 86c198e67b5181a3222e390c7062204cd6adca2e
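Note: the next diff replaces the single generic reduce with a forward declaration plus one explicit specialization per binary op, stamped out by a SPECIALIZE(BINOP,ATOMIC_OP) macro that pairs each op with its matching SPIR-V atomic (bit_and with atomicAnd, plus with atomicIAdd, and so on), while multiplies gets a hand-written specialization. As a sanity check, SPECIALIZE(plus,atomicIAdd) would expand to roughly the sketch below; the template parameter lists are reconstructed by hand from the specialization pattern and should be treated as an assumption:

    template<class Config, typename T, bool ForwardProgressGuarantees, class device_capabilities>
    struct reduce<Config, plus<T>, ForwardProgressGuarantees, device_capabilities>
    {
        using scalar_t = T;
        using arith_config_t = typename Config::arith_config_t;
        using workgroup_reduce_t = workgroup2::reduction<arith_config_t, plus<T>, device_capabilities>;

        template<class DataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>
        void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
        {
            // reduce within the workgroup first, then the last invocation publishes
            const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor);
            const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
            if (lastInvocation)
            {
                spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction);
                spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, 1u);
            }
        }
    };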
diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
index 1be2b11b8e..0b041ed09a 100644
--- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
+++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
@@ -39,12 +39,50 @@ struct Constants
     NBL_CONSTEXPR_STATIC_INLINE T STATUS_MASK = LOCAL_COUNT | GLOBAL_COUNT;
 };
 
+// NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180
+// MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently
 template
-struct reduce
+struct reduce;
+
+#define SPECIALIZE(BINOP,ATOMIC_OP) template\
+struct reduce, ForwardProgressGuarantees, device_capabilities>\
+{\
+    using scalar_t = T;\
+    using arith_config_t = typename Config::arith_config_t;\
+    using workgroup_reduce_t = workgroup2::reduction, device_capabilities>;\
+\
+    template\
+    void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)\
+    {\
+        const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor);\
+\
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);\
+        if (lastInvocation)\
+        {\
+            spirv::ATOMIC_OP(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);\
+            spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);\
+        }\
+    }\
+}
+
+SPECIALIZE(bit_and,atomicAnd);
+SPECIALIZE(bit_or,atomicOr);
+SPECIALIZE(bit_xor,atomicXor);
+
+SPECIALIZE(plus,atomicIAdd);
+// there's no atomic multiply so we use a CAS loop
+
+SPECIALIZE(minimum,atomicUMin);
+SPECIALIZE(maximum,atomicUMax);
+
+#undef SPECIALIZE
+
+template
+struct reduce, ForwardProgressGuarantees, device_capabilities>
 {
-    using scalar_t = typename BinOp::type_t;
+    using scalar_t = T;
     using arith_config_t = typename Config::arith_config_t;
-    using workgroup_reduce_t = workgroup2::reduction;
+    using workgroup_reduce_t = workgroup2::reduction, device_capabilities>;
 
     template
     void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
@@ -54,14 +92,42 @@ struct reduce
 
         const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
         if (lastInvocation)
         {
-            // NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180
-            // MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently
-            spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);
+            {
+                scalar_t actual, expected;
+                actual = multiplies::identity;
+                do
+                {
+                    expected = actual;
+                    scalar_t newVal = expected * localReduction;
+                    actual = spirv::atomicCompareExchange(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, spv::MemorySemanticsAcquireMask, newVal, expected);
+                } while (expected != actual);
+            }
             spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
         }
     }
 };
 
+// template
+// struct reduce;
+// {
+//     using scalar_t = typename BinOp::type_t;
+//     using arith_config_t = typename Config::arith_config_t;
+//     using workgroup_reduce_t = workgroup2::reduction;
+
+//     template
+//     void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+//     {
+//         const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor);
+
+//         const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
+//         if (lastInvocation)
+//         {
+//             spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);
+//             spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
+//         }
+//     }
+// };
+
 // TODO: change this to scan, it totally won't work for reduce anyways
 template
 struct scan
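Note on the multiplies specialization: SPIR-V has no atomic multiply, so the diff above emulates one with an atomicCompareExchange loop; it seeds expected with the multiplicative identity, tries to swap in expected*localReduction, and on failure retries against the value the exchange actually observed. The same pattern in stock HLSL for reference, with an invented buffer and function name (a real shader would likely read the current value first rather than seeding with the identity):

    RWStructuredBuffer<uint> gOutput : register(u0); // hypothetical UAV

    void atomicMul(uint slot, uint factor)
    {
        uint expected = gOutput[slot]; // optimistic read of the current value
        uint actual;
        for (;;)
        {
            // swaps in expected*factor only if the slot still holds expected;
            // actual receives whatever the slot held at that moment
            InterlockedCompareExchange(gOutput[slot], expected, expected * factor, actual);
            if (actual == expected)
                break; // our value landed
            expected = actual; // lost the race, retry against the fresh value
        }
    }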
From 57f4559a2057b55c412b0e37825e0835d0570533 Mon Sep 17 00:00:00 2001
From: keptsecret
Date: Fri, 13 Jun 2025 09:55:17 +0700
Subject: [PATCH 14/14] cleaning up reduction

---
 examples_tests                             |  2 +-
 .../builtin/hlsl/scan/arithmetic_impl.hlsl | 21 -------------------
 2 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/examples_tests b/examples_tests
index 86c198e67b..b210d0d867 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 86c198e67b5181a3222e390c7062204cd6adca2e
+Subproject commit b210d0d86781f672f60d256cc56bf3ab078e8715
diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
index 0b041ed09a..a3978df0dc 100644
--- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
+++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
@@ -107,27 +107,6 @@ struct reduce, ForwardProgressGuarantees, device_capabilit
     }
 };
 
-// template
-// struct reduce;
-// {
-//     using scalar_t = typename BinOp::type_t;
-//     using arith_config_t = typename Config::arith_config_t;
-//     using workgroup_reduce_t = workgroup2::reduction;
-
-//     template
-//     void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
-//     {
-//         const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor);
-
-//         const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
-//         if (lastInvocation)
-//         {
-//             spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);
-//             spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
-//         }
-//     }
-// };
-
 // TODO: change this to scan, it totally won't work for reduce anyways
 template
 struct scan
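Note: the producer/consumer handshake throughout this series rests on the Constants status word: each scratch slot packs the payload and a two-bit status into a single 32-bit value, so one atomic makes both visible together. A worked example with hypothetical bit assignments (the real values live in Constants and are not shown in these diffs):

    // assumed layout: two top bits for status, the remaining bits for the payload
    NBL_CONSTEXPR_STATIC_INLINE uint32_t NOT_READY    = 0u;
    NBL_CONSTEXPR_STATIC_INLINE uint32_t LOCAL_COUNT  = 0x40000000u; // partial published
    NBL_CONSTEXPR_STATIC_INLINE uint32_t GLOBAL_COUNT = 0x80000000u; // inclusive prefix published
    NBL_CONSTEXPR_STATIC_INLINE uint32_t STATUS_MASK  = LOCAL_COUNT | GLOBAL_COUNT;

    // publishing a partial result of 42:  42u | LOCAL_COUNT           == 0x4000002A
    // recovering the payload:             0x4000002A & (~STATUS_MASK) == 42
    // checking for an inclusive prefix:   0x4000002A & GLOBAL_COUNT   == 0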