diff --git a/examples_tests b/examples_tests
index 1710b69862..b210d0d867 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 1710b698621796aa767edf7bc940e55e6758c2a8
+Subproject commit b210d0d86781f672f60d256cc56bf3ab078e8715
diff --git a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl
new file mode 100644
index 0000000000..31c596a077
--- /dev/null
+++ b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl
@@ -0,0 +1,33 @@
+// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_
+
+#include "nbl/builtin/hlsl/scan/arithmetic_impl.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace scan
+{
+
+template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
+struct reduction
+{
+    using scalar_t = typename BinOp::type_t;
+
+    template<class ReadOnlyDataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>
+    static void __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+    {
+        impl::reduce<Config, BinOp, ForwardProgressGuarantees, device_capabilities> fn;
+        fn.template __call<ReadOnlyDataAccessor, OutputAccessor, StatusAccessor, ScratchAccessor>(dataAccessor, outputAccessor, statusAccessor, sharedMemScratchAccessor);
+    }
+};
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
new file mode 100644
index 0000000000..a3978df0dc
--- /dev/null
+++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
@@ -0,0 +1,233 @@
+// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
+
+#include "nbl/builtin/hlsl/bda/__ptr.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace scan
+{
+
+template<uint16_t _WorkgroupSizeLog2, uint16_t _SubgroupSizeLog2, uint16_t _ItemsPerInvocation>
+struct ScanConfiguration
+{
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation;
+
+    using arith_config_t = workgroup2::ArithmeticConfiguration<WorkgroupSizeLog2, SubgroupSizeLog2, ItemsPerInvocation>;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = arith_config_t::SharedScratchElementCount;
+};
+
+namespace impl
+{
+
+template<typename T>    // only uint32_t or uint64_t for now?
+struct Constants
+{
+    NBL_CONSTEXPR_STATIC_INLINE T NOT_READY = 0;
+    NBL_CONSTEXPR_STATIC_INLINE T LOCAL_COUNT = T(0x1u) << (sizeof(T)*8-2);
+    NBL_CONSTEXPR_STATIC_INLINE T GLOBAL_COUNT = T(0x1u) << (sizeof(T)*8-1);
+    NBL_CONSTEXPR_STATIC_INLINE T STATUS_MASK = LOCAL_COUNT | GLOBAL_COUNT;
+};
+
+// NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180
+// MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently
+template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
+struct reduce;
+
+#define SPECIALIZE(BINOP,ATOMIC_OP) template<class Config, typename T, bool ForwardProgressGuarantees, class device_capabilities>\
+struct reduce<Config, BINOP<T>, ForwardProgressGuarantees, device_capabilities>\
+{\
+    using scalar_t = T;\
+    using arith_config_t = typename Config::arith_config_t;\
+    using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BINOP<scalar_t>, device_capabilities>;\
+\
+    template<class DataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>\
+    void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)\
+    {\
+        const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);\
+\
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);\
+        if (lastInvocation)\
+        {\
+            spirv::ATOMIC_OP(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);\
+            spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);\
+        }\
+    }\
+}
+
+SPECIALIZE(bit_and,atomicAnd);
+SPECIALIZE(bit_or,atomicOr);
+SPECIALIZE(bit_xor,atomicXor);
+
+SPECIALIZE(plus,atomicIAdd);
+// there's no atomic multiply so we use a CAS loop
+
+SPECIALIZE(minimum,atomicUMin);
+SPECIALIZE(maximum,atomicUMax);
+
+#undef SPECIALIZE
+
+template<class Config, typename T, bool ForwardProgressGuarantees, class device_capabilities>
+struct reduce<Config, multiplies<T>, ForwardProgressGuarantees, device_capabilities>
+{
+    using scalar_t = T;
+    using arith_config_t = typename Config::arith_config_t;
+    using workgroup_reduce_t = workgroup2::reduction<arith_config_t, multiplies<scalar_t>, device_capabilities>;
+
+    template<class DataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>
+    void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+    {
+        const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);
+
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
+        if (lastInvocation)
+        {
+            {
+                scalar_t actual, expected;
+                actual = multiplies<scalar_t>::identity;
+                do
+                {
+                    expected = actual;
+                    scalar_t newVal = expected * localReduction;
+                    actual = spirv::atomicCompareExchange(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, spv::MemorySemanticsAcquireMask, newVal, expected);
+                } while (expected != actual);
+            }
+            spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
+        }
+    }
+};
+
+// TODO: change this to scan, it totally won't work for reduce anyways
+template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
+struct scan
+{
+    using scalar_t = typename BinOp::type_t;
+    using constants_t = Constants<scalar_t>;
+    using config_t = Config;
+    using arith_config_t = typename Config::arith_config_t;
+    using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BinOp, device_capabilities>;
+    using binop_t = BinOp;
+
+    template<class DataAccessor, class ScratchAccessor>
+    scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+    {
+        const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);
+        bda::__ptr<scalar_t> scratch = dataAccessor.getScratchPtr();    // scratch data should be at least T[NumWorkgroups]
+
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
+        if (lastInvocation)
+        {
+            bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
+            spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
+        }
+
+        binop_t binop;
+        scalar_t prefix = scalar_t(0);
+        // decoupled lookback
+        if (ForwardProgressGuarantees)
+        {
+            if (lastInvocation) // don't make whole block work and do busy stuff
+            {
+                // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) // won't run properly this way for some reason, results in device lost
+                for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
+                {
+                    const uint32_t prevID = glsl::gl_WorkGroupID().x-i;
+                    scalar_t value = constants_t::NOT_READY;
+                    {
+                        // spin until something is ready
+                        while (value == constants_t::NOT_READY)
+                        {
+                            bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
+                            value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
+                        }
+                    }
+                    prefix = binop(value & (~constants_t::STATUS_MASK), prefix);
+
+                    // last was actually a global sum, we have the prefix, we can quit
+                    if (value & constants_t::GLOBAL_COUNT)
+                        break;
+                }
+            }
+            prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, Config::WorkgroupSize-1);
+        }
+        else
+        {
+            // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--)
+            for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
+            {
+                const uint32_t prevID = glsl::gl_WorkGroupID().x-i;
+                scalar_t value = scalar_t(0);
+                if (lastInvocation)
+                {
+                    bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
+                    value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
+                }
+                value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
+
+                if (value & constants_t::STATUS_MASK)
+                {
+                    prefix = binop(value & (~constants_t::STATUS_MASK), prefix);
+
+                    if (value & constants_t::GLOBAL_COUNT)
+                        break;
+                }
+                else // can't wait/spin, have to do it ourselves
+                {
+                    sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();
+
+                    DataAccessor prevDataAccessor = DataAccessor::create(prevID);
+                    prevDataAccessor.begin();   // prepare data accessor if needed (e.g. preload)
+                    const scalar_t prevReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(prevDataAccessor, sharedMemScratchAccessor);
+
+                    // if DoAndRaceStore, stores in place of prev workgroup id as well
+                    // bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
+                    // if (lastInvocation)
+                    //     spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT);
+
+                    prefix = binop(prevReduction, prefix);
+                }
+            }
+        }
+
+        const scalar_t globalReduction = binop(prefix,localReduction);
+        // TODO globalReduction value changing in following block somehow, double check
+        if (lastInvocation)
+        {
+            bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
+            spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
+        }
+
+        // get last item from scratch
+        const uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1;
+        bda::__ref<scalar_t> scratchLast = (scratch + lastWorkgroup).deref();
+        scalar_t value = constants_t::NOT_READY;
+        if (lastInvocation)
+        {
+            // wait until last workgroup does reduction
+            while (!(value & constants_t::GLOBAL_COUNT))
+            {
+                value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
+            }
+        }
+        value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
+        return value & (~constants_t::STATUS_MASK);
+    }
+};
+
+}
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl
deleted file mode 100644
index 2d2e66e66d..0000000000
--- a/include/nbl/builtin/hlsl/scan/declarations.hlsl
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_
-#define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_
-
-// REVIEW: Not sure if this file is needed in HLSL implementation
-
-#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl"
-
-
-#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    Parameters_t getParameters();
-}
-}
-}
-#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    template
-    void getData(
-        inout Storage_t data,
-        in uint levelInvocationIndex,
-        in uint localWorkgroupIndex,
-        in uint treeLevel,
-        in uint pseudoLevel
-    );
-}
-}
-}
-#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    template
-    void setData(
-        in Storage_t data,
-        in uint levelInvocationIndex,
-        in uint localWorkgroupIndex,
-        in uint treeLevel,
-        in uint pseudoLevel,
-        in bool inRange
-    );
-}
-}
-}
-#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_
-#endif
-
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl
deleted file mode 100644
index 450368475d..0000000000
--- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl
+++ /dev/null
@@ -1,221 +0,0 @@
-#ifndef _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_
-#define _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_
-
-#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl"
-
-#ifdef __cplusplus
-#define uint uint32_t
-#endif
-
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    struct DefaultSchedulerParameters_t
-    {
-        uint finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1];
-        uint cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS];
-
-    };
-}
-}
-}
-
-#ifdef __cplusplus
-#undef uint
-#else
-
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-namespace scheduler
-{
-    /**
-     * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation
-     * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions
-     * that are used for synchronization etc.
-     * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is
-     * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Kogge-Stone adder). The result of
-     * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and
-     * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. The next level of the Blelloch Scan will
-     * have an array of size 40. Only a single workgroup will be needed to work on that. After that array is scanned, we use the results
-     * in the downsweep phase of Blelloch Scan.
-     * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount.
-     * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase.
-     * The intermediate results are stored in a scratch buffer. That buffer's size is the sum of the element-array size for all the
-     * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40.
-     *
-     * Parameter meaning:
-     * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer
-     * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize)
-     * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. It is used when storing the REDUCTION result of each workgroup scan
-     * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep)
-     * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps
-     * synchronizing between workgroups with while-loop spinning.
-     */
-    void computeParameters(in uint elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams)
-    {
-#define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u)
-        _scanParams.lastElement[0] = elementCount-1u;
-        _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0])/_NBL_HLSL_WORKGROUP_SIZE_LOG2_;
-        // REVIEW: _NBL_HLSL_WORKGROUP_SIZE_LOG2_ is defined in files that include THIS file. Why not query the API for workgroup size at runtime?
-
-        for (uint i=0; i>_NBL_HLSL_WORKGROUP_SIZE_LOG2_;
-            i = next;
-        }
-        _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0);
-        _schedulerParams.finishedFlagOffset[0] = 0u;
-        switch(_scanParams.topLevel)
-        {
-            case 1u:
-                _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u;
-                _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0);
-                // climb up
-                _schedulerParams.finishedFlagOffset[1] = 1u;
-
-                _scanParams.temporaryStorageOffset[0] = 2u;
-                break;
-            case 2u:
-                _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1);
-                _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u;
-                _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1);
-                _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0);
-                // climb up
-                _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1);
-                _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u;
-                // climb down
-                _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u;
-
-                _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1);
-                _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0);
-                break;
-            case 3u:
-                _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1);
-                _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2);
-                _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u;
-                _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2);
-                _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1);
-                _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0);
-                // climb up
-                _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1);
-                _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2);
-                _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u;
-                // climb down
-                _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u;
-                _schedulerParams.finishedFlagOffset[5] = _schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2);
-
-                _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1);
-                _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0);
-                _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1);
-                break;
-            default:
-                break;
-#if NBL_BUILTIN_MAX_SCAN_LEVELS>7
-#error "Switch needs more cases"
-#endif
-        }
-#undef WorkgroupCount
-    }
-
-    /**
-     * treeLevel - the current level in the Blelloch Scan
-     * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch.
-     * For example, if we have dispatched 10 workgroups and the virtual workgroup number is 35, then the localWorkgroupIndex should be 5.
-     */
-    template
-    bool getWork(in DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex)
-    {
-        ScratchAccessor sharedScratch;
-        if(SubgroupContiguousIndex() == 0u)
-        {
-            uint64_t original;
-            InterlockedAdd(scanScratch.workgroupsStarted, 1u, original); // REVIEW: Refactor InterlockedAdd with GLSL terminology? // TODO (PentaKon): Refactor this when the ScanScratch descriptor set is declared
-            sharedScratch.set(SubgroupContiguousIndex(), original);
-        }
-        else if (SubgroupContiguousIndex() == 1u)
-        {
-            sharedScratch.set(SubgroupContiguousIndex(), 0u);
-        }
-        GroupMemoryBarrierWithGroupSync(); // REVIEW: refactor this somewhere with GLSL terminology?
-
-        const uint globalWorkgroupIndex; // does every thread need to know?
-        sharedScratch.get(0u, globalWorkgroupIndex);
-        const uint lastLevel = topLevel<<1u;
-        if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()])
-        {
-            InterlockedAdd(sharedScratch.get(1u, ?), 1u); // REVIEW: The way scratchaccessoradaptor is implemented (e.g. under subgroup/arithmetic_portability) doesn't allow for atomic ops on the scratch buffer. Should we ask for another implementation that overrides the [] operator ?
-        }
-        GroupMemoryBarrierWithGroupSync(); // TODO (PentaKon): Possibly refactor?
-
-        sharedScratch.get(1u, treeLevel);
-        if(treeLevel>lastLevel)
-            return true;
-
-        localWorkgroupIndex = globalWorkgroupIndex;
-        const bool dependentLevel = treeLevel != 0u;
-        if(dependentLevel)
-        {
-            const uint prevLevel = treeLevel - 1u;
-            localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel];
-            if(SubgroupContiguousIndex() == 0u)
-            {
-                uint dependentsCount = 1u;
-                if(treeLevel <= topLevel)
-                {
-                    dependentsCount = _NBL_HLSL_WORKGROUP_SIZE_; // REVIEW: Defined in the files that include this file?
-                    const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel];
-                    if (lastWorkgroup)
-                    {
-                        const Parameters_t scanParams = getParameters(); // TODO (PentaKon): Undeclared as of now, this should return the Parameters_t from the push constants of (in)direct shader
-                        dependentsCount = scanParams.lastElement[treeLevel]+1u;
-                        if (treeLeveltopLevel) // !(prevLevel globallycoherent
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl
deleted file mode 100644
index 325a08e3f0..0000000000
--- a/include/nbl/builtin/hlsl/scan/direct.hlsl
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef _NBL_HLSL_WORKGROUP_SIZE_
-#define _NBL_HLSL_WORKGROUP_SIZE_ 256
-#endif
-
-#include "nbl/builtin/hlsl/scan/descriptors.hlsl"
-#include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl"
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl"
-
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-#ifndef _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_
-    cbuffer PC // REVIEW: register and packoffset selection
-    {
-        Parameters_t scanParams;
-        DefaultSchedulerParameters_t schedulerParams;
-    };
-#define _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_
-Parameters_t getParameters()
-{
-    return pc.scanParams;
-}
-#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_
-DefaultSchedulerParameters_t getSchedulerParameters()
-{
-    return pc.schedulerParams;
-}
-#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_
-#endif
-}
-}
-}
-
-#ifndef _NBL_HLSL_MAIN_DEFINED_
-[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)]
-void CSMain()
-{
-    nbl::hlsl::scan::main();
-}
-#define _NBL_HLSL_MAIN_DEFINED_
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/scan/indirect.hlsl b/include/nbl/builtin/hlsl/scan/indirect.hlsl
deleted file mode 100644
index 1191731f65..0000000000
--- a/include/nbl/builtin/hlsl/scan/indirect.hlsl
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _NBL_HLSL_WORKGROUP_SIZE_
-#define _NBL_HLSL_WORKGROUP_SIZE_ 256
-#define _NBL_HLSL_WORKGROUP_SIZE_LOG2_ 8
-#endif
-
-#include "nbl/builtin/hlsl/scan/descriptors.hlsl"
-#include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl"
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl"
-
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_
-Parameters_t scanParams;
-Parameters_t getParameters()
-{
-    return scanParams;
-}
-#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_
-#endif
-
-uint getIndirectElementCount();
-
-#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_
-DefaultSchedulerParameters_t schedulerParams;
-DefaultSchedulerParameters_t getSchedulerParameters()
-{
-    scheduler::computeParameters(getIndirectElementCount(),scanParams,schedulerParams);
-    return schedulerParams;
-}
-#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_
-#endif
-}
-}
-}
-
-#ifndef _NBL_HLSL_MAIN_DEFINED_
-[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)]
-void CSMain()
-{
-    if (bool(nbl::hlsl::scan::getIndirectElementCount()))
-        nbl::hlsl::scan::main();
-}
-#define _NBL_HLSL_MAIN_DEFINED_
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl b/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl
deleted file mode 100644
index bfeba13be2..0000000000
--- a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_
-#define _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_
-
-#define NBL_BUILTIN_MAX_SCAN_LEVELS 7
-
-#ifdef __cplusplus
-#define uint uint32_t
-#endif
-
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules)
-    struct Parameters_t {
-        uint lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1];
-        uint topLevel;
-        uint temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2];
-    }
-}
-}
-}
-
-#ifdef __cplusplus
-#undef uint
-#endif
-
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl
deleted file mode 100644
index 488bf29012..0000000000
--- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_
-#define _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_
-
-// TODO (PentaKon): Decide if these are needed once we have a clearer picture of the refactor
-#include "nbl/builtin/hlsl/limits/numeric.hlsl"
-#include "nbl/builtin/hlsl/math/typeless_arithmetic.hlsl"
-#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined
-#include "nbl/builtin/hlsl/scan/declarations.hlsl"
-
-#include "nbl/builtin/hlsl/binops.hlsl"
-
-#if 0
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    template
-    void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex)
-    {
-        const Parameters_t params = getParameters();
-        const uint levelInvocationIndex = localWorkgroupIndex * _NBL_HLSL_WORKGROUP_SIZE_ + SubgroupContiguousIndex();
-        const bool lastInvocationInGroup = SubgroupContiguousIndex() == (_NBL_HLSL_WORKGROUP_SIZE_ - 1);
-
-        const uint lastLevel = params.topLevel << 1u;
-        const uint pseudoLevel = levelInvocationIndex <= params.lastElement[pseudoLevel];
-
-        const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel];
-
-        Storage_t data = Binop::identity();
-        if(inRange)
-        {
-            getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel);
-        }
-
-        if(treeLevel < params.topLevel)
-        {
-            #error "Must also define some scratch accessor when calling operation()"
-            data = workgroup::reduction()(data);
-        }
-        // REVIEW: missing _TYPE_ check and extra case here
-        else if (treeLevel != params.topLevel)
-        {
-            data = workgroup::inclusive_scan()(data);
-        }
-        else
-        {
-            data = workgroup::exclusive_scan()(data);
-        }
-        setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange);
-    }
-}
-}
-}
-
-#ifndef _NBL_HLSL_SCAN_MAIN_DEFINED_ // TODO REVIEW: Are these needed, can this logic be refactored?
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl"
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components
-    void main()
-    {
-        const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters();
-        const uint topLevel = getParameters().topLevel;
-        // persistent workgroups
-        while (true)
-        {
-            uint treeLevel,localWorkgroupIndex;
-            if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex))
-            {
-                return;
-            }
-
-            virtualWorkgroup(treeLevel,localWorkgroupIndex);
-
-            scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex);
-        }
-    }
-}
-}
-}
-#endif
-
-#define _NBL_HLSL_SCAN_MAIN_DEFINED_
-#endif
-
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
index 4885fc11f8..c7a3694d3e 100644
--- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
+++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
@@ -180,6 +180,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicAnd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint32_t> || is_same_v<T, int32_t>), T> atomicAnd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicAnd)]]
+enable_if_t<is_same_v<T, uint64_t> || is_same_v<T, int64_t>, T> atomicAnd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicAnd)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint64_t> || is_same_v<T, int64_t>), T> atomicAnd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicOr)]]
 enable_if_t<is_same_v<T, uint32_t> || is_same_v<T, int32_t>, T> atomicOr([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -188,6 +198,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicOr)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint32_t> || is_same_v<T, int32_t>), T> atomicOr(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicOr)]]
+enable_if_t<is_same_v<T, uint64_t> || is_same_v<T, int64_t>, T> atomicOr([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicOr)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint64_t> || is_same_v<T, int64_t>), T> atomicOr(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicXor)]]
 enable_if_t<is_same_v<T, uint32_t> || is_same_v<T, int32_t>, T> atomicXor([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -196,6 +216,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicXor)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint32_t> || is_same_v<T, int32_t>), T> atomicXor(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicXor)]]
+enable_if_t<is_same_v<T, uint64_t> || is_same_v<T, int64_t>, T> atomicXor([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicXor)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint64_t> || is_same_v<T, int64_t>), T> atomicXor(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename Signed>
 [[vk::ext_instruction( spv::OpAtomicSMin )]]
 enable_if_t<is_same_v<Signed, int32_t>, Signed> atomicSMin([[vk::ext_reference]] int32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
@@ -204,6 +234,16 @@ template<typename Signed, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicSMin)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed, int32_t>, Signed> atomicSMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
 
+template<typename Signed>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicSMin )]]
+enable_if_t<is_same_v<Signed, int64_t>, Signed> atomicSMin([[vk::ext_reference]] int32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
+template<typename Signed, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicSMin)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed, int64_t>, Signed> atomicSMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
 template<typename Unsigned>
 [[vk::ext_instruction( spv::OpAtomicUMin )]]
 enable_if_t<is_same_v<Unsigned, uint32_t>, Unsigned> atomicUMin([[vk::ext_reference]] Unsigned ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
@@ -212,6 +252,16 @@ template<typename Unsigned, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicUMin)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned, uint32_t>, Unsigned> atomicUMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
 
+template<typename Unsigned>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicUMin )]]
+enable_if_t<is_same_v<Unsigned, uint64_t>, Unsigned> atomicUMin([[vk::ext_reference]] Unsigned ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
+template<typename Unsigned, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicUMin)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned, uint64_t>, Unsigned> atomicUMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
 template<typename Signed>
 [[vk::ext_instruction( spv::OpAtomicSMax )]]
 enable_if_t<is_same_v<Signed, int32_t>, Signed> atomicSMax([[vk::ext_reference]] Signed ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
@@ -220,6 +270,16 @@ template<typename Signed, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicSMax)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed, int32_t>, Signed> atomicSMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
 
+template<typename Signed>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicSMax )]]
+enable_if_t<is_same_v<Signed, int64_t>, Signed> atomicSMax([[vk::ext_reference]] Signed ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
+template<typename Signed, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicSMax)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed, int64_t>, Signed> atomicSMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
 template<typename Unsigned>
 [[vk::ext_instruction( spv::OpAtomicUMax )]]
 enable_if_t<is_same_v<Unsigned, uint32_t>, Unsigned> atomicUMax([[vk::ext_reference]] uint32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
@@ -228,6 +288,16 @@ template<typename Unsigned, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicUMax)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned, uint32_t>, Unsigned> atomicUMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
 
+template<typename Unsigned>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicUMax )]]
+enable_if_t<is_same_v<Unsigned, uint64_t>, Unsigned> atomicUMax([[vk::ext_reference]] uint32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
+template<typename Unsigned, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicUMax)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned, uint64_t>, Unsigned> atomicUMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicExchange)]]
 T atomicExchange([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -244,6 +314,22 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicCompareExchange)]]
 enable_if_t<is_spirv_type_v<Ptr_T>, T> atomicCompareExchange(Ptr_T ptr, uint32_t memoryScope, uint32_t memSemanticsEqual, uint32_t memSemanticsUnequal, T value, T comparator);
 
+template<typename T>
+[[vk::ext_instruction(spv::OpAtomicLoad)]]
+enable_if_t<!is_pointer_v<T>, T> atomicLoad([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_instruction(spv::OpAtomicLoad)]]
+enable_if_t<is_spirv_type_v<Ptr_T>, T> atomicLoad(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics);
+
+template<typename T>
+[[vk::ext_instruction(spv::OpAtomicStore)]]
+enable_if_t<!is_pointer_v<T>, void> atomicStore([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_instruction(spv::OpAtomicStore)]]
+enable_if_t<is_spirv_type_v<Ptr_T>, void> atomicStore(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template
 __NBL_CAPABILITY_PhysicalStorageBufferAddresses
diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
index 62a9fb7bef..045ecbde51 100644
--- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
+++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
@@ -4,12 +4,10 @@
 #ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_
 #define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_
 
-
 #include "nbl/builtin/hlsl/functional.hlsl"
 #include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl"
 
-
 namespace nbl
 {
 namespace hlsl
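Usage note (reviewer sketch, not part of the diff): the new scan::reduction wrapper is meant to be instantiated from a compute shader roughly as below. The accessor types, the chosen configuration values and the entry point are illustrative assumptions only; the real accessor implementations live in the examples_tests submodule and the concept requirements on them come from workgroup_arithmetic.hlsl, not from this patch.

    // illustrative only: MyDataAccessor/MyOutputAccessor/MyStatusAccessor/MyScratchAccessor are
    // hypothetical implementations of the accessor interfaces that __call() expects
    #include "nbl/builtin/hlsl/scan/arithmetic.hlsl"

    using namespace nbl::hlsl;

    // WorkgroupSizeLog2=8 (256 invocations), SubgroupSizeLog2=5 (32 lanes), 1 item per invocation
    using config_t = scan::ScanConfiguration<8, 5, 1>;
    using binop_t = plus<uint32_t>;

    [numthreads(256, 1, 1)] // must match config_t::WorkgroupSize
    void main()
    {
        MyDataAccessor data;        // reads this workgroup's slice of the input
        MyOutputAccessor output;    // wraps the BDA pointer the atomics accumulate into
        MyStatusAccessor status;    // wraps the BDA pointer of the "workgroups finished" counter
        MyScratchAccessor scratch;  // groupshared array of config_t::SharedScratchElementCount elements

        // false = no forward-progress guarantees assumed; last argument is the device_capabilities struct
        scan::reduction<config_t, binop_t, false, void>::__call(data, output, status, scratch);
    }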