From c23050c1f3d7ffb9c7a9d351b4d47199580d71dd Mon Sep 17 00:00:00 2001
From: keptsecret
Date: Mon, 26 May 2025 11:42:58 +0700
Subject: [PATCH 01/14] removed redundant includes

---
 include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
index e4a71bdffc..9f62743c1a 100644
--- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
+++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
@@ -4,14 +4,10 @@
 #ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_
 #define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_
 
-
 #include "nbl/builtin/hlsl/functional.hlsl"
-#include "nbl/builtin/hlsl/workgroup/ballot.hlsl"
-#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
 #include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl"
 
-
 namespace nbl
 {
 namespace hlsl

From 0ccd13f00be22abd846c05a427f2a38bfa02c5e3 Mon Sep 17 00:00:00 2001
From: keptsecret
Date: Mon, 26 May 2025 15:06:45 +0700
Subject: [PATCH 02/14] added atomic store, load; int64 specializations for
 the others

---
 .../builtin/hlsl/spirv_intrinsics/core.hlsl | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
index 4885fc11f8..167c2fe5c7 100644
--- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
+++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
@@ -180,6 +180,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicAnd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicAnd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicAnd)]]
+enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicAnd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicAnd)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicAnd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicOr)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicOr([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -188,6 +198,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicOr)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicOr(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicOr)]]
+enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicOr([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicOr)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicOr(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicXor)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicXor([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -196,6 +216,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicXor)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicXor(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicXor)]]
+enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicXor([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicXor)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicXor(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename Signed>
 [[vk::ext_instruction( spv::OpAtomicSMin )]]
 enable_if_t<is_same_v<Signed,int32_t>, Signed> atomicSMin([[vk::ext_reference]] int32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
@@ -204,6 +234,16 @@ template<typename Signed, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicSMin)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed,int32_t>, Signed> atomicSMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
 
+template<typename Signed>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicSMin )]]
+enable_if_t<is_same_v<Signed,int64_t>, Signed> atomicSMin([[vk::ext_reference]] Signed ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
+template<typename Signed, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicSMin)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed,int64_t>, Signed> atomicSMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
 template<typename Unsigned>
 [[vk::ext_instruction( spv::OpAtomicUMin )]]
 enable_if_t<is_same_v<Unsigned,uint32_t>, Unsigned> atomicUMin([[vk::ext_reference]] Unsigned ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
@@ -212,6 +252,16 @@ template<typename Unsigned, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicUMin)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned,uint32_t>, Unsigned> atomicUMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
 
+template<typename Unsigned>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicUMin )]]
+enable_if_t<is_same_v<Unsigned,uint64_t>, Unsigned> atomicUMin([[vk::ext_reference]] Unsigned ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
+template<typename Unsigned, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicUMin)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned,uint64_t>, Unsigned> atomicUMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
 template<typename Signed>
 [[vk::ext_instruction( spv::OpAtomicSMax )]]
 enable_if_t<is_same_v<Signed,int32_t>, Signed> atomicSMax([[vk::ext_reference]] Signed ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
@@ -220,6 +270,16 @@ template<typename Signed, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicSMax)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed,int32_t>, Signed> atomicSMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
 
+template<typename Signed>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicSMax )]]
+enable_if_t<is_same_v<Signed,int64_t>, Signed> atomicSMax([[vk::ext_reference]] Signed ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
+template<typename Signed, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicSMax)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed,int64_t>, Signed> atomicSMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
 template<typename Unsigned>
 [[vk::ext_instruction( spv::OpAtomicUMax )]]
 enable_if_t<is_same_v<Unsigned,uint32_t>, Unsigned> atomicUMax([[vk::ext_reference]] uint32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
@@ -228,6 +288,16 @@ template<typename Unsigned, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicUMax)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned,uint32_t>, Unsigned> atomicUMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
 
+template<typename Unsigned>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicUMax )]]
+enable_if_t<is_same_v<Unsigned,uint64_t>, Unsigned> atomicUMax([[vk::ext_reference]] Unsigned ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
+template<typename Unsigned, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicUMax)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned,uint64_t>, Unsigned> atomicUMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicExchange)]]
 T atomicExchange([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -244,6 +314,22 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicCompareExchange)]]
 enable_if_t<is_spirv_type_v<Ptr_T>, T> atomicCompareExchange(Ptr_T ptr, uint32_t memoryScope, uint32_t memSemanticsEqual, uint32_t memSemanticsUnequal, T value, T comparator);
 
+template<typename T>
+[[vk::ext_instruction(spv::OpAtomicLoad)]]
+T atomicLoad([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_instruction(spv::OpAtomicLoad)]]
+enable_if_t<is_spirv_type_v<Ptr_T>, T> atomicLoad(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics);
+
+template<typename T>
+[[vk::ext_instruction(spv::OpAtomicStore)]]
+void atomicStore([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_instruction(spv::OpAtomicStore)]]
+enable_if_t<is_spirv_type_v<Ptr_T>, void> atomicStore(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
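+// Illustrative usage sketch only (`slot` is a placeholder for any SPIR-V pointer to a
+// device-visible uint32_t, e.g. obtained via bda::__ptr<uint32_t>::deref().__get_spv_ptr()):
+//   uint32_t seen = spirv::atomicLoad(slot, spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
+//   spirv::atomicStore(slot, spv::ScopeDevice, spv::MemorySemanticsReleaseMask, seen+1u);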
+
 template
 __NBL_CAPABILITY_PhysicalStorageBufferAddresses

From 7e1b0c31b96b01527563fe5b3bf6cdf85fa72bf4 Mon Sep 17 00:00:00 2001
From: keptsecret
Date: Mon, 26 May 2025 16:39:18 +0700
Subject: [PATCH 03/14] removed unused files in hlsl/scan

---
 .../nbl/builtin/hlsl/scan/declarations.hlsl   |  66 ------
 .../builtin/hlsl/scan/default_scheduler.hlsl  | 221 ------------------
 .../nbl/builtin/hlsl/scan/descriptors.hlsl    |   3 -
 include/nbl/builtin/hlsl/scan/direct.hlsl     |  50 ----
 include/nbl/builtin/hlsl/scan/indirect.hlsl   |  48 ----
 .../builtin/hlsl/scan/parameters_struct.hlsl  |  30 ---
 .../builtin/hlsl/scan/virtual_workgroup.hlsl  |  92 --------
 7 files changed, 510 deletions(-)
 delete mode 100644 include/nbl/builtin/hlsl/scan/declarations.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/default_scheduler.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/descriptors.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/direct.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/indirect.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/parameters_struct.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl

diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl
deleted file mode 100644
index 2d2e66e66d..0000000000
--- a/include/nbl/builtin/hlsl/scan/declarations.hlsl
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_
-#define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_
-
-// REVIEW: Not sure if this file is needed in HLSL implementation
-
-#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl"
-
-
-#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-	Parameters_t getParameters();
-}
-}
-}
-#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-	template<class Storage_t>
-	void getData(
-		inout Storage_t data,
-		in uint
levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel - ); -} -} -} -#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -#endif - -#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - template - void setData( - in Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel, - in bool inRange - ); -} -} -} -#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -#endif - -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl deleted file mode 100644 index 450368475d..0000000000 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ /dev/null @@ -1,221 +0,0 @@ -#ifndef _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ -#define _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ - -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - struct DefaultSchedulerParameters_t - { - uint finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; - uint cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; - - }; -} -} -} - -#ifdef __cplusplus -#undef uint -#else - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ -namespace scheduler -{ - /** - * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation - * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions - * that are used for synchronization etc. - * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is - * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Stone-Kogge adder). The result of - * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and - * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. The next level of the Blelloch Scan will - * have an array of size 40. Only a single workgroup will be needed to work on that. After that array is scanned, we use the results - * in the downsweep phase of Blelloch Scan. - * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. - * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. - * The intermediate results are stored in a scratch buffer. That buffer's size is is the sum of the element-array size for all the - * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40. - * - * Parameter meaning: - * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer - * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize) - * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. 
It is used when storing the REDUCTION result of each workgroup scan - * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep) - * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps - * synchronizing between workgroups with while-loop spinning. - */ - void computeParameters(in uint elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) - { -#define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u) - _scanParams.lastElement[0] = elementCount-1u; - _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0])/_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - // REVIEW: _NBL_HLSL_WORKGROUP_SIZE_LOG2_ is defined in files that include THIS file. Why not query the API for workgroup size at runtime? - - for (uint i=0; i>_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - i = next; - } - _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); - _schedulerParams.finishedFlagOffset[0] = 0u; - switch(_scanParams.topLevel) - { - case 1u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u; - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = 1u; - - _scanParams.temporaryStorageOffset[0] = 2u; - break; - case 2u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; - _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; - // climb down - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - break; - case 3u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u; - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2); - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u; - // climb down - _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u; - _schedulerParams.finishedFlagOffset[5] = 
_schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2); - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1); - break; - default: - break; -#if NBL_BUILTIN_MAX_SCAN_LEVELS>7 -#error "Switch needs more cases" -#endif - } -#undef WorkgroupCount - } - - /** - * treeLevel - the current level in the Blelloch Scan - * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. - * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. - */ - template - bool getWork(in DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex) - { - ScratchAccessor sharedScratch; - if(SubgroupContiguousIndex() == 0u) - { - uint64_t original; - InterlockedAdd(scanScratch.workgroupsStarted, 1u, original); // REVIEW: Refactor InterlockedAdd with GLSL terminology? // TODO (PentaKon): Refactor this when the ScanScratch descriptor set is declared - sharedScratch.set(SubgroupContiguousIndex(), original); - } - else if (SubgroupContiguousIndex() == 1u) - { - sharedScratch.set(SubgroupContiguousIndex(), 0u); - } - GroupMemoryBarrierWithGroupSync(); // REVIEW: refactor this somewhere with GLSL terminology? - - const uint globalWorkgroupIndex; // does every thread need to know? - sharedScratch.get(0u, globalWorkgroupIndex); - const uint lastLevel = topLevel<<1u; - if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()]) - { - InterlockedAdd(sharedScratch.get(1u, ?), 1u); // REVIEW: The way scratchaccessoradaptor is implemented (e.g. under subgroup/arithmetic_portability) doesn't allow for atomic ops on the scratch buffer. Should we ask for another implementation that overrides the [] operator ? - } - GroupMemoryBarrierWithGroupSync(); // TODO (PentaKon): Possibly refactor? - - sharedScratch.get(1u, treeLevel); - if(treeLevel>lastLevel) - return true; - - localWorkgroupIndex = globalWorkgroupIndex; - const bool dependentLevel = treeLevel != 0u; - if(dependentLevel) - { - const uint prevLevel = treeLevel - 1u; - localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; - if(SubgroupContiguousIndex() == 0u) - { - uint dependentsCount = 1u; - if(treeLevel <= topLevel) - { - dependentsCount = _NBL_HLSL_WORKGROUP_SIZE_; // REVIEW: Defined in the files that include this file? 
- const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; - if (lastWorkgroup) - { - const Parameters_t scanParams = getParameters(); // TODO (PentaKon): Undeclared as of now, this should return the Parameters_t from the push constants of (in)direct shader - dependentsCount = scanParams.lastElement[treeLevel]+1u; - if (treeLeveltopLevel) // !(prevLevel globallycoherent \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl deleted file mode 100644 index 325a08e3f0..0000000000 --- a/include/nbl/builtin/hlsl/scan/direct.hlsl +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef _NBL_HLSL_WORKGROUP_SIZE_ -#define _NBL_HLSL_WORKGROUP_SIZE_ 256 -#endif - -#include "nbl/builtin/hlsl/scan/descriptors.hlsl" -#include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl" -#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ -#ifndef _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_ - cbuffer PC // REVIEW: register and packoffset selection - { - Parameters_t scanParams; - DefaultSchedulerParameters_t schedulerParams; - }; -#define _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_ -#endif - -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -Parameters_t getParameters() -{ - return pc.scanParams; -} -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -#endif - -#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -DefaultSchedulerParameters_t getSchedulerParameters() -{ - return pc.schedulerParams; -} -#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -#endif -} -} -} - -#ifndef _NBL_HLSL_MAIN_DEFINED_ -[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)] -void CSMain() -{ - nbl::hlsl::scan::main(); -} -#define _NBL_HLSL_MAIN_DEFINED_ -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/indirect.hlsl b/include/nbl/builtin/hlsl/scan/indirect.hlsl deleted file mode 100644 index 1191731f65..0000000000 --- a/include/nbl/builtin/hlsl/scan/indirect.hlsl +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef _NBL_HLSL_WORKGROUP_SIZE_ -#define _NBL_HLSL_WORKGROUP_SIZE_ 256 -#define _NBL_HLSL_WORKGROUP_SIZE_LOG2_ 8 -#endif - -#include "nbl/builtin/hlsl/scan/descriptors.hlsl" -#include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl" -#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -Parameters_t scanParams; -Parameters_t getParameters() -{ - return scanParams; -} -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_ -#endif - -uint getIndirectElementCount(); - -#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -DefaultSchedulerParameters_t schedulerParams; -DefaultSchedulerParameters_t getSchedulerParameters() -{ - scheduler::computeParameters(getIndirectElementCount(),scanParams,schedulerParams); - return schedulerParams; -} -#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_ -#endif -} -} -} - -#ifndef _NBL_HLSL_MAIN_DEFINED_ -[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)] -void CSMain() -{ - if (bool(nbl::hlsl::scan::getIndirectElementCount())) - nbl::hlsl::scan::main(); -} -#define _NBL_HLSL_MAIN_DEFINED_ -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl b/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl deleted file mode 100644 index bfeba13be2..0000000000 --- a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_ 
-#define _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_ - -#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) - struct Parameters_t { - uint lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; - uint topLevel; - uint temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; - } -} -} -} - -#ifdef __cplusplus -#undef uint -#endif - -#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl deleted file mode 100644 index 488bf29012..0000000000 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ /dev/null @@ -1,92 +0,0 @@ -#ifndef _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_ -#define _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_ - -// TODO (PentaKon): Decide if these are needed once we have a clearer picture of the refactor -#include "nbl/builtin/hlsl/limits/numeric.hlsl" -#include "nbl/builtin/hlsl/math/typeless_arithmetic.hlsl" -#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined -#include "nbl/builtin/hlsl/scan/declarations.hlsl" - -#include "nbl/builtin/hlsl/binops.hlsl" - -#if 0 -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - template - void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex) - { - const Parameters_t params = getParameters(); - const uint levelInvocationIndex = localWorkgroupIndex * _NBL_HLSL_WORKGROUP_SIZE_ + SubgroupContiguousIndex(); - const bool lastInvocationInGroup = SubgroupContiguousIndex() == (_NBL_HLSL_WORKGROUP_SIZE_ - 1); - - const uint lastLevel = params.topLevel << 1u; - const uint pseudoLevel = levelInvocationIndex <= params.lastElement[pseudoLevel]; - - const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; - - Storage_t data = Binop::identity(); - if(inRange) - { - getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); - } - - if(treeLevel < params.topLevel) - { - #error "Must also define some scratch accessor when calling operation()" - data = workgroup::reduction()(data); - } - // REVIEW: missing _TYPE_ check and extra case here - else if (treeLevel != params.topLevel) - { - data = workgroup::inclusive_scan()(data); - } - else - { - data = workgroup::exclusive_scan()(data); - } - setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); - } -} -} -} - -#ifndef _NBL_HLSL_SCAN_MAIN_DEFINED_ // TODO REVIEW: Are these needed, can this logic be refactored? 
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - void main() - { - const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint topLevel = getParameters().topLevel; - // persistent workgroups - while (true) - { - uint treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) - { - return; - } - - virtualWorkgroup(treeLevel,localWorkgroupIndex); - - scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); - } - } -} -} -} -#endif - -#define _NBL_HLSL_SCAN_MAIN_DEFINED_ -#endif - -#endif \ No newline at end of file From 9666ce474a71ae1deea9ac5e9193aa816de2ff56 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 26 May 2025 16:53:49 +0700 Subject: [PATCH 04/14] initial global reduce impl --- examples_tests | 2 +- include/nbl/builtin/hlsl/scan/arithmetic.hlsl | 34 +++++ .../builtin/hlsl/scan/arithmetic_impl.hlsl | 118 ++++++++++++++++++ 3 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 include/nbl/builtin/hlsl/scan/arithmetic.hlsl create mode 100644 include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl diff --git a/examples_tests b/examples_tests index bb3a901b5d..50647e4803 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit bb3a901b5de72b78246af20072f4489960287204 +Subproject commit 50647e4803afbc2f0ddfd1bed9ba6d5e4e180355 diff --git a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl new file mode 100644 index 0000000000..335271f908 --- /dev/null +++ b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl @@ -0,0 +1,34 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_ + +#include "nbl/builtin/hlsl/scan/arithmetic_impl.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct reduction +{ + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) + static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) // scratch bda? + { + impl::reduce fn; + scalar_t value = fn.template __call(dataAccessor, sharedMemScratchAccessor); + return value; + } +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl new file mode 100644 index 0000000000..949ded773e --- /dev/null +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -0,0 +1,118 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
+
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace scan
+{
+
+template<uint16_t _WorkgroupSizeLog2, uint16_t _SubgroupSizeLog2, uint16_t _ItemsPerInvocation>
+struct ScanConfiguration
+{
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation;
+};
+
+namespace impl
+{
+
+template<typename T> // only uint32_t or uint64_t for now?
+struct Constants
+{
+    NBL_CONSTEXPR_STATIC_INLINE T NOT_READY = 0;
+    NBL_CONSTEXPR_STATIC_INLINE T LOCAL_COUNT = T(0x1u) << (sizeof(T)*8-2);
+    NBL_CONSTEXPR_STATIC_INLINE T GLOBAL_COUNT = T(0x1u) << (sizeof(T)*8-1);
+    NBL_CONSTEXPR_STATIC_INLINE T STATUS_MASK = LOCAL_COUNT | GLOBAL_COUNT;
+};
+
+template<class Config, class BinOp, typename T, bool ForwardProgressGuarantees, class device_capabilities>
+struct reduce
+{
+    using constants_t = Constants<T>;
+    using scalar_t = T;
+    using config_t = Config;
+    using arith_config_t = workgroup2::ArithmeticConfiguration<Config::WorkgroupSizeLog2, Config::SubgroupSizeLog2, Config::ItemsPerInvocation>;
+    using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BinOp, device_capabilities>;
+    using binop_t = BinOp;
+
+    template<class DataAccessor, class ScratchAccessor>
+    scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+    {
+        const scalar_t localReduction = workgroup_reduce_t::__call(dataAccessor, sharedMemScratchAccessor);
+
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == WorkgroupSize-1);
+        if (lastInvocation)
+        {
+            bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkgroupID()).deref();
+            spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
+        }
+
+        scalar_t prefix = scalar_t(0);
+        // decoupled lookback
+        if (ForwardProgressGuarantees)
+        {
+            if (lastInvocation) // don't make whole block work and do busy stuff
+            {
+                for (uint32_t prevID = glsl::gl_WorkgroupID()-1; prevID > 0u; prevID--)
+                {
+                    scalar_t value = scalar_t(0);
+                    {
+                        // spin until something is ready
+                        while (value == constants_t::NOT_READY)
+                        {
+                            bda::__ref<scalar_t> scratchPrev = (scratch-1).deref();
+                            value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
+                        }
+                    }
+                    prefix += value & (~constants_t::STATUS_MASK);
+
+                    // last was actually a global sum, we have the prefix, we can quit
+                    if (value & constants_t::GLOBAL_COUNT)
+                        break;
+                }
+            }
+            prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, WorkgroupSize-1);
+        }
+
+        binop_t binop;
+        scalar_t globalReduction = binop(prefix,localReduction);
+        if (lastInvocation)
+        {
+            bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkgroupID()).deref();
+            spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
+        }
+
+        // get last item from scratch
+        uint32_t lastWorkgroup = glsl::gl_NumWorkgroups() - 1;
+        bda::__ref<scalar_t> scratchLast = (scratch + lastWorkgroup).deref();
+        uint32_t value;
+        {
+            // wait until last workgroup does reduction
+            while (value & constants_t::GLOBAL_COUNT)
+            {
+                value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
+            }
+        }
+        return value & (~constants_t::STATUS_MASK);
+    }
+
+    // bda::_ptr scratch ??
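+    // (Summary of the decoupled-lookback protocol above: every workgroup atomically publishes
+    // its local reduction tagged with LOCAL_COUNT into its scratch slot; the last invocation then
+    // walks the predecessors' slots - spinning while NOT_READY - to accumulate a prefix,
+    // re-publishes the running total tagged with GLOBAL_COUNT, and finally all workgroups wait on
+    // the last slot. The top two bits of T carry the status flags, so reduced values must fit in
+    // the remaining bits.)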
+} + +} + +} +} +} + +#endif From fa7151e96e0907e88463bd14a0ce945cfbcb2165 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 2 Jun 2025 13:53:59 +0700 Subject: [PATCH 05/14] get example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 6581ed496d..5e971c8a18 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 6581ed496d2fc41cae1dc5c9ceba10f3bdfc5135 +Subproject commit 5e971c8a1812922bbf36ecd969fdfb56a0d7d880 From 20d56d87b7d4d377584de87f0cbdce98de212e71 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Jun 2025 10:18:53 +0700 Subject: [PATCH 06/14] fix missing bits in reduce --- examples_tests | 2 +- include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples_tests b/examples_tests index 5e971c8a18..ccb6385c5b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 5e971c8a1812922bbf36ecd969fdfb56a0d7d880 +Subproject commit ccb6385c5b40c87842b8a950497d065262a91288 diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index 949ded773e..e67a4b023c 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -21,6 +21,9 @@ struct ScanConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation; + + using arith_config_t = workgroup2::ArithmeticConfiguration; + NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = arith_config_t::SharedScratchElementCount; }; namespace impl @@ -38,10 +41,10 @@ struct Constants template struct reduce { - using constants_t = Constants; - using scalar_t = T; + using scalar_t = typename BinOp::type_t; + using constants_t = Constants; using config_t = Config; - using arith_config_t = workgroup2::ArithmeticConfiguration; + using arith_config_t = typename Config::arith_config_t; using workgroup_reduce_t = workgroup2::reduction; using binop_t = BinOp; @@ -49,11 +52,12 @@ struct reduce scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) { const scalar_t localReduction = workgroup_reduce_t::__call(dataAccessor, sharedMemScratchAccessor); + bda::__ptr scratch = dataAccessor.getScratchPtr(); // scratch data should be at least T[NumWorkgroups] const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == WorkgroupSize-1); if (lastInvocation) { - bda::__ref scratchId = (scratch + glsl::gl_WorkgroupID()).deref(); + bda::__ref scratchId = (scratch + glsl::gl_WorkgroupID()).deref(); spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT); } @@ -105,8 +109,6 @@ struct reduce } return value & (~constants_t::STATUS_MASK); } - - // bda::_ptr scratch ?? 
} } From 752d943fe04f0a9282009c9c7705c1ac5101f0e1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Jun 2025 16:38:53 +0700 Subject: [PATCH 07/14] bug fixes so shader compiles now, but infinite loop suspected --- examples_tests | 2 +- include/nbl/builtin/hlsl/scan/arithmetic.hlsl | 4 +-- .../builtin/hlsl/scan/arithmetic_impl.hlsl | 33 ++++++++++--------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/examples_tests b/examples_tests index 1710b69862..39d7859d28 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1710b698621796aa767edf7bc940e55e6758c2a8 +Subproject commit 39d7859d2848468f49aef5627bd3f814502a74b5 diff --git a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl index 335271f908..95d7a4c045 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl @@ -18,8 +18,8 @@ struct reduction { using scalar_t = typename BinOp::type_t; - template && ArithmeticSharedMemoryAccessor) - static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) // scratch bda? + template && workgroup2::ArithmeticSharedMemoryAccessor) + static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) { impl::reduce fn; scalar_t value = fn.template __call(dataAccessor, sharedMemScratchAccessor); diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index e67a4b023c..3789d2f35a 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -4,6 +4,7 @@ #ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_ #define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_ +#include "nbl/builtin/hlsl/bda/__ptr.hlsl" #include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" namespace nbl @@ -22,7 +23,7 @@ struct ScanConfiguration NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation; - using arith_config_t = workgroup2::ArithmeticConfiguration; + using arith_config_t = workgroup2::ArithmeticConfiguration; NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = arith_config_t::SharedScratchElementCount; }; @@ -51,13 +52,13 @@ struct reduce template scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) { - const scalar_t localReduction = workgroup_reduce_t::__call(dataAccessor, sharedMemScratchAccessor); - bda::__ptr scratch = dataAccessor.getScratchPtr(); // scratch data should be at least T[NumWorkgroups] + const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor); + bda::__ptr scratch = dataAccessor.getScratchPtr(); // scratch data should be at least T[NumWorkgroups] - const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == WorkgroupSize-1); + const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1); if (lastInvocation) { - bda::__ref scratchId = (scratch + glsl::gl_WorkgroupID()).deref(); + bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref(); spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT); } @@ -67,15 +68,16 @@ struct reduce { if (lastInvocation) // don't 
make whole block work and do busy stuff { - for (uint32_t prevID = glsl::gl_WorkgroupID()-1; prevID > 0u; prevID--) + for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID > 0u; prevID--) { scalar_t value = scalar_t(0); { // spin until something is ready while (value == constants_t::NOT_READY) { - bda::__ref scratchPrev = (scratch-1).deref(); - value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); + bda::__ref scratchPrev = (scratch-1).deref(); + // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); + value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } } prefix += value & (~constants_t::STATUS_MASK); @@ -85,31 +87,32 @@ struct reduce break; } } - prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, WorkgroupSize-1); + prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, Config::WorkgroupSize-1); } binop_t binop; scalar_t globalReduction = binop(prefix,localReduction); if (lastInvocation) { - bda::__ref scratchId = (scratch + glsl::gl_WorkgroupID()).deref(); + bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref(); spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT); } // get last item from scratch - uint32_t lastWorkgroup = glsl::gl_NumWorkgroups() - 1; - bda::__ref scratchLast = (scratch + lastWorkgroup).deref(); - uint32_t value; + uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1; + bda::__ref scratchLast = (scratch + lastWorkgroup).deref(); + scalar_t value; { // wait until last workgroup does reduction while (value & constants_t::GLOBAL_COUNT) { - value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); + // value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); + value = spirv::atomicIAdd(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } } return value & (~constants_t::STATUS_MASK); } -} +}; } From 6461b360747a2d4378b26a2c0c15fbca1780ebde Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 10 Jun 2025 11:37:10 +0700 Subject: [PATCH 08/14] added branch for no forward progress guarantee (no spin wait) --- examples_tests | 2 +- .../builtin/hlsl/scan/arithmetic_impl.hlsl | 48 +++++++++++++++++-- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/examples_tests b/examples_tests index 39d7859d28..de60cc1137 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 39d7859d2848468f49aef5627bd3f814502a74b5 +Subproject commit de60cc1137b3850ca7c7590123467e18898c5e98 diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index 3789d2f35a..00537015c7 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -68,14 +68,16 @@ struct reduce { if (lastInvocation) // don't make whole block work and do busy stuff { - for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID > 0u; prevID--) + bda::__ptr scratchIter = scratch; + for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) { scalar_t value = scalar_t(0); + scratchIter = scratchIter-1; { // spin until something is ready while (value == constants_t::NOT_READY) { - bda::__ref scratchPrev = (scratch-1).deref(); + 
bda::__ref scratchPrev = scratchIter.deref(); // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } @@ -89,6 +91,44 @@ struct reduce } prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, Config::WorkgroupSize-1); } + else + { + bda::__ptr scratchIter = scratch; + for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) + { + scalar_t value = scalar_t(0); + scratchIter = scratchIter-1; + if (lastInvocation) + { + bda::__ref scratchPrev = scratchIter.deref(); + // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); + value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); + } + value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1); + + if (value & constants_t::STATUS_MASK) + { + prefix += value & (~constants_t::STATUS_MASK); + + if (value & constants_t::GLOBAL_COUNT) + break; + } + else // can't wait/spin, have to do it ourselves + { + sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier(); + + DataAccessor prevDataAccessor = DataAccessor::create(prevID); + const scalar_t prevReduction = workgroup_reduce_t::template __call(prevDataAccessor, sharedMemScratchAccessor); + + // if DoAndRaceStore, stores in place of prev workgroup id as well + // bda::__ref scratchPrev = scratchIter.deref(); + // if (lastInvocation) + // spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT); + + prefix += prevReduction; + } + } + } binop_t binop; scalar_t globalReduction = binop(prefix,localReduction); @@ -101,10 +141,10 @@ struct reduce // get last item from scratch uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1; bda::__ref scratchLast = (scratch + lastWorkgroup).deref(); - scalar_t value; + scalar_t value = scalar_t(0); { // wait until last workgroup does reduction - while (value & constants_t::GLOBAL_COUNT) + while (!(value & constants_t::GLOBAL_COUNT)) { // value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); value = spirv::atomicIAdd(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); From e291940c42bd4ec90b10f9180b6f5954001a6389 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 10 Jun 2025 16:31:51 +0700 Subject: [PATCH 09/14] bug fixes to indexing, forward progress guarantee works now --- .../builtin/hlsl/scan/arithmetic_impl.hlsl | 48 ++++++++++++------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index 00537015c7..aae1185a53 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -62,27 +62,39 @@ struct reduce spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT); } + // NOTE: just for testing, remove when done + // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier(); + // uint32_t prev = glsl::gl_WorkGroupID().x==0 ? 
0 : glsl::gl_WorkGroupID().x-1; + // scalar_t testVal = constants_t::NOT_READY; + // if (lastInvocation) + // while (testVal == constants_t::NOT_READY) + // testVal = spirv::atomicIAdd((scratch + prev).deref().__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); + // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier(); + // testVal = workgroup::Broadcast(testVal, sharedMemScratchAccessor, Config::WorkgroupSize-1); + // return testVal; + + binop_t binop; scalar_t prefix = scalar_t(0); // decoupled lookback if (ForwardProgressGuarantees) { if (lastInvocation) // don't make whole block work and do busy stuff { - bda::__ptr scratchIter = scratch; - for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) + // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) // won't run properly this way for some reason, results in device lost + for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++) { - scalar_t value = scalar_t(0); - scratchIter = scratchIter-1; + const uint32_t prevID = glsl::gl_WorkGroupID().x-i; + scalar_t value = constants_t::NOT_READY; { // spin until something is ready while (value == constants_t::NOT_READY) { - bda::__ref scratchPrev = scratchIter.deref(); + bda::__ref scratchPrev = (scratch + prevID).deref(); // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } } - prefix += value & (~constants_t::STATUS_MASK); + prefix = binop(value & (~constants_t::STATUS_MASK), prefix); // last was actually a global sum, we have the prefix, we can quit if (value & constants_t::GLOBAL_COUNT) @@ -93,14 +105,15 @@ struct reduce } else { - bda::__ptr scratchIter = scratch; - for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) + bda::__ptr scratchIter = scratch + glsl::gl_WorkGroupID().x; + // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) + for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++) { + const uint32_t prevID = glsl::gl_WorkGroupID().x-i; scalar_t value = scalar_t(0); - scratchIter = scratchIter-1; if (lastInvocation) { - bda::__ref scratchPrev = scratchIter.deref(); + bda::__ref scratchPrev = (scratch + prevID).deref(); // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } @@ -108,7 +121,7 @@ struct reduce if (value & constants_t::STATUS_MASK) { - prefix += value & (~constants_t::STATUS_MASK); + prefix = binop(value & (~constants_t::STATUS_MASK), prefix); if (value & constants_t::GLOBAL_COUNT) break; @@ -125,23 +138,23 @@ struct reduce // if (lastInvocation) // spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT); - prefix += prevReduction; + prefix = binop(prevReduction, prefix); } } } - binop_t binop; scalar_t globalReduction = binop(prefix,localReduction); if (lastInvocation) { - bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref(); + bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref(); spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT); } // get last item from scratch - uint32_t lastWorkgroup = 
glsl::gl_NumWorkGroups().x - 1; - bda::__ref scratchLast = (scratch + lastWorkgroup).deref(); - scalar_t value = scalar_t(0); + const uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1; + bda::__ref scratchLast = (scratch + lastWorkgroup).deref(); + scalar_t value = constants_t::NOT_READY; + if (lastInvocation) { // wait until last workgroup does reduction while (!(value & constants_t::GLOBAL_COUNT)) @@ -150,6 +163,7 @@ struct reduce value = spirv::atomicIAdd(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } } + value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1); return value & (~constants_t::STATUS_MASK); } }; From 8665fcc1c96f3a3b74dd14e9daff63650ae5efc2 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 11 Jun 2025 11:35:37 +0700 Subject: [PATCH 10/14] fix to without forward progress guarantee, >2 workgroups broken somehow --- examples_tests | 2 +- include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples_tests b/examples_tests index de60cc1137..b9f515c207 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit de60cc1137b3850ca7c7590123467e18898c5e98 +Subproject commit b9f515c20721e57180a584072be56e6d3b6a1301 diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index aae1185a53..d7591da8d6 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -105,7 +105,6 @@ struct reduce } else { - bda::__ptr scratchIter = scratch + glsl::gl_WorkGroupID().x; // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++) { @@ -113,7 +112,7 @@ struct reduce scalar_t value = scalar_t(0); if (lastInvocation) { - bda::__ref scratchPrev = (scratch + prevID).deref(); + bda::__ref scratchPrev = (scratch + prevID).deref(); // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask); value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u); } @@ -131,10 +130,11 @@ struct reduce sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier(); DataAccessor prevDataAccessor = DataAccessor::create(prevID); + prevDataAccessor.begin(); // prepare data accessor if needed (e.g. 
preload) const scalar_t prevReduction = workgroup_reduce_t::template __call(prevDataAccessor, sharedMemScratchAccessor); // if DoAndRaceStore, stores in place of prev workgroup id as well - // bda::__ref scratchPrev = scratchIter.deref(); + // bda::__ref scratchPrev = (scratch + prevID).deref(); // if (lastInvocation) // spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT); From 7cde6200f295fd2aa8b50a07477109c1ea35fd5b Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 11 Jun 2025 16:20:29 +0700 Subject: [PATCH 11/14] fix to atomic load/store intrinsics --- include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl index 167c2fe5c7..c7a3694d3e 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl @@ -316,7 +316,7 @@ enable_if_t, T> atomicCompareExchange(Ptr_T ptr, uint32_t me template [[vk::ext_instruction(spv::OpAtomicLoad)]] -T atomicLoad([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics); +enable_if_t, T> atomicLoad([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics); template // DXC Workaround [[vk::ext_instruction(spv::OpAtomicLoad)]] @@ -324,7 +324,7 @@ enable_if_t, T> atomicLoad(Ptr_T ptr, uint32_t memoryScope, template [[vk::ext_instruction(spv::OpAtomicStore)]] -void atomicStore([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); +enable_if_t, void> atomicStore([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // DXC Workaround [[vk::ext_instruction(spv::OpAtomicStore)]] From e4a8ac26b9cccc9920054a8a7d4c000ec0605000 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 12 Jun 2025 16:40:46 +0700 Subject: [PATCH 12/14] fix global reduction (only plus atm), moved existing to temp scan --- examples_tests | 2 +- include/nbl/builtin/hlsl/scan/arithmetic.hlsl | 7 ++- .../builtin/hlsl/scan/arithmetic_impl.hlsl | 53 +++++++++++-------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/examples_tests b/examples_tests index b9f515c207..794b06704b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit b9f515c20721e57180a584072be56e6d3b6a1301 +Subproject commit 794b06704b611990cc7a6c2dc81d8912db4c747d diff --git a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl index 95d7a4c045..31c596a077 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl @@ -18,12 +18,11 @@ struct reduction { using scalar_t = typename BinOp::type_t; - template && workgroup2::ArithmeticSharedMemoryAccessor) - static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) + template + static void __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) { impl::reduce fn; - scalar_t value = fn.template __call(dataAccessor, sharedMemScratchAccessor); - return value; + fn.template __call(dataAccessor, outputAccessor, statusAccessor, sharedMemScratchAccessor); } }; diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl 
b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl index d7591da8d6..1be2b11b8e 100644 --- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl +++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl @@ -41,6 +41,30 @@ struct Constants template struct reduce +{ + using scalar_t = typename BinOp::type_t; + using arith_config_t = typename Config::arith_config_t; + using workgroup_reduce_t = workgroup2::reduction; + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor) + { + const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor); + + const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1); + if (lastInvocation) + { + // NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180 + // MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently + spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction); + spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u); + } + } +}; + +// TODO: change this to scan, it totally won't work for reduce anyways +template +struct scan { using scalar_t = typename BinOp::type_t; using constants_t = Constants; @@ -59,20 +83,9 @@ struct reduce if (lastInvocation) { bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref(); - spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT); + spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT); } - // NOTE: just for testing, remove when done - // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier(); - // uint32_t prev = glsl::gl_WorkGroupID().x==0 ? 
diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
index d7591da8d6..1be2b11b8e 100644
--- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
+++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
@@ -41,6 +41,30 @@ struct Constants
 
 template
 struct reduce
+{
+    using scalar_t = typename BinOp::type_t;
+    using arith_config_t = typename Config::arith_config_t;
+    using workgroup_reduce_t = workgroup2::reduction;
+
+    template
+    void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+    {
+        const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor);
+
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
+        if (lastInvocation)
+        {
+            // NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180
+            // MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently
+            spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);
+            spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
+        }
+    }
+};
+
+// TODO: change this to scan, it totally won't work for reduce anyways
+template
+struct scan
 {
     using scalar_t = typename BinOp::type_t;
     using constants_t = Constants;
@@ -59,20 +83,9 @@ struct reduce
         if (lastInvocation)
         {
            bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
-            spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
+            spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
         }
 
-        // NOTE: just for testing, remove when done
-        // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();
-        // uint32_t prev = glsl::gl_WorkGroupID().x==0 ? 0 : glsl::gl_WorkGroupID().x-1;
-        // scalar_t testVal = constants_t::NOT_READY;
-        // if (lastInvocation)
-        //     while (testVal == constants_t::NOT_READY)
-        //         testVal = spirv::atomicIAdd((scratch + prev).deref().__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
-        // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();
-        // testVal = workgroup::Broadcast(testVal, sharedMemScratchAccessor, Config::WorkgroupSize-1);
-        // return testVal;
-
         binop_t binop;
         scalar_t prefix = scalar_t(0);
         // decoupled lookback
@@ -90,8 +103,7 @@ struct reduce
                 while (value == constants_t::NOT_READY)
                 {
                     bda::__ref scratchPrev = (scratch + prevID).deref();
-                    // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
-                    value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
+                    value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
                 }
             }
             prefix = binop(value & (~constants_t::STATUS_MASK), prefix);
@@ -113,8 +125,7 @@ struct reduce
             if (lastInvocation)
             {
                 bda::__ref scratchPrev = (scratch + prevID).deref();
-                // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
-                value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
+                value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
             }
             value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
 
@@ -136,18 +147,19 @@ struct reduce
                 // if DoAndRaceStore, stores in place of prev workgroup id as well
                 // bda::__ref scratchPrev = (scratch + prevID).deref();
                 // if (lastInvocation)
-                //     spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT);
+                //     spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT);
                 prefix = binop(prevReduction, prefix);
             }
         }
     }
 
-    scalar_t globalReduction = binop(prefix,localReduction);
+    const scalar_t globalReduction = binop(prefix,localReduction);
+    // TODO globalReduction value changing in following block somehow, double check
     if (lastInvocation)
     {
         bda::__ref scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
-        spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
+        spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
     }
 
     // get last item from scratch
@@ -159,8 +171,7 @@ struct reduce
         // wait until last workgroup does reduction
         while (!(value & constants_t::GLOBAL_COUNT))
         {
-            // value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
-            value = spirv::atomicIAdd(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
+            value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
         }
     }
     value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
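Note: the renamed scan struct keeps the decoupled-lookback scheme (after Merrill and Garland's single-pass prefix scan): every workgroup publishes its partial tagged with LOCAL_COUNT via a Release atomicStore, then the last invocation walks backwards over the predecessors' scratch slots, spinning on Acquire atomicLoads until each slot leaves NOT_READY, folding payloads into a running prefix, and stopping early at any slot tagged GLOBAL_COUNT because that slot already carries a full inclusive prefix. Condensed from the hunks above into one fragment (it omits the workgroup broadcast and the ForwardProgressGuarantees split):

    scalar_t prefix = scalar_t(0);
    uint32_t prevID = glsl::gl_WorkGroupID().x;
    while (prevID != 0u)
    {
        prevID--;
        scalar_t value = constants_t::NOT_READY;
        while (value == constants_t::NOT_READY) // spin until the predecessor publishes
            value = spirv::atomicLoad((scratch + prevID).deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
        prefix = binop(value & (~constants_t::STATUS_MASK), prefix); // strip the status bits
        if (value & constants_t::GLOBAL_COUNT) // inclusive prefix reached, stop the walk
            break;
    }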
From 426fa6b65265086480fc307ea6a391cfdfed1bbf Mon Sep 17 00:00:00 2001
From: keptsecret
Date: Thu, 12 Jun 2025 17:14:54 +0700
Subject: [PATCH 13/14] reduction specializations for other arithmetic ops

---
 examples_tests                             |  2 +-
 .../builtin/hlsl/scan/arithmetic_impl.hlsl | 78 +++++++++++++++++--
 2 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/examples_tests b/examples_tests
index 794b06704b..86c198e67b 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 794b06704b611990cc7a6c2dc81d8912db4c747d
+Subproject commit 86c198e67b5181a3222e390c7062204cd6adca2e
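Note: the next diff replaces the single generic reduce with a forward declaration plus one explicit specialization per binary op, stamped out by a SPECIALIZE(BINOP,ATOMIC_OP) macro that pairs each op with its matching SPIR-V atomic (bit_and with atomicAnd, plus with atomicIAdd, and so on), while multiplies gets a hand-written specialization. As a sanity check, SPECIALIZE(plus,atomicIAdd) would expand to roughly the sketch below; the template parameter lists are reconstructed by hand from the specialization pattern and should be treated as an assumption:

    template<class Config, typename T, bool ForwardProgressGuarantees, class device_capabilities>
    struct reduce<Config, plus<T>, ForwardProgressGuarantees, device_capabilities>
    {
        using scalar_t = T;
        using arith_config_t = typename Config::arith_config_t;
        using workgroup_reduce_t = workgroup2::reduction<arith_config_t, plus<T>, device_capabilities>;

        template<class DataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>
        void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
        {
            // reduce within the workgroup first, then the last invocation publishes
            const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor);
            const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
            if (lastInvocation)
            {
                spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction);
                spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, 1u);
            }
        }
    };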
diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
index 1be2b11b8e..0b041ed09a 100644
--- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
+++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
@@ -39,12 +39,50 @@ struct Constants
     NBL_CONSTEXPR_STATIC_INLINE T STATUS_MASK = LOCAL_COUNT | GLOBAL_COUNT;
 };
 
+// NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180
+// MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently
 template
-struct reduce
+struct reduce;
+
+#define SPECIALIZE(BINOP,ATOMIC_OP) template\
+struct reduce, ForwardProgressGuarantees, device_capabilities>\
+{\
+    using scalar_t = T;\
+    using arith_config_t = typename Config::arith_config_t;\
+    using workgroup_reduce_t = workgroup2::reduction, device_capabilities>;\
+\
+    template\
+    void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)\
+    {\
+        const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor);\
+\
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);\
+        if (lastInvocation)\
+        {\
+            spirv::ATOMIC_OP(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);\
+            spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);\
+        }\
+    }\
+}
+
+SPECIALIZE(bit_and,atomicAnd);
+SPECIALIZE(bit_or,atomicOr);
+SPECIALIZE(bit_xor,atomicXor);
+
+SPECIALIZE(plus,atomicIAdd);
+// there's no atomic multiply so we use a CAS loop
+
+SPECIALIZE(minimum,atomicUMin);
+SPECIALIZE(maximum,atomicUMax);
+
+#undef SPECIALIZE
+
+template
+struct reduce, ForwardProgressGuarantees, device_capabilities>
 {
-    using scalar_t = typename BinOp::type_t;
+    using scalar_t = T;
     using arith_config_t = typename Config::arith_config_t;
-    using workgroup_reduce_t = workgroup2::reduction;
+    using workgroup_reduce_t = workgroup2::reduction, device_capabilities>;
 
     template
     void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
@@ -54,14 +92,42 @@ struct reduce
 
         const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
         if (lastInvocation)
         {
-            // NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180
-            // MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently
-            spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);
+            {
+                scalar_t actual, expected;
+                actual = multiplies::identity;
+                do
+                {
+                    expected = actual;
+                    scalar_t newVal = expected * localReduction;
+                    actual = spirv::atomicCompareExchange(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, spv::MemorySemanticsAcquireMask, newVal, expected);
+                } while (expected != actual);
+            }
             spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
         }
     }
 };
 
+// template
+// struct reduce;
+// {
+//     using scalar_t = typename BinOp::type_t;
+//     using arith_config_t = typename Config::arith_config_t;
+//     using workgroup_reduce_t = workgroup2::reduction;
+
+//     template
+//     void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+//     {
+//         const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor);
+
+//         const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
+//         if (lastInvocation)
+//         {
+//             spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);
+//             spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
+//         }
+//     }
+// };
+
 // TODO: change this to scan, it totally won't work for reduce anyways
 template
 struct scan
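Note on the multiplies specialization: SPIR-V has no atomic multiply, so the diff above emulates one with an atomicCompareExchange loop; it seeds expected with the multiplicative identity, tries to swap in expected*localReduction, and on failure retries against the value the exchange actually observed. The same pattern in stock HLSL for reference, with an invented buffer and function name (a real shader would likely read the current value first rather than seeding with the identity):

    RWStructuredBuffer<uint> gOutput : register(u0); // hypothetical UAV

    void atomicMul(uint slot, uint factor)
    {
        uint expected = gOutput[slot]; // optimistic read of the current value
        uint actual;
        for (;;)
        {
            // swaps in expected*factor only if the slot still holds expected;
            // actual receives whatever the slot held at that moment
            InterlockedCompareExchange(gOutput[slot], expected, expected * factor, actual);
            if (actual == expected)
                break; // our value landed
            expected = actual; // lost the race, retry against the fresh value
        }
    }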
From 57f4559a2057b55c412b0e37825e0835d0570533 Mon Sep 17 00:00:00 2001
From: keptsecret
Date: Fri, 13 Jun 2025 09:55:17 +0700
Subject: [PATCH 14/14] cleaning up reduction

---
 examples_tests                             |  2 +-
 .../builtin/hlsl/scan/arithmetic_impl.hlsl | 21 -------------------
 2 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/examples_tests b/examples_tests
index 86c198e67b..b210d0d867 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 86c198e67b5181a3222e390c7062204cd6adca2e
+Subproject commit b210d0d86781f672f60d256cc56bf3ab078e8715
diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
index 0b041ed09a..a3978df0dc 100644
--- a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
+++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
@@ -107,27 +107,6 @@ struct reduce, ForwardProgressGuarantees, device_capabilit
     }
 };
 
-// template
-// struct reduce;
-// {
-//     using scalar_t = typename BinOp::type_t;
-//     using arith_config_t = typename Config::arith_config_t;
-//     using workgroup_reduce_t = workgroup2::reduction;
-
-//     template
-//     void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
-//     {
-//         const scalar_t localReduction = workgroup_reduce_t::template __call(dataAccessor, sharedMemScratchAccessor);
-
-//         const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
-//         if (lastInvocation)
-//         {
-//             spirv::atomicIAdd(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);
-//             spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
-//         }
-//     }
-// };
-
 // TODO: change this to scan, it totally won't work for reduce anyways
 template
 struct scan
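Note: the producer/consumer handshake throughout this series rests on the Constants status word: each scratch slot packs the payload and a two-bit status into a single 32-bit value, so one atomic makes both visible together. A worked example with hypothetical bit assignments (the real values live in Constants and are not shown in these diffs):

    // assumed layout: two top bits for status, the remaining bits for the payload
    NBL_CONSTEXPR_STATIC_INLINE uint32_t NOT_READY    = 0u;
    NBL_CONSTEXPR_STATIC_INLINE uint32_t LOCAL_COUNT  = 0x40000000u; // partial published
    NBL_CONSTEXPR_STATIC_INLINE uint32_t GLOBAL_COUNT = 0x80000000u; // inclusive prefix published
    NBL_CONSTEXPR_STATIC_INLINE uint32_t STATUS_MASK  = LOCAL_COUNT | GLOBAL_COUNT;

    // publishing a partial result of 42:  42u | LOCAL_COUNT           == 0x4000002A
    // recovering the payload:             0x4000002A & (~STATUS_MASK) == 42
    // checking for an inclusive prefix:   0x4000002A & GLOBAL_COUNT   == 0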