diff --git a/examples_tests b/examples_tests
index 1710b69862..b210d0d867 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 1710b698621796aa767edf7bc940e55e6758c2a8
+Subproject commit b210d0d86781f672f60d256cc56bf3ab078e8715
diff --git a/include/nbl/builtin/hlsl/scan/arithmetic.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl
new file mode 100644
index 0000000000..31c596a077
--- /dev/null
+++ b/include/nbl/builtin/hlsl/scan/arithmetic.hlsl
@@ -0,0 +1,33 @@
+// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_
+
+#include "nbl/builtin/hlsl/scan/arithmetic_impl.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace scan
+{
+
+template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
+struct reduction
+{
+    using scalar_t = typename BinOp::type_t;
+
+    template<class ReadOnlyDataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>
+    static void __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+    {
+        impl::reduce<Config, BinOp, ForwardProgressGuarantees, device_capabilities> fn;
+        fn.template __call<ReadOnlyDataAccessor, OutputAccessor, StatusAccessor, ScratchAccessor>(dataAccessor, outputAccessor, statusAccessor, sharedMemScratchAccessor);
+    }
+};
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
new file mode 100644
index 0000000000..a3978df0dc
--- /dev/null
+++ b/include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
@@ -0,0 +1,233 @@
+// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
+
+#include "nbl/builtin/hlsl/bda/__ptr.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace scan
+{
+
+template<uint16_t _WorkgroupSizeLog2, uint16_t _SubgroupSizeLog2, uint16_t _ItemsPerInvocation>
+struct ScanConfiguration
+{
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation;
+
+    using arith_config_t = workgroup2::ArithmeticConfiguration<WorkgroupSizeLog2, SubgroupSizeLog2, ItemsPerInvocation>;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = arith_config_t::SharedScratchElementCount;
+};
+
+namespace impl
+{
+
+template<typename T>    // only uint32_t or uint64_t for now?
+struct Constants
+{
+    NBL_CONSTEXPR_STATIC_INLINE T NOT_READY = 0;
+    NBL_CONSTEXPR_STATIC_INLINE T LOCAL_COUNT = T(0x1u) << (sizeof(T)*8-2);
+    NBL_CONSTEXPR_STATIC_INLINE T GLOBAL_COUNT = T(0x1u) << (sizeof(T)*8-1);
+    NBL_CONSTEXPR_STATIC_INLINE T STATUS_MASK = LOCAL_COUNT | GLOBAL_COUNT;
+};
+
+// NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180
+// MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently
+template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
+struct reduce;
+
+#define SPECIALIZE(BINOP,ATOMIC_OP) template<class Config, typename T, bool ForwardProgressGuarantees, class device_capabilities>\
+struct reduce<Config, BINOP<T>, ForwardProgressGuarantees, device_capabilities>\
+{\
+    using scalar_t = T;\
+    using arith_config_t = typename Config::arith_config_t;\
+    using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BINOP<scalar_t>, device_capabilities>;\
+\
+    template<class DataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>\
+    void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)\
+    {\
+        const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);\
+\
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);\
+        if (lastInvocation)\
+        {\
+            spirv::ATOMIC_OP(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);\
+            spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);\
+        }\
+    }\
+}
+
+SPECIALIZE(bit_and,atomicAnd);
+SPECIALIZE(bit_or,atomicOr);
+SPECIALIZE(bit_xor,atomicXor);
+
+SPECIALIZE(plus,atomicIAdd);
+// there's no atomic multiply so we use a CAS loop
+
+SPECIALIZE(minimum,atomicUMin);
+SPECIALIZE(maximum,atomicUMax);
+
+#undef SPECIALIZE
+
+template<class Config, typename T, bool ForwardProgressGuarantees, class device_capabilities>
+struct reduce<Config, multiplies<T>, ForwardProgressGuarantees, device_capabilities>
+{
+    using scalar_t = T;
+    using arith_config_t = typename Config::arith_config_t;
+    using workgroup_reduce_t = workgroup2::reduction<arith_config_t, multiplies<scalar_t>, device_capabilities>;
+
+    template<class DataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>
+    void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+    {
+        const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);
+
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
+        if (lastInvocation)
+        {
+            {
+                scalar_t actual, expected;
+                actual = multiplies<scalar_t>::identity;
+                do
+                {
+                    expected = actual;
+                    scalar_t newVal = expected * localReduction;
+                    actual = spirv::atomicCompareExchange(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, spv::MemorySemanticsAcquireMask, newVal, expected);
+                } while (expected != actual);
+            }
+            spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
+        }
+    }
+};
+
+// TODO: change this to scan, it totally won't work for reduce anyways
+template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
+struct scan
+{
+    using scalar_t = typename BinOp::type_t;
+    using constants_t = Constants<scalar_t>;
+    using config_t = Config;
+    using arith_config_t = typename Config::arith_config_t;
+    using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BinOp, device_capabilities>;
+    using binop_t = BinOp;
+
+    template<class DataAccessor, class ScratchAccessor>
+    scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
+    {
+        const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);
+        bda::__ptr<scalar_t> scratch = dataAccessor.getScratchPtr();    // scratch data should be at least T[NumWorkgroups]
+
+        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
+        if (lastInvocation)
+        {
+            bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
+            spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
+        }
+
+        binop_t binop;
+        scalar_t prefix = scalar_t(0);
+        // decoupled lookback
+        if (ForwardProgressGuarantees)
+        {
+            if (lastInvocation) // don't make whole block work and do busy stuff
+            {
+                // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) // won't run properly this way for some reason, results in device lost
+                for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
+                {
+                    const uint32_t prevID = glsl::gl_WorkGroupID().x-i;
+                    scalar_t value = constants_t::NOT_READY;
+                    {
+                        // spin until something is ready
+                        while (value == constants_t::NOT_READY)
+                        {
+                            bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
+                            value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
+                        }
+                    }
+                    prefix = binop(value & (~constants_t::STATUS_MASK), prefix);
+
+                    // last was actually a global sum, we have the prefix, we can quit
+                    if (value & constants_t::GLOBAL_COUNT)
+                        break;
+                }
+            }
+            prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, Config::WorkgroupSize-1);
+        }
+        else
+        {
+            // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--)
+            for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
+            {
+                const uint32_t prevID = glsl::gl_WorkGroupID().x-i;
+                scalar_t value = scalar_t(0);
+                if (lastInvocation)
+                {
+                    bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
+                    value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
+                }
+                value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
+
+                if (value & constants_t::STATUS_MASK)
+                {
+                    prefix = binop(value & (~constants_t::STATUS_MASK), prefix);
+
+                    if (value & constants_t::GLOBAL_COUNT)
+                        break;
+                }
+                else // can't wait/spin, have to do it ourselves
+                {
+                    sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();
+
+                    DataAccessor prevDataAccessor = DataAccessor::create(prevID);
+                    prevDataAccessor.begin();   // prepare data accessor if needed (e.g. preload)
+                    const scalar_t prevReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(prevDataAccessor, sharedMemScratchAccessor);
+
+                    // if DoAndRaceStore, stores in place of prev workgroup id as well
+                    // bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
+                    // if (lastInvocation)
+                    //     spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT);
+
+                    prefix = binop(prevReduction, prefix);
+                }
+            }
+        }
+
+        const scalar_t globalReduction = binop(prefix,localReduction);
+        // TODO globalReduction value changing in following block somehow, double check
+        if (lastInvocation)
+        {
+            bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
+            spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
+        }
+
+        // get last item from scratch
+        const uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1;
+        bda::__ref<scalar_t> scratchLast = (scratch + lastWorkgroup).deref();
+        scalar_t value = constants_t::NOT_READY;
+        if (lastInvocation)
+        {
+            // wait until last workgroup does reduction
+            while (!(value & constants_t::GLOBAL_COUNT))
+            {
+                value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
+            }
+        }
+        value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
+        return value & (~constants_t::STATUS_MASK);
+    }
+};
+
+}
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl
deleted file mode 100644
index 2d2e66e66d..0000000000
--- a/include/nbl/builtin/hlsl/scan/declarations.hlsl
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_
-#define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_
-
-// REVIEW: Not sure if this file is needed in HLSL implementation
-
-#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl"
-
-
-#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    Parameters_t getParameters();
-}
-}
-}
-#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    template
-    void getData(
-        inout Storage_t data,
-        in uint levelInvocationIndex,
-        in uint localWorkgroupIndex,
-        in uint treeLevel,
-        in uint pseudoLevel
-    );
-}
-}
-}
-#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    template
-    void setData(
-        in Storage_t data,
-        in uint levelInvocationIndex,
-        in uint localWorkgroupIndex,
-        in uint treeLevel,
-        in uint pseudoLevel,
-        in bool inRange
-    );
-}
-}
-}
-#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_
-#endif
-
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl
deleted file mode 100644
index 450368475d..0000000000
--- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl
+++ /dev/null
@@ -1,221 +0,0 @@
-#ifndef _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_
-#define _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_
-
-#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl"
-
-#ifdef __cplusplus
-#define uint uint32_t
-#endif
-
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    struct DefaultSchedulerParameters_t
-    {
-        uint finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1];
-        uint cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS];
-
-    };
-}
-}
-}
-
-#ifdef __cplusplus
-#undef uint
-#else
-
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-namespace scheduler
-{
-    /**
-     * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation
-     * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions
-     * that are used for synchronization etc.
-     * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is
-     * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Kogge-Stone adder). The result of
-     * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and
-     * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. The next level of the Blelloch Scan will
-     * have an array of size 40. Only a single workgroup will be needed to work on that. After that array is scanned, we use the results
-     * in the downsweep phase of Blelloch Scan.
-     * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount.
-     * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase.
-     * The intermediate results are stored in a scratch buffer. That buffer's size is the sum of the element-array size for all the
-     * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40.
-     *
-     * Parameter meaning:
-     * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer
-     * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize)
-     * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. It is used when storing the REDUCTION result of each workgroup scan
-     * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep)
-     * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps
-     * synchronizing between workgroups with while-loop spinning.
-     */
-    void computeParameters(in uint elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams)
-    {
-#define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u)
-        _scanParams.lastElement[0] = elementCount-1u;
-        _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0])/_NBL_HLSL_WORKGROUP_SIZE_LOG2_;
-        // REVIEW: _NBL_HLSL_WORKGROUP_SIZE_LOG2_ is defined in files that include THIS file. Why not query the API for workgroup size at runtime?
-
-        for (uint i=0; i>_NBL_HLSL_WORKGROUP_SIZE_LOG2_;
-            i = next;
-        }
-        _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0);
-        _schedulerParams.finishedFlagOffset[0] = 0u;
-        switch(_scanParams.topLevel)
-        {
-            case 1u:
-                _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u;
-                _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0);
-                // climb up
-                _schedulerParams.finishedFlagOffset[1] = 1u;
-
-                _scanParams.temporaryStorageOffset[0] = 2u;
-                break;
-            case 2u:
-                _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1);
-                _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u;
-                _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1);
-                _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0);
-                // climb up
-                _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1);
-                _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u;
-                // climb down
-                _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u;
-
-                _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1);
-                _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0);
-                break;
-            case 3u:
-                _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1);
-                _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2);
-                _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u;
-                _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2);
-                _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1);
-                _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0);
-                // climb up
-                _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1);
-                _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2);
-                _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u;
-                // climb down
-                _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u;
-                _schedulerParams.finishedFlagOffset[5] = _schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2);
-
-                _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1);
-                _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0);
-                _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1);
-                break;
-            default:
-                break;
-#if NBL_BUILTIN_MAX_SCAN_LEVELS>7
-#error "Switch needs more cases"
-#endif
-        }
-#undef WorkgroupCount
-    }
-
-    /**
-     * treeLevel - the current level in the Blelloch Scan
-     * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch.
-     * For example, if we have dispatched 10 workgroups and the virtual workgroup number is 35, then the localWorkgroupIndex should be 5.
-     */
-    template
-    bool getWork(in DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex)
-    {
-        ScratchAccessor sharedScratch;
-        if(SubgroupContiguousIndex() == 0u)
-        {
-            uint64_t original;
-            InterlockedAdd(scanScratch.workgroupsStarted, 1u, original); // REVIEW: Refactor InterlockedAdd with GLSL terminology? // TODO (PentaKon): Refactor this when the ScanScratch descriptor set is declared
-            sharedScratch.set(SubgroupContiguousIndex(), original);
-        }
-        else if (SubgroupContiguousIndex() == 1u)
-        {
-            sharedScratch.set(SubgroupContiguousIndex(), 0u);
-        }
-        GroupMemoryBarrierWithGroupSync(); // REVIEW: refactor this somewhere with GLSL terminology?
-
-        const uint globalWorkgroupIndex; // does every thread need to know?
-        sharedScratch.get(0u, globalWorkgroupIndex);
-        const uint lastLevel = topLevel<<1u;
-        if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()])
-        {
-            InterlockedAdd(sharedScratch.get(1u, ?), 1u); // REVIEW: The way scratchaccessoradaptor is implemented (e.g. under subgroup/arithmetic_portability) doesn't allow for atomic ops on the scratch buffer. Should we ask for another implementation that overrides the [] operator ?
-        }
-        GroupMemoryBarrierWithGroupSync(); // TODO (PentaKon): Possibly refactor?
-
-        sharedScratch.get(1u, treeLevel);
-        if(treeLevel>lastLevel)
-            return true;
-
-        localWorkgroupIndex = globalWorkgroupIndex;
-        const bool dependentLevel = treeLevel != 0u;
-        if(dependentLevel)
-        {
-            const uint prevLevel = treeLevel - 1u;
-            localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel];
-            if(SubgroupContiguousIndex() == 0u)
-            {
-                uint dependentsCount = 1u;
-                if(treeLevel <= topLevel)
-                {
-                    dependentsCount = _NBL_HLSL_WORKGROUP_SIZE_; // REVIEW: Defined in the files that include this file?
-                    const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel];
-                    if (lastWorkgroup)
-                    {
-                        const Parameters_t scanParams = getParameters(); // TODO (PentaKon): Undeclared as of now, this should return the Parameters_t from the push constants of (in)direct shader
-                        dependentsCount = scanParams.lastElement[treeLevel]+1u;
-                        if (treeLeveltopLevel) // !(prevLevel globallycoherent
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl
deleted file mode 100644
index 325a08e3f0..0000000000
--- a/include/nbl/builtin/hlsl/scan/direct.hlsl
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef _NBL_HLSL_WORKGROUP_SIZE_
-#define _NBL_HLSL_WORKGROUP_SIZE_ 256
-#endif
-
-#include "nbl/builtin/hlsl/scan/descriptors.hlsl"
-#include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl"
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl"
-
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-#ifndef _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_
-    cbuffer PC // REVIEW: register and packoffset selection
-    {
-        Parameters_t scanParams;
-        DefaultSchedulerParameters_t schedulerParams;
-    };
-#define _NBL_HLSL_SCAN_PUSH_CONSTANTS_DEFINED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_
-Parameters_t getParameters()
-{
-    return pc.scanParams;
-}
-#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_
-#endif
-
-#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_
-DefaultSchedulerParameters_t getSchedulerParameters()
-{
-    return pc.schedulerParams;
-}
-#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_
-#endif
-}
-}
-}
-
-#ifndef _NBL_HLSL_MAIN_DEFINED_
-[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)]
-void CSMain()
-{
-    nbl::hlsl::scan::main();
-}
-#define _NBL_HLSL_MAIN_DEFINED_
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/scan/indirect.hlsl b/include/nbl/builtin/hlsl/scan/indirect.hlsl
deleted file mode 100644
index 1191731f65..0000000000
--- a/include/nbl/builtin/hlsl/scan/indirect.hlsl
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _NBL_HLSL_WORKGROUP_SIZE_
-#define _NBL_HLSL_WORKGROUP_SIZE_ 256
-#define _NBL_HLSL_WORKGROUP_SIZE_LOG2_ 8
-#endif
-
-#include "nbl/builtin/hlsl/scan/descriptors.hlsl"
-#include "nbl/builtin/hlsl/scan/virtual_workgroup.hlsl"
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl"
-
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_
-Parameters_t scanParams;
-Parameters_t getParameters()
-{
-    return scanParams;
-}
-#define _NBL_HLSL_SCAN_GET_PARAMETERS_DEFINED_
-#endif
-
-uint getIndirectElementCount();
-
-#ifndef _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_
-DefaultSchedulerParameters_t schedulerParams;
-DefaultSchedulerParameters_t getSchedulerParameters()
-{
-    scheduler::computeParameters(getIndirectElementCount(),scanParams,schedulerParams);
-    return schedulerParams;
-}
-#define _NBL_HLSL_SCAN_GET_SCHEDULER_PARAMETERS_DEFINED_
-#endif
-}
-}
-}
-
-#ifndef _NBL_HLSL_MAIN_DEFINED_
-[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)]
-void CSMain()
-{
-    if (bool(nbl::hlsl::scan::getIndirectElementCount()))
-        nbl::hlsl::scan::main();
-}
-#define _NBL_HLSL_MAIN_DEFINED_
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl b/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl
deleted file mode 100644
index bfeba13be2..0000000000
--- a/include/nbl/builtin/hlsl/scan/parameters_struct.hlsl
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_
-#define _NBL_HLSL_SCAN_PARAMETERS_STRUCT_INCLUDED_
-
-#define NBL_BUILTIN_MAX_SCAN_LEVELS 7
-
-#ifdef __cplusplus
-#define uint uint32_t
-#endif
-
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules)
-    struct Parameters_t {
-        uint lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1];
-        uint topLevel;
-        uint temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2];
-    }
-}
-}
-}
-
-#ifdef __cplusplus
-#undef uint
-#endif
-
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl
deleted file mode 100644
index 488bf29012..0000000000
--- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_
-#define _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_
-
-// TODO (PentaKon): Decide if these are needed once we have a clearer picture of the refactor
-#include "nbl/builtin/hlsl/limits/numeric.hlsl"
-#include "nbl/builtin/hlsl/math/typeless_arithmetic.hlsl"
-#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined
-#include "nbl/builtin/hlsl/scan/declarations.hlsl"
-
-#include "nbl/builtin/hlsl/binops.hlsl"
-
-#if 0
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    template
-    void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex)
-    {
-        const Parameters_t params = getParameters();
-        const uint levelInvocationIndex = localWorkgroupIndex * _NBL_HLSL_WORKGROUP_SIZE_ + SubgroupContiguousIndex();
-        const bool lastInvocationInGroup = SubgroupContiguousIndex() == (_NBL_HLSL_WORKGROUP_SIZE_ - 1);
-
-        const uint lastLevel = params.topLevel << 1u;
-        const uint pseudoLevel = levelInvocationIndex <= params.lastElement[pseudoLevel];
-
-        const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel];
-
-        Storage_t data = Binop::identity();
-        if(inRange)
-        {
-            getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel);
-        }
-
-        if(treeLevel < params.topLevel)
-        {
-            #error "Must also define some scratch accessor when calling operation()"
-            data = workgroup::reduction()(data);
-        }
-        // REVIEW: missing _TYPE_ check and extra case here
-        else if (treeLevel != params.topLevel)
-        {
-            data = workgroup::inclusive_scan()(data);
-        }
-        else
-        {
-            data = workgroup::exclusive_scan()(data);
-        }
-        setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange);
-    }
-}
-}
-}
-
-#ifndef _NBL_HLSL_SCAN_MAIN_DEFINED_ // TODO REVIEW: Are these needed, can this logic be refactored?
-#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl"
-namespace nbl
-{
-namespace hlsl
-{
-namespace scan
-{
-    DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components
-    void main()
-    {
-        const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters();
-        const uint topLevel = getParameters().topLevel;
-        // persistent workgroups
-        while (true)
-        {
-            uint treeLevel,localWorkgroupIndex;
-            if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex))
-            {
-                return;
-            }
-
-            virtualWorkgroup(treeLevel,localWorkgroupIndex);
-
-            scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex);
-        }
-    }
-}
-}
-}
-#endif
-
-#define _NBL_HLSL_SCAN_MAIN_DEFINED_
-#endif
-
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
index 4885fc11f8..c7a3694d3e 100644
--- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
+++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
@@ -180,6 +180,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicAnd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint32_t> || is_same_v<T, int32_t>), T> atomicAnd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicAnd)]]
+enable_if_t<is_same_v<T, uint64_t> || is_same_v<T, int64_t>, T> atomicAnd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicAnd)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint64_t> || is_same_v<T, int64_t>), T> atomicAnd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicOr)]]
 enable_if_t<is_same_v<T, uint32_t> || is_same_v<T, int32_t>, T> atomicOr([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -188,6 +198,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicOr)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint32_t> || is_same_v<T, int32_t>), T> atomicOr(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicOr)]]
+enable_if_t<is_same_v<T, uint64_t> || is_same_v<T, int64_t>, T> atomicOr([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicOr)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint64_t> || is_same_v<T, int64_t>), T> atomicOr(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicXor)]]
 enable_if_t<is_same_v<T, uint32_t> || is_same_v<T, int32_t>, T> atomicXor([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -196,6 +216,16 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicXor)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint32_t> || is_same_v<T, int32_t>), T> atomicXor(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
+template<typename T>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicXor)]]
+enable_if_t<is_same_v<T, uint64_t> || is_same_v<T, int64_t>, T> atomicXor([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicXor)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T, uint64_t> || is_same_v<T, int64_t>), T> atomicXor(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template<typename Signed>
 [[vk::ext_instruction( spv::OpAtomicSMin )]]
 enable_if_t<is_same_v<Signed, int32_t>, Signed> atomicSMin([[vk::ext_reference]] int32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
@@ -204,6 +234,16 @@ template<typename Signed, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicSMin)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed, int32_t>, Signed> atomicSMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
 
+template<typename Signed>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicSMin )]]
+enable_if_t<is_same_v<Signed, int64_t>, Signed> atomicSMin([[vk::ext_reference]] int32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
+template<typename Signed, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicSMin)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed, int64_t>, Signed> atomicSMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
 template<typename Unsigned>
 [[vk::ext_instruction( spv::OpAtomicUMin )]]
 enable_if_t<is_same_v<Unsigned, uint32_t>, Unsigned> atomicUMin([[vk::ext_reference]] Unsigned ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
@@ -212,6 +252,16 @@ template<typename Unsigned, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicUMin)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned, uint32_t>, Unsigned> atomicUMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
 
+template<typename Unsigned>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicUMin )]]
+enable_if_t<is_same_v<Unsigned, uint64_t>, Unsigned> atomicUMin([[vk::ext_reference]] Unsigned ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
+template<typename Unsigned, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicUMin)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned, uint64_t>, Unsigned> atomicUMin(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
 template<typename Signed>
 [[vk::ext_instruction( spv::OpAtomicSMax )]]
 enable_if_t<is_same_v<Signed, int32_t>, Signed> atomicSMax([[vk::ext_reference]] Signed ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
@@ -220,6 +270,16 @@ template<typename Signed, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicSMax)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed, int32_t>, Signed> atomicSMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
 
+template<typename Signed>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicSMax )]]
+enable_if_t<is_same_v<Signed, int64_t>, Signed> atomicSMax([[vk::ext_reference]] Signed ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
+template<typename Signed, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicSMax)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Signed, int64_t>, Signed> atomicSMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Signed value);
+
 template<typename Unsigned>
 [[vk::ext_instruction( spv::OpAtomicUMax )]]
 enable_if_t<is_same_v<Unsigned, uint32_t>, Unsigned> atomicUMax([[vk::ext_reference]] uint32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
@@ -228,6 +288,16 @@ template<typename Unsigned, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicUMax)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned, uint32_t>, Unsigned> atomicUMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
 
+template<typename Unsigned>
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction( spv::OpAtomicUMax )]]
+enable_if_t<is_same_v<Unsigned, uint64_t>, Unsigned> atomicUMax([[vk::ext_reference]] uint32_t ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
+template<typename Unsigned, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
+[[vk::ext_instruction(spv::OpAtomicUMax)]]
+enable_if_t<is_spirv_type_v<Ptr_T> && is_same_v<Unsigned, uint64_t>, Unsigned> atomicUMax(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, Unsigned value);
+
 template<typename T>
 [[vk::ext_instruction(spv::OpAtomicExchange)]]
 T atomicExchange([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
@@ -244,6 +314,22 @@ template<typename T, typename Ptr_T> // DXC Workaround
 [[vk::ext_instruction(spv::OpAtomicCompareExchange)]]
 enable_if_t<is_spirv_type_v<Ptr_T>, T> atomicCompareExchange(Ptr_T ptr, uint32_t memoryScope, uint32_t memSemanticsEqual, uint32_t memSemanticsUnequal, T value, T comparator);
 
+template<typename T>
+[[vk::ext_instruction(spv::OpAtomicLoad)]]
+enable_if_t<!is_pointer_v<T>, T> atomicLoad([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_instruction(spv::OpAtomicLoad)]]
+enable_if_t<is_spirv_type_v<Ptr_T>, T> atomicLoad(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics);
+
+template<typename T>
+[[vk::ext_instruction(spv::OpAtomicStore)]]
+enable_if_t<!is_pointer_v<T>, void> atomicStore([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
+template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_instruction(spv::OpAtomicStore)]]
+enable_if_t<is_spirv_type_v<Ptr_T>, void> atomicStore(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
+
 template
 __NBL_CAPABILITY_PhysicalStorageBufferAddresses
diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
index 62a9fb7bef..045ecbde51 100644
--- a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
+++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
@@ -4,12 +4,10 @@
 #ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_
 #define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_
 
-
 #include "nbl/builtin/hlsl/functional.hlsl"
 #include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl"
 #include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl"
 
-
 namespace nbl
 {
 namespace hlsl
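Usage note (reviewer sketch, not part of the diff): the new scan::reduction wrapper is meant to be instantiated from a compute shader roughly as below. The accessor types, the chosen configuration values and the entry point are illustrative assumptions only; the real accessor implementations live in the examples_tests submodule and the concept requirements on them come from workgroup_arithmetic.hlsl, not from this patch.

    // illustrative only: MyDataAccessor/MyOutputAccessor/MyStatusAccessor/MyScratchAccessor are
    // hypothetical implementations of the accessor interfaces that __call() expects
    #include "nbl/builtin/hlsl/scan/arithmetic.hlsl"

    using namespace nbl::hlsl;

    // WorkgroupSizeLog2=8 (256 invocations), SubgroupSizeLog2=5 (32 lanes), 1 item per invocation
    using config_t = scan::ScanConfiguration<8, 5, 1>;
    using binop_t = plus<uint32_t>;

    [numthreads(256, 1, 1)] // must match config_t::WorkgroupSize
    void main()
    {
        MyDataAccessor data;        // reads this workgroup's slice of the input
        MyOutputAccessor output;    // wraps the BDA pointer the atomics accumulate into
        MyStatusAccessor status;    // wraps the BDA pointer of the "workgroups finished" counter
        MyScratchAccessor scratch;  // groupshared array of config_t::SharedScratchElementCount elements

        // false = no forward-progress guarantees assumed; last argument is the device_capabilities struct
        scan::reduction<config_t, binop_t, false, void>::__call(data, output, status, scratch);
    }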