Block Chain Scan #884
@@ -0,0 +1,34 @@
// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_
#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_

#include "nbl/builtin/hlsl/scan/arithmetic_impl.hlsl"

namespace nbl
{
namespace hlsl
{
namespace scan
{

template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities=void>
struct reduction
{
    using scalar_t = typename BinOp::type_t;

    template<class ReadOnlyDataAccessor, class ScratchAccessor NBL_FUNC_REQUIRES(workgroup2::ArithmeticReadOnlyDataAccessor<ReadOnlyDataAccessor,scalar_t> && workgroup2::ArithmeticSharedMemoryAccessor<ScratchAccessor,scalar_t>)
    static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
Review comment: you need a 3rd accessor which is an atomic accessor (to both accumulate the result and figure out when everyone is done).
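A rough sketch of what such a third accessor could look like. The struct and method names below are hypothetical, not existing Nabla concepts; it only wraps the spirv atomics this PR already uses, with device scope (assuming spv::ScopeDevice is exposed the same way as spv::ScopeWorkgroup) since other workgroups must observe the writes:

struct AtomicScratchAccessorSketch // hypothetical name
{
    // publish a (value|status) word for workgroup `id`, release semantics
    uint32_t atomicUMax(const uint32_t id, const uint32_t value)
    {
        return spirv::atomicUMax((ptr + id).deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, value);
    }
    // acquire "load" used while spinning on a predecessor's slot (add of 0, as in the diff)
    uint32_t atomicLoad(const uint32_t id)
    {
        return spirv::atomicIAdd((ptr + id).deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask, 0u);
    }

    bda::__ptr<uint32_t> ptr; // at least uint32_t[NumWorkgroups]
};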
    {
        impl::reduce<Config, BinOp, ForwardProgressGuarantees, device_capabilities> fn;
        scalar_t value = fn.template __call<ReadOnlyDataAccessor,ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);
        return value;
    }
};

}
}
}

#endif
@@ -0,0 +1,177 @@
// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_

#include "nbl/builtin/hlsl/bda/__ptr.hlsl"
#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"

namespace nbl
{
namespace hlsl
{
namespace scan
{

template<uint16_t _WorkgroupSizeLog2, uint16_t _SubgroupSizeLog2, uint16_t _ItemsPerInvocation>
struct ScanConfiguration
{
    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2;
    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2;
    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2;
    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;
    NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation;

    using arith_config_t = workgroup2::ArithmeticConfiguration<WorkgroupSizeLog2, SubgroupSizeLog2, ItemsPerInvocation>;
    NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = arith_config_t::SharedScratchElementCount;
};

namespace impl
{

template<typename T> // only uint32_t or uint64_t for now?
struct Constants
{
    NBL_CONSTEXPR_STATIC_INLINE T NOT_READY = 0;
    NBL_CONSTEXPR_STATIC_INLINE T LOCAL_COUNT = T(0x1u) << (sizeof(T)*8-2);
    NBL_CONSTEXPR_STATIC_INLINE T GLOBAL_COUNT = T(0x1u) << (sizeof(T)*8-1);
    NBL_CONSTEXPR_STATIC_INLINE T STATUS_MASK = LOCAL_COUNT | GLOBAL_COUNT;
};
Review comment (on lines +33 to +40): you can use […]; not everything needs to be a crazy template.
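One possible non-templated shape, purely illustrative (the reviewer's concrete suggestion may differ; this assumes 32-bit counters are enough for now):

NBL_CONSTEXPR_STATIC_INLINE uint32_t NOT_READY    = 0u;
NBL_CONSTEXPR_STATIC_INLINE uint32_t LOCAL_COUNT  = 0x1u << 30; // sizeof(uint32_t)*8-2
NBL_CONSTEXPR_STATIC_INLINE uint32_t GLOBAL_COUNT = 0x1u << 31; // sizeof(uint32_t)*8-1
NBL_CONSTEXPR_STATIC_INLINE uint32_t STATUS_MASK  = LOCAL_COUNT | GLOBAL_COUNT;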
template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
struct reduce
{
    using scalar_t = typename BinOp::type_t;
    using constants_t = Constants<scalar_t>;
    using config_t = Config;
    using arith_config_t = typename Config::arith_config_t;
    using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BinOp, device_capabilities>;
    using binop_t = BinOp;

    template<class DataAccessor, class ScratchAccessor>
    scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
Review comment (on lines +121 to +122): you have the readonly accessor to get your element, and you have the scratch memory accessor (for the workgroup scans/reductions), but you don't have: […]
    {
        const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);
        bda::__ptr<scalar_t> scratch = dataAccessor.getScratchPtr(); // scratch data should be at least T[NumWorkgroups]

        const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
        if (lastInvocation)
        {
            bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
            spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
Review comment: you want to separate the storage of the reduction from the flags, I think.
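A rough sketch of separating the value from its status flag. The two-pointer layout (`scratchValues`, `scratchStatus`) is hypothetical and the availability of `spirv::atomicStore` is an assumption; if only the atomics already used in this diff are available, an `atomicUMax` publish works the same way since each slot is written once:

// hypothetical layout: one value slot and one status slot per workgroup
if (lastInvocation)
{
    const uint32_t selfID = glsl::gl_WorkGroupID().x;
    bda::__ref<scalar_t> valueSlot  = (scratchValues + selfID).deref(); // bda::__ptr<scalar_t>, scalar_t[NumWorkgroups]
    bda::__ref<uint32_t> statusSlot = (scratchStatus + selfID).deref(); // bda::__ptr<uint32_t>, uint32_t[NumWorkgroups]
    spirv::atomicStore(valueSlot.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction);
    // release the flag only after the value itself is visible
    spirv::atomicStore(statusSlot.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, 1u); // 1 = LOCAL ready, 2 = GLOBAL ready
}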
        }

        // NOTE: just for testing, remove when done
        // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();
        // uint32_t prev = glsl::gl_WorkGroupID().x==0 ? 0 : glsl::gl_WorkGroupID().x-1;
        // scalar_t testVal = constants_t::NOT_READY;
        // if (lastInvocation)
        //     while (testVal == constants_t::NOT_READY)
        //         testVal = spirv::atomicIAdd((scratch + prev).deref().__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
        // sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();
        // testVal = workgroup::Broadcast(testVal, sharedMemScratchAccessor, Config::WorkgroupSize-1);
        // return testVal;

        binop_t binop;
        scalar_t prefix = scalar_t(0);
        // decoupled lookback
        if (ForwardProgressGuarantees)
        {
            if (lastInvocation) // don't make the whole block do busy work
            {
                // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) // won't run properly this way for some reason, results in device lost
                //   (likely because `prevID >= 0u` is always true for an unsigned counter, so the loop underflows and never terminates)
                for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
Review comment (on lines +139 to +142): actually using the whole workgroup, or at least the subgroup (benchmark it), would be much faster here, so each invocation checks a workgroup and you can use […]. You'd also be able to accumulate the […].
                {
                    const uint32_t prevID = glsl::gl_WorkGroupID().x-i;
Review comment (on lines +142 to +144): don't use […]
                    scalar_t value = constants_t::NOT_READY;
                    {
                        // spin until something is ready
                        while (value == constants_t::NOT_READY)
                        {
                            bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
                            // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
                            value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
Review comment: you'll have multiple workgroups doing this and you'll mess up the results; you want to accumulate those locally here in a register, so while you're walking backwards you only do […]. Also, this requires that you have two different scratch store locations: […]. If you keep the global and local on the same address you get nasty data races, because the status is a flag and not a mutex, so you can overwrite a local result with a global one while another workgroup reads, and it thinks the value it reads is a local result because the flag is not updated yet. P.S. You probably don't want to be writing out the GLOBAL results and updating status flags here even though you can, because the workgroups before you are obviously "just about" to write out their results, and this way you'll just introduce more uncached memory traffic.
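A sketch of the two-location layout described above (names hypothetical): each workgroup has one slot for its LOCAL reduction and a separate slot for its GLOBAL (inclusive) result, the lookback only reads and accumulates into a register, and nothing is published for predecessors:

// hypothetical: localSlots and globalSlots are each at least scalar_t[NumWorkgroups]
scalar_t lookbackPrefix = scalar_t(0);
for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
{
    const uint32_t prevID = glsl::gl_WorkGroupID().x - i;
    // prefer the predecessor's inclusive result, fall back to its local one
    scalar_t v = spirv::atomicIAdd((globalSlots + prevID).deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask, 0u);
    const bool isGlobal = (v & constants_t::GLOBAL_COUNT) != scalar_t(0);
    if (!isGlobal)
    {
        while (v == constants_t::NOT_READY) // spin only on the LOCAL slot
            v = spirv::atomicIAdd((localSlots + prevID).deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask, 0u);
    }
    lookbackPrefix = binop(v & (~constants_t::STATUS_MASK), lookbackPrefix);
    if (isGlobal)
        break; // an inclusive result covers everything older, we can stop
}
// only after the loop does this workgroup publish its own slots (and never a predecessor's)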
                        }
                    }
                    prefix = binop(value & (~constants_t::STATUS_MASK), prefix);

                    // last was actually a global sum, we have the prefix, we can quit
                    if (value & constants_t::GLOBAL_COUNT)
                        break;
                }
            }
            prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, Config::WorkgroupSize-1);
        }
        else
        {
            // for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--)
            for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
            {
                const uint32_t prevID = glsl::gl_WorkGroupID().x-i;
                scalar_t value = scalar_t(0);
                if (lastInvocation)
                {
                    bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
                    // value = spirv::atomicLoad(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
                    value = spirv::atomicIAdd(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
                }
                value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);

                if (value & constants_t::STATUS_MASK)
                {
                    prefix = binop(value & (~constants_t::STATUS_MASK), prefix);

                    if (value & constants_t::GLOBAL_COUNT)
                        break;
                }
                else // can't wait/spin, have to do it ourselves
                {
                    sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();

                    DataAccessor prevDataAccessor = DataAccessor::create(prevID);
                    prevDataAccessor.begin(); // prepare data accessor if needed (e.g. preload)
                    const scalar_t prevReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(prevDataAccessor, sharedMemScratchAccessor);

                    // if DoAndRaceStore, stores in place of prev workgroup id as well
                    // bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
                    // if (lastInvocation)
                    //     spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT);

                    prefix = binop(prevReduction, prefix);
                }
            }
        }

        scalar_t globalReduction = binop(prefix,localReduction);
        if (lastInvocation)
        {
            bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
            spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
        }

        // get last item from scratch
        const uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1;
        bda::__ref<scalar_t> scratchLast = (scratch + lastWorkgroup).deref();
        scalar_t value = constants_t::NOT_READY;
        if (lastInvocation)
        {
            // wait until last workgroup does reduction
            while (!(value & constants_t::GLOBAL_COUNT))
            {
                // value = spirv::atomicLoad(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask);
                value = spirv::atomicIAdd(scratchLast.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsAcquireMask, 0u);
            }
        }
        value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
        return value & (~constants_t::STATUS_MASK);
Review comment: this won't work even with forward progress guarantees, you just need to let the workgroup quit.
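A minimal sketch of one way to "just let the workgroup quit" (an interpretation of the comment, not the author's code: each workgroup publishes its own inclusive result and returns it; whoever needs the grand total reads the last workgroup's slot after the dispatch):

scalar_t globalReduction = binop(prefix, localReduction);
if (lastInvocation)
{
    bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
    spirv::atomicUMax(scratchId.__get_spv_ptr(), spv::ScopeWorkgroup, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
}
// no cross-workgroup spin here: return our own inclusive reduction and exit;
// the grand total sits in scratch[gl_NumWorkGroups().x-1] once the last workgroup finishes
return globalReduction;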
    }
};

}

}
}
}

#endif
This file was deleted.
Review comment: we should make a fake device feature called forwardProgressGuarantees which is basically always false.
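A sketch of what that fake feature could look like on the HLSL side (the member name and the capabilities-struct shape are assumptions, modeled on how device_capabilities is already passed around in this PR):

// hardwired to false until an API actually guarantees forward progress between workgroups
struct SketchDeviceCapabilities // hypothetical name
{
    NBL_CONSTEXPR_STATIC_INLINE bool forwardProgressGuarantees = false;
};

// the scan would then pick the lookback flavour from the capability instead of a separate bool parameter:
// scan::reduction<Config, BinOp, SketchDeviceCapabilities::forwardProgressGuarantees, SketchDeviceCapabilities>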