Skip to content

Commit 4be1a3c

Browse files
committed
Fix implementation problems in HLSL and write initial transferProperties function
1 parent 279c220 commit 4be1a3c

File tree

4 files changed

+101
-16
lines changed

4 files changed

+101
-16
lines changed

include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,22 @@ namespace property_pools
1414
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2, uint64_t DstIndexSizeLog2>
1515
struct TransferLoop
1616
{
17-
void iteration(uint propertyId, uint64_t propertySize, uint64_t srcAddr, uint64_t dstAddr, uint invocationIndex)
17+
void iteration(uint propertyId, TransferRequest transferRequest, uint invocationIndex)
1818
{
19-
const uint64_t srcOffset = uint64_t(invocationIndex) * (uint64_t(1) << SrcIndexSizeLog2) * propertySize;
20-
const uint64_t dstOffset = uint64_t(invocationIndex) * (uint64_t(1) << DstIndexSizeLog2) * propertySize;
19+
const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2;
20+
const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2;
21+
22+
const uint64_t srcOffset = uint64_t(invocationIndex) * srcIndexSize * transferRequest.propertySize;
23+
const uint64_t dstOffset = uint64_t(invocationIndex) * dstIndexSize * transferRequest.propertySize;
2124

22-
const uint64_t srcIndexAddress = Fill ? srcAddr + srcOffset : srcAddr;
23-
const uint64_t dstIndexAddress = Fill ? dstAddr + dstOffset : dstAddr;
25+
const uint64_t srcIndexAddress = Fill ? transferRequest.srcIndexAddr + srcOffset : transferRequest.srcIndexAddr;
26+
const uint64_t dstIndexAddress = Fill ? transferRequest.dstIndexAddr + dstOffset : transferRequest.dstIndexAddr;
27+
28+
const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad<uint32_t>(srcIndexAddress);
29+
const uint64_t dstAddressBufferOffset = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad<uint32_t>(dstIndexAddress);
2430

25-
const uint64_t srcAddressMapped = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad<uint64_t>(srcIndexAddress);
26-
const uint64_t dstAddressMapped = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad<uint64_t>(dstIndexAddress);
31+
const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize;
32+
const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize;
2733

2834
if (SrcIndexSizeLog2 == 0) {} // we can't write individual bytes
2935
else if (SrcIndexSizeLog2 == 1) vk::RawBufferStore<uint16_t>(dstAddressMapped, vk::RawBufferLoad<uint16_t>(srcAddressMapped));
@@ -35,17 +41,21 @@ struct TransferLoop
3541
{
3642
uint64_t elementCount = uint64_t(transferRequest.elementCount32)
3743
| uint64_t(transferRequest.elementCountExtra) << 32;
38-
uint lastInvocation = min(elementCount, globals.endOffset);
39-
for (uint invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize)
44+
uint64_t lastInvocation = min(elementCount, globals.endOffset);
45+
for (uint64_t invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize)
4046
{
41-
iteration(propertyId, transferRequest.propertySize, transferRequest.srcAddr, transferRequest.dstAddr, invocationIndex);
47+
iteration(propertyId, transferRequest, invocationIndex);
4248
}
4349
}
4450
};
4551

4652
// For creating permutations of the functions based on parameters that are constant over the transfer request
4753
// These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any
4854
// branching within them
55+
//
56+
// Permutations:
57+
// 2 (fill or not) * 2 (src index iota or not) * 2 (dst index iota or not) * 4 (src index size) * 4 (dst index size)
58+
// Total number of permutations: 128
4959

5060
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2>
5161
struct TransferLoopPermutationSrcIndexSizeLog
@@ -76,7 +86,7 @@ struct TransferLoopPermutationSrcIota
7686
{
7787
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
7888
{
79-
bool dstIota = transferRequest.dstAddr == 0;
89+
bool dstIota = transferRequest.dstIndexAddr == 0;
8090
if (dstIota) { TransferLoopPermutationDstIota<Fill, SrcIndexIota, true> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
8191
else { TransferLoopPermutationDstIota<Fill, SrcIndexIota, false> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
8292
}
@@ -87,7 +97,7 @@ struct TransferLoopPermutationFill
8797
{
8898
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
8999
{
90-
bool srcIota = transferRequest.srcAddr == 0;
100+
bool srcIota = transferRequest.srcIndexAddr == 0;
91101
if (srcIota) { TransferLoopPermutationSrcIota<Fill, true> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
92102
else { TransferLoopPermutationSrcIota<Fill, false> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
93103
}

include/nbl/builtin/hlsl/property_pool/transfer.hlsl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#ifndef _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_
22
#define _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_
33

4+
#include "nbl/builtin/hlsl/cpp_compat.hlsl"
5+
46
namespace nbl
57
{
68
namespace hlsl
@@ -40,6 +42,9 @@ struct GlobalPushContants
4042
// BDA address (GPU pointer) into the transfer commands buffer
4143
uint64_t transferCommandsAddress;
4244
};
45+
46+
NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128;
47+
4348
}
4449
}
4550
}

include/nbl/video/utilities/CPropertyPoolHandler.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,12 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
7474
asset::SBufferRange<IGPUBuffer> memblock = {};
7575
E_FLAG flags = EF_NONE;
7676
uint16_t elementSize = 0u;
77-
uint32_t elementCount = 0u;
77+
uint64_t elementCount = 0u;
7878
// the source or destination buffer depending on the transfer type
7979
asset::SBufferBinding<video::IGPUBuffer> buffer = {};
8080
// may be invalid; if invalid, it is treated like an implicit {0,1,2,3,...} iota view
81-
uint32_t srcAddressesOffset = IPropertyPool::invalid;
82-
uint32_t dstAddressesOffset = IPropertyPool::invalid;
81+
uint64_t srcAddressesOffset = IPropertyPool::invalid;
82+
uint64_t dstAddressesOffset = IPropertyPool::invalid;
8383
};
8484
// Fence must be not pending yet, `cmdbuf` must be already in recording state.
8585
[[nodiscard]] bool transferProperties(
@@ -206,7 +206,7 @@ class NBL_API2 CPropertyPoolHandler final : public core::IReferenceCounted, publ
206206
protected:
207207
~CPropertyPoolHandler() {}
208208

209-
static inline constexpr auto MaxPropertiesPerDispatch = 0u; // TODO
209+
static inline constexpr auto MaxPropertiesPerDispatch = nbl::hlsl::property_pools::MaxPropertiesPerDispatch;
210210
static inline constexpr auto DescriptorCacheSize = 128u;
211211

212212

src/nbl/video/utilities/CPropertyPoolHandler.cpp

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ CPropertyPoolHandler::CPropertyPoolHandler(core::smart_refctd_ptr<ILogicalDevice
3131
return gpuShader;
3232
};
3333
auto shader = loadShader("../../../include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl");
34+
const asset::SPushConstantRange baseDWORD = {asset::IShader::ESS_COMPUTE,0u,sizeof(nbl::hlsl::property_pools::GlobalPushContants)};
35+
auto layout = m_device->createPipelineLayout(&baseDWORD,&baseDWORD+1u);
36+
m_pipeline = m_device->createComputePipeline(nullptr,std::move(layout),std::move(shader));
3437

3538
#if 0
3639
const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
@@ -89,6 +92,73 @@ bool CPropertyPoolHandler::transferProperties(
8992
system::logger_opt_ptr logger, const uint32_t baseDWORD, const uint32_t endDWORD
9093
)
9194
{
95+
if (requestsBegin==requestsEnd)
96+
return true;
97+
if (!scratch.buffer || !scratch.buffer->getCreationParams().usage.hasFlags(IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF))
98+
{
99+
logger.log("CPropertyPoolHandler: Need a valid scratch buffer which can have updates staged from the commandbuffer!",system::ILogger::ELL_ERROR);
100+
return false;
101+
}
102+
// TODO: validate usage flags
103+
uint32_t maxScratchSize = MaxPropertiesPerDispatch * sizeof(nbl::hlsl::property_pools::TransferRequest);
104+
if (scratch.offset + maxScratchSize > scratch.buffer->getSize())
105+
logger.log("CPropertyPoolHandler: The scratch buffer binding provided might not be big enough in the worst case! (Scratch buffer size: %i Max scratch size: %i)",
106+
system::ILogger::ELL_WARNING,
107+
scratch.buffer->getSize() - scratch.offset,
108+
maxScratchSize);
109+
110+
const auto totalProps = std::distance(requestsBegin,requestsEnd);
111+
bool success = true;
112+
113+
uint32_t numberOfPasses = totalProps / MaxPropertiesPerDispatch;
114+
nbl::hlsl::property_pools::TransferRequest transferRequestsData[MaxPropertiesPerDispatch];
115+
uint64_t scratchBufferDeviceAddr = m_device->getBufferDeviceAddress(scratch.buffer.get()) + scratch.offset;
116+
uint64_t addressBufferDeviceAddr = m_device->getBufferDeviceAddress(addresses.buffer.get()) + addresses.offset;
117+
118+
for (uint32_t transferPassRequestsIndex = 0; transferPassRequestsIndex < totalProps; transferPassRequestsIndex += MaxPropertiesPerDispatch)
119+
{
120+
const TransferRequest* transferPassRequests = requestsBegin + transferPassRequestsIndex;
121+
uint32_t requestsThisPass = core::min<uint32_t>(std::distance(transferPassRequests, requestsEnd), MaxPropertiesPerDispatch);
122+
uint64_t maxElements = 0;
123+
for (uint32_t i = 0; i < requestsThisPass; i ++)
124+
{
125+
auto& transferRequest = transferRequestsData[i];
126+
auto srcRequest = transferPassRequests + i;
127+
transferRequest.srcAddr = m_device->getBufferDeviceAddress(srcRequest->memblock.buffer.get()) + srcRequest->memblock.offset;
128+
transferRequest.dstAddr = m_device->getBufferDeviceAddress(srcRequest->buffer.buffer.get()) + srcRequest->buffer.offset;
129+
transferRequest.srcIndexAddr = srcRequest->srcAddressesOffset ? addressBufferDeviceAddr + srcRequest->srcAddressesOffset : 0;
130+
transferRequest.dstIndexAddr = srcRequest->dstAddressesOffset ? addressBufferDeviceAddr + srcRequest->dstAddressesOffset : 0;
131+
transferRequest.elementCount32 = uint32_t(srcRequest->elementCount & (uint64_t(1) << 32) - 1);
132+
transferRequest.elementCountExtra = uint32_t(srcRequest->elementCount >> 32);
133+
transferRequest.propertySize = srcRequest->elementSize;
134+
transferRequest.fill = 0; // TODO
135+
transferRequest.srcIndexSizeLog2 = 1u; // TODO
136+
transferRequest.dstIndexSizeLog2 = 1u; // TODO
137+
138+
maxElements = core::max<uint64_t>(maxElements, srcRequest->elementCount);
139+
}
140+
cmdbuf->updateBuffer(scratch.buffer.get(),scratch.offset,sizeof(TransferRequest)*requestsThisPass,transferRequestsData);
141+
// TODO: pipeline barrier
142+
cmdbuf->bindComputePipeline(m_pipeline.get());
143+
144+
nbl::hlsl::property_pools::GlobalPushContants pushConstants;
145+
{
146+
pushConstants.beginOffset = baseDWORD;
147+
pushConstants.endOffset = endDWORD;
148+
pushConstants.transferCommandsAddress = scratchBufferDeviceAddr;
149+
}
150+
cmdbuf->pushConstants(m_pipeline->getLayout(), asset::IShader::ESS_COMPUTE, 0u, sizeof(nbl::hlsl::property_pools::GlobalPushContants), &pushConstants);
151+
152+
// dispatch
153+
{
154+
const auto& limits = m_device->getPhysicalDevice()->getLimits();
155+
const auto invocationCoarseness = limits.maxOptimallyResidentWorkgroupInvocations * requestsThisPass;
156+
cmdbuf->dispatch(limits.computeOptimalPersistentWorkgroupDispatchSize(maxElements,invocationCoarseness), requestsThisPass, 1u);
157+
}
158+
// TODO: pipeline barrier
159+
}
160+
161+
return success;
92162
#if 0
93163
if (requestsBegin==requestsEnd)
94164
return true;

0 commit comments

Comments
 (0)