Skip to content

Commit 1a0c998

Browse files
committed
Apply fixes to property pool stuff
1 parent b8db8c9 commit 1a0c998

File tree

2 files changed

+52
-26
lines changed

2 files changed

+52
-26
lines changed

include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl

Lines changed: 38 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -19,18 +19,20 @@ struct TransferLoop
1919
const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2;
2020
const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2;
2121

22-
const uint64_t srcOffset = invocationIndex * srcIndexSize * transferRequest.propertySize;
23-
const uint64_t dstOffset = invocationIndex * dstIndexSize * transferRequest.propertySize;
22+
// Fill: Always use offset 0 on src
23+
const uint64_t srcOffset = Fill ? 0 : invocationIndex * transferRequest.propertySize;
24+
const uint64_t dstOffset = invocationIndex * transferRequest.propertySize;
2425

25-
const uint64_t srcIndexAddress = Fill ? transferRequest.srcIndexAddr + srcOffset : transferRequest.srcIndexAddr;
26-
const uint64_t dstIndexAddress = Fill ? transferRequest.dstIndexAddr + dstOffset : transferRequest.dstIndexAddr;
27-
28-
const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad<uint32_t>(srcIndexAddress);
29-
const uint64_t dstAddressBufferOffset = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad<uint32_t>(dstIndexAddress);
26+
// IOTA: Use the index as the fetching offset
27+
// Non IOTA: Read the address buffer ("index buffer") to select fetching offset
28+
const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcOffset : vk::RawBufferLoad<uint32_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint32_t));
29+
const uint64_t dstAddressBufferOffset = DstIndexIota ? dstOffset : vk::RawBufferLoad<uint32_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint32_t));
3030

3131
const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize;
3232
const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize;
3333

34+
//vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2, srcAddressMapped,8);
35+
//vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2 + sizeof(uint64_t), dstAddressMapped,8);
3436
if (SrcIndexSizeLog2 == 0) {} // we can't write individual bytes
3537
else if (SrcIndexSizeLog2 == 1) vk::RawBufferStore<uint16_t>(dstAddressMapped, vk::RawBufferLoad<uint16_t>(srcAddressMapped));
3638
else if (SrcIndexSizeLog2 == 2) vk::RawBufferStore<uint32_t>(dstAddressMapped, vk::RawBufferLoad<uint32_t>(srcAddressMapped));
@@ -111,36 +113,49 @@ void main(uint32_t3 dispatchId)
111113

112114
// Loading transfer request from the pointer (can't use struct
113115
// with BDA on HLSL SPIRV)
116+
uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof(TransferRequest) * propertyId;
114117
TransferRequest transferRequest;
115-
transferRequest.srcAddr = vk::RawBufferLoad<uint>(globals.transferCommandsAddress) | vk::RawBufferLoad<uint>(globals.transferCommandsAddress + sizeof(uint)) << 32;
116-
transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof(uint64_t));
117-
transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof(uint64_t) * 2);
118-
transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof(uint64_t) * 3);
118+
transferRequest.srcAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr,8);
119+
transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t),8);
120+
transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 2,8);
121+
transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 3,8);
119122
// Remaining elements are part of the same bitfield
120123
// TODO: Do this only using raw buffer load?
121-
uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof(uint64_t) * 4);
124+
uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 4,8);
122125
transferRequest.elementCount32 = uint32_t(bitfieldType);
123-
transferRequest.elementCountExtra = uint32_t(bitfieldType);
124-
transferRequest.propertySize = uint32_t(bitfieldType >> 3);
125-
transferRequest.fill = uint32_t(bitfieldType >> (3 + 24));
126-
transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (3 + 24 + 1));
127-
transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (3 + 24 + 1 + 2));
126+
transferRequest.elementCountExtra = uint32_t(bitfieldType >> 32);
127+
transferRequest.propertySize = uint32_t(bitfieldType >> (32 + 3));
128+
transferRequest.fill = uint32_t(bitfieldType >> (32 + 3 + 24));
129+
transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1));
130+
transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1 + 2));
128131

129132
const uint dispatchSize = nbl::hlsl::device_capabilities_traits<device_capabilities>::maxOptimallyResidentWorkgroupInvocations;
130133
const bool fill = transferRequest.fill == 1;
131134

132-
vk::RawBufferStore<uint64_t>(globals.transferCommandsAddress + 40 * 3, transferRequest.srcAddr);
133-
vk::RawBufferStore<uint64_t>(globals.transferCommandsAddress + 40 * 4, transferRequest.dstAddr);
134-
vk::RawBufferStore<uint>(globals.transferCommandsAddress + 40 * 5, vk::RawBufferLoad<uint>(transferRequest.srcAddr + sizeof(uint16_t) * 3));
135-
//if (fill) { TransferLoopPermutationFill<true> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); }
136-
//else { TransferLoopPermutationFill<false> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); }
135+
//uint64_t debugWriteAddr = transferRequest.dstAddr + sizeof(uint64_t) * 9 * propertyId;
136+
//vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 0, transferRequest.srcAddr,8);
137+
//vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 1, transferRequest.dstAddr,8);
138+
//vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 2, transferRequest.srcIndexAddr,8);
139+
//vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 3, transferRequest.dstIndexAddr,8);
140+
//uint64_t elementCount = uint64_t(transferRequest.elementCount32)
141+
// | uint64_t(transferRequest.elementCountExtra) << 32;
142+
//vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 4, elementCount,8);
143+
//vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 5, transferRequest.propertySize,4);
144+
//vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 6, transferRequest.fill,4);
145+
//vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 7, transferRequest.srcIndexSizeLog2,4);
146+
//vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 8, transferRequest.dstIndexSizeLog2,4);
147+
//vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + sizeof(uint64_t) * invocationIndex, invocationIndex,8);
148+
149+
if (fill) { TransferLoopPermutationFill<true> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); }
150+
else { TransferLoopPermutationFill<false> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); }
137151
}
138152

139153
}
140154
}
141155
}
142156

143-
[numthreads(1,1,1)]
157+
// TODO: instead use some sort of replace function for getting optimal size?
158+
[numthreads(512,1,1)]
144159
void main(uint32_t3 dispatchId : SV_DispatchThreadID)
145160
{
146161
nbl::hlsl::property_pools::main<nbl::hlsl::jit::device_capabilities>(dispatchId);

src/nbl/video/utilities/CPropertyPoolHandler.cpp

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -130,8 +130,8 @@ bool CPropertyPoolHandler::transferProperties(
130130
auto srcRequest = transferPassRequests + i;
131131
transferRequest.srcAddr = srcRequest->memblock.buffer.get()->getDeviceAddress() + srcRequest->memblock.offset;
132132
transferRequest.dstAddr = srcRequest->buffer.buffer.get()->getDeviceAddress() + srcRequest->buffer.offset;
133-
transferRequest.srcIndexAddr = srcRequest->srcAddressesOffset ? addressBufferDeviceAddr + srcRequest->srcAddressesOffset : 0;
134-
transferRequest.dstIndexAddr = srcRequest->dstAddressesOffset ? addressBufferDeviceAddr + srcRequest->dstAddressesOffset : 0;
133+
transferRequest.srcIndexAddr = srcRequest->srcAddressesOffset != IPropertyPool::invalid ? addressBufferDeviceAddr + srcRequest->srcAddressesOffset : 0;
134+
transferRequest.dstIndexAddr = srcRequest->dstAddressesOffset != IPropertyPool::invalid ? addressBufferDeviceAddr + srcRequest->dstAddressesOffset : 0;
135135
transferRequest.elementCount32 = uint32_t(srcRequest->elementCount & (uint64_t(1) << 32) - 1);
136136
transferRequest.elementCountExtra = uint32_t(srcRequest->elementCount >> 32);
137137
transferRequest.propertySize = srcRequest->elementSize;
@@ -144,7 +144,18 @@ bool CPropertyPoolHandler::transferProperties(
144144
maxElements = core::max<uint64_t>(maxElements, srcRequest->elementCount);
145145
}
146146
cmdbuf->updateBuffer({ scratch.offset,sizeof(TransferRequest) * requestsThisPass, core::smart_refctd_ptr(scratch.buffer) }, transferRequestsData);
147-
// TODO: pipeline barrier
147+
148+
const asset::SMemoryBarrier barriers[1] = { {
149+
.srcStageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT,
150+
.srcAccessMask = asset::ACCESS_FLAGS::TRANSFER_WRITE_BIT,
151+
.dstStageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
152+
.dstAccessMask = asset::ACCESS_FLAGS::SHADER_READ_BITS
153+
} };
154+
cmdbuf->pipelineBarrier(asset::EDF_NONE,IGPUCommandBuffer::SPipelineBarrierDependencyInfo{
155+
.memBarriers = barriers
156+
// TODO: .bufBarriers = instead
157+
});
158+
148159
cmdbuf->bindComputePipeline(m_pipeline.get());
149160

150161
nbl::hlsl::property_pools::GlobalPushContants pushConstants;

0 commit comments

Comments
 (0)