@@ -19,18 +19,20 @@ struct TransferLoop
19
19
const uint64_t srcIndexSize = uint64_t (1 ) << SrcIndexSizeLog2;
20
20
const uint64_t dstIndexSize = uint64_t (1 ) << DstIndexSizeLog2;
21
21
22
- const uint64_t srcOffset = invocationIndex * srcIndexSize * transferRequest.propertySize;
23
- const uint64_t dstOffset = invocationIndex * dstIndexSize * transferRequest.propertySize;
22
+ // Fill: Always use offset 0 on src
23
+ const uint64_t srcOffset = Fill ? 0 : invocationIndex * transferRequest.propertySize;
24
+ const uint64_t dstOffset = invocationIndex * transferRequest.propertySize;
24
25
25
- const uint64_t srcIndexAddress = Fill ? transferRequest.srcIndexAddr + srcOffset : transferRequest.srcIndexAddr;
26
- const uint64_t dstIndexAddress = Fill ? transferRequest.dstIndexAddr + dstOffset : transferRequest.dstIndexAddr;
27
-
28
- const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad<uint32_t>(srcIndexAddress);
29
- const uint64_t dstAddressBufferOffset = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad<uint32_t>(dstIndexAddress);
26
+ // IOTA: Use the index as the fetching offset
27
+ // Non-IOTA: Read the address buffer ("index buffer") to select the fetching offset
28
+ const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcOffset : vk::RawBufferLoad<uint32_t>(transferRequest.srcIndexAddr + srcOffset * sizeof (uint32_t));
29
+ const uint64_t dstAddressBufferOffset = DstIndexIota ? dstOffset : vk::RawBufferLoad<uint32_t>(transferRequest.dstIndexAddr + dstOffset * sizeof (uint32_t));
30
30
31
31
const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize;
32
32
const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize;
33
33
34
+ //vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2, srcAddressMapped,8);
35
+ //vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + invocationIndex * sizeof(uint64_t) * 2 + sizeof(uint64_t), dstAddressMapped,8);
34
36
if (SrcIndexSizeLog2 == 0 ) {} // we can't write individual bytes
35
37
else if (SrcIndexSizeLog2 == 1 ) vk::RawBufferStore<uint16_t>(dstAddressMapped, vk::RawBufferLoad<uint16_t>(srcAddressMapped));
36
38
else if (SrcIndexSizeLog2 == 2 ) vk::RawBufferStore<uint32_t>(dstAddressMapped, vk::RawBufferLoad<uint32_t>(srcAddressMapped));
@@ -111,36 +113,49 @@ void main(uint32_t3 dispatchId)
111
113
112
114
// Loading transfer request from the pointer (can't use struct
113
115
// with BDA on HLSL SPIRV)
116
+ uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof (TransferRequest) * propertyId;
114
117
TransferRequest transferRequest;
115
- transferRequest.srcAddr = vk::RawBufferLoad<uint >(globals.transferCommandsAddress) | vk::RawBufferLoad< uint >(globals.transferCommandsAddress + sizeof ( uint )) << 32 ;
116
- transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t));
117
- transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 2 );
118
- transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 3 );
118
+ transferRequest.srcAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr, 8 ) ;
119
+ transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof (uint64_t), 8 );
120
+ transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof (uint64_t) * 2 , 8 );
121
+ transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof (uint64_t) * 3 , 8 );
119
122
// Remaining elements are part of the same bitfield
120
123
// TODO: Do this only using raw buffer load?
121
- uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 4 );
124
+ uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof (uint64_t) * 4 , 8 );
122
125
transferRequest.elementCount32 = uint32_t (bitfieldType);
123
- transferRequest.elementCountExtra = uint32_t (bitfieldType);
124
- transferRequest.propertySize = uint32_t (bitfieldType >> 3 );
125
- transferRequest.fill = uint32_t (bitfieldType >> (3 + 24 ));
126
- transferRequest.srcIndexSizeLog2 = uint32_t (bitfieldType >> (3 + 24 + 1 ));
127
- transferRequest.dstIndexSizeLog2 = uint32_t (bitfieldType >> (3 + 24 + 1 + 2 ));
126
+ transferRequest.elementCountExtra = uint32_t (bitfieldType >> 32 );
127
+ transferRequest.propertySize = uint32_t (bitfieldType >> ( 32 + 3 ) );
128
+ transferRequest.fill = uint32_t (bitfieldType >> (32 + 3 + 24 ));
129
+ transferRequest.srcIndexSizeLog2 = uint32_t (bitfieldType >> (32 + 3 + 24 + 1 ));
130
+ transferRequest.dstIndexSizeLog2 = uint32_t (bitfieldType >> (32 + 3 + 24 + 1 + 2 ));
128
131
129
132
const uint dispatchSize = nbl::hlsl::device_capabilities_traits<device_capabilities>::maxOptimallyResidentWorkgroupInvocations;
130
133
const bool fill = transferRequest.fill == 1 ;
131
134
132
- vk::RawBufferStore<uint64_t>(globals.transferCommandsAddress + 40 * 3 , transferRequest.srcAddr);
133
- vk::RawBufferStore<uint64_t>(globals.transferCommandsAddress + 40 * 4 , transferRequest.dstAddr);
134
- vk::RawBufferStore<uint >(globals.transferCommandsAddress + 40 * 5 , vk::RawBufferLoad<uint >(transferRequest.srcAddr + sizeof (uint16_t) * 3 ));
135
- //if (fill) { TransferLoopPermutationFill<true> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); }
136
- //else { TransferLoopPermutationFill<false> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); }
135
+ //uint64_t debugWriteAddr = transferRequest.dstAddr + sizeof(uint64_t) * 9 * propertyId;
136
+ //vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 0, transferRequest.srcAddr,8);
137
+ //vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 1, transferRequest.dstAddr,8);
138
+ //vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 2, transferRequest.srcIndexAddr,8);
139
+ //vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 3, transferRequest.dstIndexAddr,8);
140
+ //uint64_t elementCount = uint64_t(transferRequest.elementCount32)
141
+ // | uint64_t(transferRequest.elementCountExtra) << 32;
142
+ //vk::RawBufferStore<uint64_t>(debugWriteAddr + sizeof(uint64_t) * 4, elementCount,8);
143
+ //vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 5, transferRequest.propertySize,4);
144
+ //vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 6, transferRequest.fill,4);
145
+ //vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 7, transferRequest.srcIndexSizeLog2,4);
146
+ //vk::RawBufferStore<uint32_t>(debugWriteAddr + sizeof(uint64_t) * 8, transferRequest.dstIndexSizeLog2,4);
147
+ //vk::RawBufferStore<uint64_t>(transferRequest.dstAddr + sizeof(uint64_t) * invocationIndex, invocationIndex,8);
148
+
149
+ if (fill) { TransferLoopPermutationFill<true > loop; loop.copyLoop (invocationIndex, propertyId, transferRequest, dispatchSize); }
150
+ else { TransferLoopPermutationFill<false > loop; loop.copyLoop (invocationIndex, propertyId, transferRequest, dispatchSize); }
137
151
}
138
152
139
153
}
140
154
}
141
155
}
142
156
143
- [numthreads (1 ,1 ,1 )]
157
+ // TODO: instead of hard-coding the workgroup size below, use some sort of replace function to get the optimal size?
158
+ [numthreads (512 ,1 ,1 )]
144
159
void main (uint32_t3 dispatchId : SV_DispatchThreadID )
145
160
{
146
161
nbl::hlsl::property_pools::main<nbl::hlsl::jit::device_capabilities>(dispatchId);
0 commit comments