@@ -14,16 +14,22 @@ namespace property_pools
14
14
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2, uint64_t DstIndexSizeLog2>
15
15
struct TransferLoop
16
16
{
17
- void iteration (uint propertyId, uint64_t propertySize, uint64_t srcAddr, uint64_t dstAddr , uint invocationIndex)
17
+ void iteration (uint propertyId, TransferRequest transferRequest , uint invocationIndex)
18
18
{
19
- const uint64_t srcOffset = uint64_t (invocationIndex) * (uint64_t (1 ) << SrcIndexSizeLog2) * propertySize;
20
- const uint64_t dstOffset = uint64_t (invocationIndex) * (uint64_t (1 ) << DstIndexSizeLog2) * propertySize;
19
+ const uint64_t srcIndexSize = uint64_t (1 ) << SrcIndexSizeLog2;
20
+ const uint64_t dstIndexSize = uint64_t (1 ) << DstIndexSizeLog2;
21
+
22
+ const uint64_t srcOffset = uint64_t (invocationIndex) * srcIndexSize * transferRequest.propertySize;
23
+ const uint64_t dstOffset = uint64_t (invocationIndex) * dstIndexSize * transferRequest.propertySize;
21
24
22
- const uint64_t srcIndexAddress = Fill ? srcAddr + srcOffset : srcAddr;
23
- const uint64_t dstIndexAddress = Fill ? dstAddr + dstOffset : dstAddr;
25
+ const uint64_t srcIndexAddress = Fill ? transferRequest.srcIndexAddr + srcOffset : transferRequest.srcIndexAddr;
26
+ const uint64_t dstIndexAddress = Fill ? transferRequest.dstIndexAddr + dstOffset : transferRequest.dstIndexAddr;
27
+
28
+ const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad<uint32_t>(srcIndexAddress);
29
+ const uint64_t dstAddressBufferOffset = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad<uint32_t>(dstIndexAddress);
24
30
25
- const uint64_t srcAddressMapped = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad<uint64_t>(srcIndexAddress) ;
26
- const uint64_t dstAddressMapped = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad<uint64_t>(dstIndexAddress) ;
31
+ const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize ;
32
+ const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize ;
27
33
28
34
if (SrcIndexSizeLog2 == 0 ) {} // we can't write individual bytes
29
35
else if (SrcIndexSizeLog2 == 1 ) vk::RawBufferStore<uint16_t>(dstAddressMapped, vk::RawBufferLoad<uint16_t>(srcAddressMapped));
@@ -35,17 +41,21 @@ struct TransferLoop
35
41
{
36
42
uint64_t elementCount = uint64_t (transferRequest.elementCount32)
37
43
| uint64_t (transferRequest.elementCountExtra) << 32 ;
38
- uint lastInvocation = min (elementCount, globals.endOffset);
39
- for (uint invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize)
44
+ uint64_t lastInvocation = min (elementCount, globals.endOffset);
45
+ for (uint64_t invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize)
40
46
{
41
- iteration (propertyId, transferRequest.propertySize, transferRequest.srcAddr, transferRequest.dstAddr , invocationIndex);
47
+ iteration (propertyId, transferRequest, invocationIndex);
42
48
}
43
49
}
44
50
};
45
51
46
52
// For creating permutations of the functions based on parameters that are constant over the transfer request
47
53
// These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any
48
54
// branching within them
55
+ //
56
+ // Permutations:
57
+ // 2 (fill or not) * 2 (src index iota or not) * 2 (dst index iota or not) * 4 (src index size) * 4 (dst index size)
58
+ // Total number of permutations: 128
49
59
50
60
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2>
51
61
struct TransferLoopPermutationSrcIndexSizeLog
@@ -76,7 +86,7 @@ struct TransferLoopPermutationSrcIota
76
86
{
77
87
// Picks the third template argument of TransferLoopPermutationDstIota (true or false) for this
// transfer request and forwards the call to that instantiation's copyLoop.
// The choice is made by comparing an address field of transferRequest against 0: the line marked
// '+' reads dstIndexAddr, while the line marked '-' read dstAddr.
// baseInvocationIndex, propertyId, transferRequest and dispatchSize are passed through unchanged.
void copyLoop (uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
78
88
{
79
- bool dstIota = transferRequest.dstAddr == 0 ;
89
+ bool dstIota = transferRequest.dstIndexAddr == 0 ;
80
90
if (dstIota) { TransferLoopPermutationDstIota<Fill, SrcIndexIota, true > loop; loop.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
81
91
else { TransferLoopPermutationDstIota<Fill, SrcIndexIota, false > loop; loop.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
82
92
}
@@ -87,7 +97,7 @@ struct TransferLoopPermutationFill
87
97
{
88
98
// Picks the second template argument of TransferLoopPermutationSrcIota (true or false) for this
// transfer request and forwards the call to that instantiation's copyLoop.
// The choice is made by comparing an address field of transferRequest against 0: the line marked
// '+' reads srcIndexAddr, while the line marked '-' read srcAddr.
// baseInvocationIndex, propertyId, transferRequest and dispatchSize are passed through unchanged.
void copyLoop (uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
89
99
{
90
- bool srcIota = transferRequest.srcAddr == 0 ;
100
+ bool srcIota = transferRequest.srcIndexAddr == 0 ;
91
101
if (srcIota) { TransferLoopPermutationSrcIota<Fill, true > loop; loop.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
92
102
else { TransferLoopPermutationSrcIota<Fill, false > loop; loop.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
93
103
}
0 commit comments