1
1
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
2
+ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
2
3
#include "nbl/builtin/hlsl/property_pool/transfer.hlsl"
3
4
5
// See https://github.com/microsoft/DirectXShaderCompiler/issues/6144
// Defines nbl::hlsl::glsl::gl_WorkGroupSize() at file scope via its fully qualified name.
// The returned size is (capability_traits::maxOptimallyResidentWorkgroupInvocations, 1, 1);
// capability_traits defaults to nbl::hlsl::jit::device_capabilities_traits.
template<typename capability_traits=nbl::hlsl::jit::device_capabilities_traits>
uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()
{
    uint32_t3 workgroupSize = uint32_t3(capability_traits::maxOptimallyResidentWorkgroupInvocations, 1, 1);
    return workgroupSize;
}
10
+
11
// Compute shader entry point: forwards the dispatch thread ID to the
// property pool transfer routine.
//
// NOTE(review): the callee is spelled nbl::hlsl::property_pool::main here, while the
// namespace opened further down in this file is `property_pools` -- confirm which
// spelling is intended.
// NOTE(review): the callee appears to be defined after this entry point; a forward
// declaration (or moving the entry point to the end of the file) may be needed -- confirm.
// NOTE(review): numthreads is (1, 1, 1) while gl_WorkGroupSize() reports
// maxOptimallyResidentWorkgroupInvocations in X -- confirm the two are meant to differ.
[numthreads(1, 1, 1)]
void main(uint32_t3 dispatchId : SV_DispatchThreadID)
{
    nbl::hlsl::property_pool::main(dispatchId);
}
16
+
4
17
namespace nbl
5
18
{
6
19
namespace hlsl
7
20
{
8
21
namespace property_pools
9
22
{
10
- // https://github.com/microsoft/DirectXShaderCompiler/issues/6144
11
- template<typename capability_traits=nbl::hlsl::jit::device_capabilities_traits>
12
- uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize () {
13
- return uint32_t3 (capability_traits::maxOptimallyResidentWorkgroupInvocations, 1 , 1 );
14
- }
15
23
16
24
[[vk::push_constant]] GlobalPushContants globals;
17
25
@@ -37,13 +45,13 @@ struct TransferLoop
37
45
38
46
// Runs iteration() for a strided range of invocation indices.
//
// baseInvocationIndex - added to globals.beginOffset to obtain the first index processed
// propertyId          - forwarded unchanged to iteration()
// transferRequest     - supplies elementCount, propertySize, srcAddr and dstAddr
// dispatchSize        - stride between consecutive indices handled by this invocation
//
// The loop stops at min(transferRequest.elementCount, globals.endOffset) (exclusive).
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
{
    // Single upper bound, read through the correctly spelled `globals` push constants.
    const uint lastInvocation = min(transferRequest.elementCount, globals.endOffset);
    for (uint invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize)
    {
        iteration(propertyId, transferRequest.propertySize, transferRequest.srcAddr, transferRequest.dstAddr, invocationIndex);
    }
}
46
- }
54
+ };
47
55
48
56
// For creating permutations of the functions based on parameters that are constant over the transfer request
49
57
// These branches should all be scalar, and because of how templates work, the loops shouldn't have any
@@ -59,7 +67,7 @@ struct TransferLoopPermutationSrcIndexSizeLog
59
67
else if (transferRequest.dstIndexSizeLog2 == 2 ) TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 2 >.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize);
60
68
else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 3 >.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize);
61
69
}
62
- }
70
+ };
63
71
64
72
template<bool Fill, bool SrcIndexIota, bool DstIndexIota>
65
73
struct TransferLoopPermutationDstIota
@@ -71,7 +79,7 @@ struct TransferLoopPermutationDstIota
71
79
else if (transferRequest.srcIndexSizeLog2 == 2 ) TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 2 >.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize);
72
80
else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 3 >.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize);
73
81
}
74
- }
82
+ };
75
83
76
84
template<bool Fill, bool SrcIndexIota>
77
85
struct TransferLoopPermutationSrcIota
@@ -82,7 +90,7 @@ struct TransferLoopPermutationSrcIota
82
90
if (dstIota) TransferLoopPermutationDstIota<Fill, SrcIndexIota, true >.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize);
83
91
else TransferLoopPermutationDstIota<Fill, SrcIndexIota, false >.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize);
84
92
}
85
- }
93
+ };
86
94
87
95
template<bool Fill>
88
96
struct TransferLoopPermutationFill
@@ -93,9 +101,9 @@ struct TransferLoopPermutationFill
93
101
if (srcIota) TransferLoopPermutationSrcIota<Fill, true >.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize);
94
102
else TransferLoopPermutationSrcIota<Fill, false >.copyLoop (baseInvocationIndex, propertyId, transferRequest, dispatchSize);
95
103
}
96
- }
104
+ };
97
105
98
- void main (uint32_t3 dispatchId : SV_DispatchThreadID )
106
+ void main (uint32_t3 dispatchId)
99
107
{
100
108
const uint propertyId = dispatchId.y;
101
109
const uint invocationIndex = dispatchId.x;
@@ -107,12 +115,14 @@ void main(uint32_t3 dispatchId : SV_DispatchThreadID)
107
115
transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t));
108
116
transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 2 );
109
117
transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 3 );
110
- // TODO: These are all part of the same bitfield and shoulbe read with a single RawBufferLoad
111
- transferRequest.elementCount = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 4 );
112
- transferRequest.propertySize = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 5 );
113
- transferRequest.fill = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 6 );
114
- transferRequest.srcIndexSizeLog2 = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 7 );
115
- transferRequest.dstIndexSizeLog2 = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 8 );
118
+ // Remaining elements are part of the same bitfield
119
+ // TODO: Do this only using raw buffer load?
120
+ uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof (uint64_t) * 4 );
121
+ transferRequest.elementCount = bitfieldType;
122
+ transferRequest.propertySize = bitfieldType >> 35 ;
123
+ transferRequest.fill = bitfieldType >> (35 + 24 );
124
+ transferRequest.srcIndexSizeLog2 = bitfieldType >> (35 + 24 + 1 );
125
+ transferRequest.dstIndexSizeLog2 = bitfieldType >> (35 + 24 + 1 + 2 );
116
126
117
127
const uint dispatchSize = capability_traits::maxOptimallyResidentWorkgroupInvocations;
118
128
const bool fill = transferRequest.fill == 1 ;
@@ -124,4 +134,3 @@ void main(uint32_t3 dispatchId : SV_DispatchThreadID)
124
134
}
125
135
}
126
136
}
127
-
0 commit comments