1
1
#include "nbl/builtin/hlsl/cpp_compat.hlsl"
2
- #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
3
- #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
4
- #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
5
- #include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
6
- #include "nbl/builtin/hlsl/device_capabilities_traits.hlsl"
7
2
#include "nbl/builtin/hlsl/enums.hlsl"
3
+ #include "nbl/builtin/hlsl/macros.h"
4
+
5
+ #ifndef _NBL_BUILTIN_BOX_SAMPLER_INCLUDED_
6
+ #define _NBL_BUILTIN_BOX_SAMPLER_INCLUDED_
8
7
9
8
namespace nbl
10
9
{
11
10
namespace hlsl
12
11
{
13
- namespace box_blur
14
- {
15
-
16
// Box1D (removed side of this diff): functor that blurs one channel of `data`.
// Its operator() first writes an inclusive prefix sum of the channel into `scratch`,
// WorkgroupSize elements per pass, then stores `boxSampler(...) * 1/(2*radius+1)` back
// into `data` for every element. The members `borderColor` and `radius` are read by
// operator(); nothing inside this struct assigns them.
- template<
17
- typename DataAccessor,
18
- typename SharedAccessor,
19
- typename ScanSharedAccessor,
20
- typename Sampler,
21
- uint16_t WorkgroupSize,
22
- class device_capabilities=void > // TODO: define concepts for the Box1D and apply constraints
23
- struct Box1D
12
+ namespace prefix_sum_blur
24
13
{
25
- // TODO: Generalize later on when Francesco enforces accessor-concepts in `workgroup` and adds a `SharedMemoryAccessor` concept
// Adapter handed to `workgroup::inclusive_scan`: exposes non-template get/set/barrier
// members and forwards them to the wrapped accessor's `get<float32_t, uint16_t>` /
// `set<float32_t, uint16_t>` / `workgroupExecutionAndMemoryBarrier`.
26
- struct ScanSharedAccessorWrapper
27
- {
28
- void get (const uint16_t ix, NBL_REF_ARG (float32_t) val)
29
- {
30
- val = base.template get<float32_t, uint16_t>(ix);
31
- }
32
-
33
- void set (const uint16_t ix, const float32_t val)
34
- {
35
- base.template set<float32_t, uint16_t>(ix, val);
36
- }
37
-
38
- void workgroupExecutionAndMemoryBarrier ()
39
- {
40
- base.workgroupExecutionAndMemoryBarrier ();
41
- }
42
-
43
- ScanSharedAccessor base;
44
- };
45
-
// Blurs channel `channel` of `data` in place.
//   data        - source and destination; provides linearSize(), get<float32_t>(channel, ix), set<float32_t>(channel, ix, v)
//   scratch     - receives the prefix sum of the whole line and is then passed to `boxSampler`
//   scanScratch - temporary storage used by `workgroup::inclusive_scan` (copied into / out of the wrapper)
//   boxSampler  - callable invoked as boxSampler(scratch, ix, radius, borderColor[channel])
46
- void operator ()(
47
- NBL_REF_ARG (DataAccessor) data,
48
- NBL_REF_ARG (SharedAccessor) scratch,
49
- NBL_REF_ARG (ScanSharedAccessor) scanScratch,
50
- NBL_REF_ARG (Sampler) boxSampler,
51
- const uint16_t channel)
52
- {
53
- const uint16_t end = data.linearSize ();
54
- const uint16_t localInvocationIndex = workgroup::SubgroupContiguousIndex ();
55
-
56
- // prefix sum
57
- // note the dynamically uniform loop condition
58
- for (uint16_t baseIx = 0 ; baseIx < end;)
59
- {
60
- const uint16_t ix = localInvocationIndex + baseIx;
// NOTE(review): this read is not guarded by `ix < end`, unlike the store to `scratch` below;
// presumably the accessor tolerates indices past the end in the last pass - confirm.
61
- float32_t input = data.template get<float32_t>(channel, ix);
62
- // dynamically uniform condition
63
- if (baseIx != 0 )
64
- {
65
- // take result of previous prefix sum and add it to first element here
66
- if (localInvocationIndex == 0 )
67
- input += scratch.template get<float32_t>(baseIx - 1 );
68
- }
69
- // need to copy-in / copy-out the accessor cause no references in HLSL - yay!
70
- ScanSharedAccessorWrapper scanScratchWrapper;
71
- scanScratchWrapper.base = scanScratch;
72
- const float32_t sum = workgroup::inclusive_scan<plus<float32_t>, WorkgroupSize, device_capabilities>::template __call (input, scanScratchWrapper);
73
- scanScratch = scanScratchWrapper.base;
74
- // loop increment
// (done here rather than in the for-header; `ix` above still refers to the current pass)
75
- baseIx += WorkgroupSize;
76
- // if doing the last prefix sum, we need to barrier to stop aliasing of temporary scratch for `inclusive_scan` and our scanline
77
- // TODO: might be worth adding a non-aliased mode as NSight says nr 1 hotspot is barrier waiting in this code
78
- if (end + ScanSharedAccessor::Size > SharedAccessor::Size)
79
- scratch.workgroupExecutionAndMemoryBarrier ();
80
- // save prefix sum results
81
- if (ix < end)
82
- scratch.template set<float32_t>(ix, sum);
83
- // previous prefix sum must have finished before we ask for results
84
- scratch.workgroupExecutionAndMemoryBarrier ();
85
- }
86
-
// NOTE(review): `last` is not referenced again anywhere in this function.
87
- const float32_t last = end - 1 ;
// reciprocal of (2*radius + 1), applied to every sampler result below
88
- const float32_t normalizationFactor = 1.f / (2.f * radius + 1.f );
89
-
// second phase: each invocation handles indices localInvocationIndex, +WorkgroupSize, ...;
// the index is kept as float32_t for the sampler call and cast to uint16_t for the store
90
- for (float32_t ix = localInvocationIndex; ix < end; ix += WorkgroupSize)
91
- {
92
- const float32_t result = boxSampler (scratch, ix, radius, borderColor[channel]);
93
- data.template set<float32_t>(channel, uint16_t (ix), result * normalizationFactor);
94
- }
95
- }
96
-
// per-channel value forwarded to the sampler as its `borderColor` argument
97
- vector <float32_t, DataAccessor::Channels> borderColor;
// forwarded to the sampler and used for the 1/(2*radius+1) normalization
98
- float32_t radius;
99
- };
100
14
101
15
template<typename PrefixSumAccessor, typename T>
102
16
struct BoxSampler
103
17
{
18
+ using prefix_sum_accessor_t = PrefixSumAccessor;
19
+
20
+ PrefixSumAccessor prefixSumAccessor;
104
21
uint16_t wrapMode;
105
22
uint16_t linearSize;
23
+ T normalizationFactor;
106
24
107
- T operator ()(NBL_REF_ARG (PrefixSumAccessor) prefixSumAccessor, float32_t ix, float32_t radius, float32_t borderColor)
25
+ T operator ()(float32_t ix, float32_t radius, float32_t borderColor)
108
26
{
109
- const float32_t alpha = radius - floor (radius);
110
- const float32_t lastIdx = linearSize - 1 ;
27
+ const float32_t alpha = frac (radius);
111
28
const float32_t rightIdx = float32_t (ix) + radius;
112
- const float32_t leftIdx = float32_t (ix) - radius;
29
+ const float32_t leftIdx = float32_t (ix) - radius - 1 ;
30
+ const int32_t lastIdx = linearSize - 1 ;
113
31
const int32_t rightFlIdx = (int32_t)floor (rightIdx);
114
32
const int32_t rightClIdx = (int32_t)ceil (rightIdx);
115
33
const int32_t leftFlIdx = (int32_t)floor (leftIdx);
116
34
const int32_t leftClIdx = (int32_t)ceil (leftIdx);
117
35
36
+ assert (linearSize > 1 );
37
+
118
38
T result = 0 ;
119
- if (rightFlIdx < linearSize)
39
+ if (rightClIdx < linearSize)
120
40
{
121
41
result += lerp (prefixSumAccessor.template get<T, uint32_t>(rightFlIdx), prefixSumAccessor.template get<T, uint32_t>(rightClIdx), alpha);
122
42
}
@@ -126,8 +46,8 @@ struct BoxSampler
126
46
case ETC_REPEAT:
127
47
{
128
48
const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
129
- const T floored = prefixSumAccessor.template get<T, uint32_t>(rightFlIdx % linearSize) + ceil ( float32_t (rightFlIdx % lastIdx) / linearSize) * last;
130
- const T ceiled = prefixSumAccessor.template get<T, uint32_t>(rightClIdx % linearSize) + ceil ( float32_t (rightClIdx % lastIdx) / linearSize) * last;
49
+ const T floored = prefixSumAccessor.template get<T, uint32_t>(rightFlIdx % linearSize) + last;
50
+ const T ceiled = prefixSumAccessor.template get<T, uint32_t>(rightClIdx % linearSize) + last;
131
51
result += lerp (floored, ceiled, alpha);
132
52
break ;
133
53
}
@@ -179,8 +99,7 @@ struct BoxSampler
179
99
{
180
100
const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
181
101
const T first = prefixSumAccessor.template get<T, uint32_t>(0 );
182
- const T firstPlusOne = prefixSumAccessor.template get<T, uint32_t>(1 );
183
- result += (rightIdx - lastIdx) * (firstPlusOne - first) + last;
102
+ result += (rightIdx - lastIdx) * first + last;
184
103
break ;
185
104
}
186
105
}
@@ -196,19 +115,19 @@ struct BoxSampler
196
115
case ETC_REPEAT:
197
116
{
198
117
const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
199
- const T floored = prefixSumAccessor.template get<T, uint32_t>(abs ( leftFlIdx) % linearSize) + ceil (T (leftFlIdx) / linearSize) * last;
200
- const T ceiled = prefixSumAccessor.template get<T, uint32_t>(abs ( leftClIdx) % linearSize) + ceil ( float32_t (leftClIdx) / linearSize) * last;
118
+ const T floored = prefixSumAccessor.template get<T, uint32_t>((lastIdx + leftFlIdx) % linearSize) + floor (T (leftFlIdx) / linearSize) * last;
119
+ const T ceiled = prefixSumAccessor.template get<T, uint32_t>((lastIdx + leftClIdx) % linearSize) + floor ( T (leftClIdx) / linearSize) * last;
201
120
result -= lerp (floored, ceiled, alpha);
202
121
break ;
203
122
}
204
123
case ETC_CLAMP_TO_BORDER:
205
124
{
206
- result -= prefixSumAccessor.template get<T, uint32_t>( 0 ) + leftIdx * borderColor;
125
+ result -= (leftIdx + 1 ) * borderColor;
207
126
break ;
208
127
}
209
128
case ETC_CLAMP_TO_EDGE:
210
129
{
211
- result -= leftIdx * prefixSumAccessor.template get<T, uint32_t>(0 );
130
+ result -= ( 1 - abs ( leftIdx)) * prefixSumAccessor.template get<T, uint32_t>(0 );
212
131
break ;
213
132
}
214
133
case ETC_MIRROR:
@@ -247,16 +166,18 @@ struct BoxSampler
247
166
{
248
167
const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
249
168
const T lastMinusOne = prefixSumAccessor.template get<T, uint32_t>(lastIdx - 1 );
250
- result -= leftIdx * (last - lastMinusOne);
169
+ result -= ( 1 - abs ( leftIdx)) * (last - lastMinusOne);
251
170
break ;
252
171
}
253
172
}
254
173
}
255
174
256
- return result;
175
+ return result * normalizationFactor ;
257
176
}
258
177
};
259
178
260
179
}
261
180
}
262
- }
181
+ }
182
+
183
+ #endif
0 commit comments