Skip to content

Commit d2adf17

Browse files
committed
prefix_sum_blur: address reviews
Signed-off-by: Ali Cheraghi <alichraghi@proton.me>
1 parent 19973ac commit d2adf17

File tree

3 files changed

+137
-109
lines changed

3 files changed

+137
-109
lines changed
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
#include "nbl/builtin/hlsl/cpp_compat.hlsl"
2+
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
3+
#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
4+
#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
5+
#include "nbl/builtin/hlsl/device_capabilities_traits.hlsl"
6+
#include "nbl/builtin/hlsl/enums.hlsl"
7+
8+
#ifndef _NBL_BUILTIN_PREFIX_SUM_BLUR_INCLUDED_
9+
#define _NBL_BUILTIN_PREFIX_SUM_BLUR_INCLUDED_
10+
11+
namespace nbl
12+
{
13+
namespace hlsl
14+
{
15+
namespace prefix_sum_blur
16+
{
17+
18+
// Prefix-Sum Blur using SAT (Summed Area Table) technique
//
// Blurs one channel of a scanline in two phases:
//  1. an inclusive prefix sum of the channel, computed `WorkgroupSize` elements at a time and
//     stored through `_sampler.prefixSumAccessor`,
//  2. one `_sampler(ix, radius, borderColor[channel])` evaluation per element, whose result is
//     written back through `data`.
//
// Requirements on the template parameters, as used by the code below:
//  - DataAccessor: `linearSize()`, `template get<float32_t>(channel, ix)`,
//    `template set<float32_t>(channel, ix, value)` and a `Channels` constant.
//  - ScanSharedAccessor: `template get<float32_t, uint16_t>(ix)`,
//    `template set<float32_t, uint16_t>(ix, value)`, `workgroupExecutionAndMemoryBarrier()`
//    and a `Size` constant.
//  - Sampler: a `prefix_sum_accessor_t` alias exposing a `Size` constant, a `prefixSumAccessor`
//    member with `template get<float32_t>(ix)`, `template set<float32_t>(ix, value)` and
//    `workgroupExecutionAndMemoryBarrier()`, plus `operator()(ix, radius, borderColor)`.
//  - WorkgroupSize: number of elements handled per prefix sum pass (forwarded to `workgroup::inclusive_scan`).
//  - device_capabilities: forwarded untouched to `workgroup::inclusive_scan`.
template<
    typename DataAccessor,
    typename ScanSharedAccessor,
    typename Sampler,
    uint16_t WorkgroupSize,
    class device_capabilities=void> // TODO: define concepts for the Blur1D and apply constraints
struct Blur1D
{
    // Exposes the templated `get` / `set` of `ScanSharedAccessor` through non-templated methods
    // (with `get` returning via an out-parameter); this is the object handed to `workgroup::inclusive_scan`.
    // TODO: Generalize later on when Francesco enforces accessor-concepts in `workgroup` and adds a `SharedMemoryAccessor` concept
    struct ScanSharedAccessorWrapper
    {
        void get(const uint16_t ix, NBL_REF_ARG(float32_t) val)
        {
            val = base.template get<float32_t, uint16_t>(ix);
        }

        void set(const uint16_t ix, const float32_t val)
        {
            base.template set<float32_t, uint16_t>(ix, val);
        }

        void workgroupExecutionAndMemoryBarrier()
        {
            base.workgroupExecutionAndMemoryBarrier();
        }

        ScanSharedAccessor base;
    };

    // Blurs `channel` of `data` in place.
    //  - data: source of the input values and destination of the blurred values
    //  - scanScratch: scratch accessor used by `workgroup::inclusive_scan`
    //  - _sampler: holds the prefix sum storage and performs the actual SAT sampling
    //  - channel: index of the channel to process, also selects the `borderColor` component
    void operator()(
        NBL_REF_ARG(DataAccessor) data,
        NBL_REF_ARG(ScanSharedAccessor) scanScratch,
        NBL_REF_ARG(Sampler) _sampler,
        const uint16_t channel)
    {
        const uint16_t end = data.linearSize();
        const uint16_t localInvocationIndex = workgroup::SubgroupContiguousIndex();

        // prefix sum
        // note the dynamically uniform loop condition
        for (uint16_t baseIx = 0; baseIx < end;)
        {
            const uint16_t ix = localInvocationIndex + baseIx;
            // NOTE(review): in the last pass `ix` can be >= `end`; only the store below is guarded,
            // so this read relies on `DataAccessor::get` tolerating such indices - confirm
            float32_t input = data.template get<float32_t>(channel, ix);
            // dynamically uniform condition
            if (baseIx != 0)
            {
                // take result of previous prefix sum and add it to first element here
                if (localInvocationIndex == 0)
                    input += _sampler.prefixSumAccessor.template get<float32_t>(baseIx - 1);
            }
            // need to copy-in / copy-out the accessor cause no references in HLSL - yay!
            ScanSharedAccessorWrapper scanScratchWrapper;
            scanScratchWrapper.base = scanScratch;
            const float32_t sum = workgroup::inclusive_scan<plus<float32_t>, WorkgroupSize, device_capabilities>::template __call(input, scanScratchWrapper);
            scanScratch = scanScratchWrapper.base;
            // loop increment
            baseIx += WorkgroupSize;
            // if doing the last prefix sum, we need to barrier to stop aliasing of temporary scratch for `inclusive_scan` and our scanline
            // TODO: might be worth adding a non-aliased mode as NSight says nr 1 hotspot is barrier waiting in this code
            if (end + ScanSharedAccessor::Size > Sampler::prefix_sum_accessor_t::Size)
                _sampler.prefixSumAccessor.workgroupExecutionAndMemoryBarrier();
            // save prefix sum results
            if (ix < end)
                _sampler.prefixSumAccessor.template set<float32_t>(ix, sum);
            // previous prefix sum must have finished before we ask for results
            _sampler.prefixSumAccessor.workgroupExecutionAndMemoryBarrier();
        }

        // TODO: split this Blur1D into two separate functors:
        // - multi-wg-wide prefix sum
        // - the SAT sampling
        // each invocation samples every `WorkgroupSize`-th element, starting at its own index
        for (float32_t ix = localInvocationIndex; ix < end; ix += WorkgroupSize)
        {
            const float32_t result = _sampler(ix, radius, borderColor[channel]);
            data.template set<float32_t>(channel, uint16_t(ix), result);
        }
    }

    // per-channel value passed to the sampler as its `borderColor` argument
    vector<float32_t, DataAccessor::Channels> borderColor;
    // value passed to the sampler as its `radius` argument
    float32_t radius;
};
102+
103+
}
104+
}
105+
}
106+
107+
#endif
Original file line numberDiff line numberDiff line change
@@ -1,122 +1,42 @@
11
#include "nbl/builtin/hlsl/cpp_compat.hlsl"
2-
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
3-
#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
4-
#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
5-
#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"
6-
#include "nbl/builtin/hlsl/device_capabilities_traits.hlsl"
72
#include "nbl/builtin/hlsl/enums.hlsl"
3+
#include "nbl/builtin/hlsl/macros.h"
4+
5+
#ifndef _NBL_BUILTIN_BOX_SAMPLER_INCLUDED_
6+
#define _NBL_BUILTIN_BOX_SAMPLER_INCLUDED_
87

98
namespace nbl
109
{
1110
namespace hlsl
1211
{
13-
namespace box_blur
14-
{
15-
16-
template<
17-
typename DataAccessor,
18-
typename SharedAccessor,
19-
typename ScanSharedAccessor,
20-
typename Sampler,
21-
uint16_t WorkgroupSize,
22-
class device_capabilities=void> // TODO: define concepts for the Box1D and apply constraints
23-
struct Box1D
12+
namespace prefix_sum_blur
2413
{
25-
// TODO: Generalize later on when Francesco enforces accessor-concepts in `workgroup` and adds a `SharedMemoryAccessor` concept
26-
struct ScanSharedAccessorWrapper
27-
{
28-
void get(const uint16_t ix, NBL_REF_ARG(float32_t) val)
29-
{
30-
val = base.template get<float32_t, uint16_t>(ix);
31-
}
32-
33-
void set(const uint16_t ix, const float32_t val)
34-
{
35-
base.template set<float32_t, uint16_t>(ix, val);
36-
}
37-
38-
void workgroupExecutionAndMemoryBarrier()
39-
{
40-
base.workgroupExecutionAndMemoryBarrier();
41-
}
42-
43-
ScanSharedAccessor base;
44-
};
45-
46-
void operator()(
47-
NBL_REF_ARG(DataAccessor) data,
48-
NBL_REF_ARG(SharedAccessor) scratch,
49-
NBL_REF_ARG(ScanSharedAccessor) scanScratch,
50-
NBL_REF_ARG(Sampler) boxSampler,
51-
const uint16_t channel)
52-
{
53-
const uint16_t end = data.linearSize();
54-
const uint16_t localInvocationIndex = workgroup::SubgroupContiguousIndex();
55-
56-
// prefix sum
57-
// note the dynamically uniform loop condition
58-
for (uint16_t baseIx = 0; baseIx < end;)
59-
{
60-
const uint16_t ix = localInvocationIndex + baseIx;
61-
float32_t input = data.template get<float32_t>(channel, ix);
62-
// dynamically uniform condition
63-
if (baseIx != 0)
64-
{
65-
// take result of previous prefix sum and add it to first element here
66-
if (localInvocationIndex == 0)
67-
input += scratch.template get<float32_t>(baseIx - 1);
68-
}
69-
// need to copy-in / copy-out the accessor cause no references in HLSL - yay!
70-
ScanSharedAccessorWrapper scanScratchWrapper;
71-
scanScratchWrapper.base = scanScratch;
72-
const float32_t sum = workgroup::inclusive_scan<plus<float32_t>, WorkgroupSize, device_capabilities>::template __call(input, scanScratchWrapper);
73-
scanScratch = scanScratchWrapper.base;
74-
// loop increment
75-
baseIx += WorkgroupSize;
76-
// if doing the last prefix sum, we need to barrier to stop aliasing of temporary scratch for `inclusive_scan` and our scanline
77-
// TODO: might be worth adding a non-aliased mode as NSight says nr 1 hotspot is barrier waiting in this code
78-
if (end + ScanSharedAccessor::Size > SharedAccessor::Size)
79-
scratch.workgroupExecutionAndMemoryBarrier();
80-
// save prefix sum results
81-
if (ix < end)
82-
scratch.template set<float32_t>(ix, sum);
83-
// previous prefix sum must have finished before we ask for results
84-
scratch.workgroupExecutionAndMemoryBarrier();
85-
}
86-
87-
const float32_t last = end - 1;
88-
const float32_t normalizationFactor = 1.f / (2.f * radius + 1.f);
89-
90-
for (float32_t ix = localInvocationIndex; ix < end; ix += WorkgroupSize)
91-
{
92-
const float32_t result = boxSampler(scratch, ix, radius, borderColor[channel]);
93-
data.template set<float32_t>(channel, uint16_t(ix), result * normalizationFactor);
94-
}
95-
}
96-
97-
vector<float32_t, DataAccessor::Channels> borderColor;
98-
float32_t radius;
99-
};
10014

10115
template<typename PrefixSumAccessor, typename T>
10216
struct BoxSampler
10317
{
18+
using prefix_sum_accessor_t = PrefixSumAccessor;
19+
20+
PrefixSumAccessor prefixSumAccessor;
10421
uint16_t wrapMode;
10522
uint16_t linearSize;
23+
T normalizationFactor;
10624

107-
T operator()(NBL_REF_ARG(PrefixSumAccessor) prefixSumAccessor, float32_t ix, float32_t radius, float32_t borderColor)
25+
T operator()(float32_t ix, float32_t radius, float32_t borderColor)
10826
{
109-
const float32_t alpha = radius - floor(radius);
110-
const float32_t lastIdx = linearSize - 1;
27+
const float32_t alpha = frac(radius);
11128
const float32_t rightIdx = float32_t(ix) + radius;
112-
const float32_t leftIdx = float32_t(ix) - radius;
29+
const float32_t leftIdx = float32_t(ix) - radius - 1;
30+
const int32_t lastIdx = linearSize - 1;
11331
const int32_t rightFlIdx = (int32_t)floor(rightIdx);
11432
const int32_t rightClIdx = (int32_t)ceil(rightIdx);
11533
const int32_t leftFlIdx = (int32_t)floor(leftIdx);
11634
const int32_t leftClIdx = (int32_t)ceil(leftIdx);
11735

36+
assert(linearSize > 1);
37+
11838
T result = 0;
119-
if (rightFlIdx < linearSize)
39+
if (rightClIdx < linearSize)
12040
{
12141
result += lerp(prefixSumAccessor.template get<T, uint32_t>(rightFlIdx), prefixSumAccessor.template get<T, uint32_t>(rightClIdx), alpha);
12242
}
@@ -126,8 +46,8 @@ struct BoxSampler
12646
case ETC_REPEAT:
12747
{
12848
const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
129-
const T floored = prefixSumAccessor.template get<T, uint32_t>(rightFlIdx % linearSize) + ceil(float32_t(rightFlIdx % lastIdx) / linearSize) * last;
130-
const T ceiled = prefixSumAccessor.template get<T, uint32_t>(rightClIdx % linearSize) + ceil(float32_t(rightClIdx % lastIdx) / linearSize) * last;
49+
const T floored = prefixSumAccessor.template get<T, uint32_t>(rightFlIdx % linearSize) + last;
50+
const T ceiled = prefixSumAccessor.template get<T, uint32_t>(rightClIdx % linearSize) + last;
13151
result += lerp(floored, ceiled, alpha);
13252
break;
13353
}
@@ -179,8 +99,7 @@ struct BoxSampler
17999
{
180100
const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
181101
const T first = prefixSumAccessor.template get<T, uint32_t>(0);
182-
const T firstPlusOne = prefixSumAccessor.template get<T, uint32_t>(1);
183-
result += (rightIdx - lastIdx) * (firstPlusOne - first) + last;
102+
result += (rightIdx - lastIdx) * first + last;
184103
break;
185104
}
186105
}
@@ -196,19 +115,19 @@ struct BoxSampler
196115
case ETC_REPEAT:
197116
{
198117
const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
199-
const T floored = prefixSumAccessor.template get<T, uint32_t>(abs(leftFlIdx) % linearSize) + ceil(T(leftFlIdx) / linearSize) * last;
200-
const T ceiled = prefixSumAccessor.template get<T, uint32_t>(abs(leftClIdx) % linearSize) + ceil(float32_t(leftClIdx) / linearSize) * last;
118+
const T floored = prefixSumAccessor.template get<T, uint32_t>((lastIdx + leftFlIdx) % linearSize) + floor(T(leftFlIdx) / linearSize) * last;
119+
const T ceiled = prefixSumAccessor.template get<T, uint32_t>((lastIdx + leftClIdx) % linearSize) + floor(T(leftClIdx) / linearSize) * last;
201120
result -= lerp(floored, ceiled, alpha);
202121
break;
203122
}
204123
case ETC_CLAMP_TO_BORDER:
205124
{
206-
result -= prefixSumAccessor.template get<T, uint32_t>(0) + leftIdx * borderColor;
125+
result -= (leftIdx + 1) * borderColor;
207126
break;
208127
}
209128
case ETC_CLAMP_TO_EDGE:
210129
{
211-
result -= leftIdx * prefixSumAccessor.template get<T, uint32_t>(0);
130+
result -= (1 - abs(leftIdx)) * prefixSumAccessor.template get<T, uint32_t>(0);
212131
break;
213132
}
214133
case ETC_MIRROR:
@@ -247,16 +166,18 @@ struct BoxSampler
247166
{
248167
const T last = prefixSumAccessor.template get<T, uint32_t>(lastIdx);
249168
const T lastMinusOne = prefixSumAccessor.template get<T, uint32_t>(lastIdx - 1);
250-
result -= leftIdx * (last - lastMinusOne);
169+
result -= (1 - abs(leftIdx)) * (last - lastMinusOne);
251170
break;
252171
}
253172
}
254173
}
255174

256-
return result;
175+
return result * normalizationFactor;
257176
}
258177
};
259178

260179
}
261180
}
262-
}
181+
}
182+
183+
#endif

src/nbl/asset/IAssetManager.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -308,8 +308,8 @@ void IAssetManager::insertBuiltinAssets()
308308
{
309309
asset::ISampler::SParams params;
310310
params.TextureWrapU = asset::ISampler::E_TEXTURE_CLAMP::ETC_REPEAT;
311-
params.TextureWrapV = asset::ISampler::E_TEXTURE_CLAMP::ETC_REPEAT;
312-
params.TextureWrapW = asset::ISampler::E_TEXTURE_CLAMP::ETC_REPEAT;
311+
params.TextureWrapV = asset::ISampler::E_TEXTURE_CLAMP::ETC_REPEAT;
312+
params.TextureWrapW = asset::ISampler::E_TEXTURE_CLAMP::ETC_REPEAT;
313313
params.BorderColor = asset::ISampler::ETBC_FLOAT_OPAQUE_BLACK;
314314
params.MinFilter = asset::ISampler::ETF_LINEAR;
315315
params.MaxFilter = asset::ISampler::ETF_LINEAR;

0 commit comments

Comments
 (0)