Skip to content

Commit 2077fba

Browse files
committed
Merge branch 'master' into ray_query_geometry_creator_example
2 parents 8736be1 + eebc9fe commit 2077fba

26 files changed

+750
-368
lines changed

3rdparty/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ if(_NBL_COMPILE_WITH_OPEN_EXR_)
235235
set(BUILD_TESTING ${_OLD_BUILD_TESTING})
236236
endif()
237237

238+
238239
#gli
239240
option(_NBL_COMPILE_WITH_GLI_ "Build with GLI library" ON)
240241
if(_NBL_COMPILE_WITH_GLI_)

3rdparty/dxc/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Cache option (default ON) whose value is forwarded below as LLVM_ENABLE_ASSERTIONS.
option(NBL_LLVM_ENABLE_ASSERTIONS "LLVM_ENABLE_ASSERTIONS" ON)
2+
3+
# Append the chosen value to NBL_DXC_CMAKE_OPTIONS alongside the other -D flags.
# NOTE(review): the consumer of NBL_DXC_CMAKE_OPTIONS is not visible here - presumably the DXC configure step; confirm.
list(APPEND NBL_DXC_CMAKE_OPTIONS "-DLLVM_ENABLE_ASSERTIONS:BOOL=${NBL_LLVM_ENABLE_ASSERTIONS}")
14
list(APPEND NBL_DXC_CMAKE_OPTIONS "-DHLSL_OPTIONAL_PROJS_IN_DEFAULT:BOOL=OFF")
25
list(APPEND NBL_DXC_CMAKE_OPTIONS "-DHLSL_ENABLE_ANALYZE:BOOL=OFF")
36
list(APPEND NBL_DXC_CMAKE_OPTIONS "-DHLSL_OFFICIAL_BUILD:BOOL=OFF")

include/nbl/builtin/hlsl/blit/common.hlsl

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <nbl/builtin/hlsl/binding_info.hlsl>
88

9+
#include <nbl/builtin/hlsl/glsl_compat/core.hlsl>
910
namespace nbl
1011
{
1112
namespace hlsl
@@ -43,6 +44,11 @@ RWTexture3D<float4> outAs3D[ConstevalParameters::output_binding_t::Count];
4344

4445

4546
groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs];
47+
48+
[[vk::push_constant]] const nbl::hlsl::blit::Parameters pc;
49+
50+
51+
#include <nbl/builtin/hlsl/concepts.hlsl>
4652
/*
4753
struct HistogramAccessor
4854
{
@@ -62,19 +68,34 @@ struct SharedAccessor
6268
sMem[idx] = val;
6369
}
6470
};
65-
struct InCSAccessor
66-
{
67-
float32_t4 get(float32_t3 c, uint32_t l)
68-
{
69-
return inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties<ConstevalParameters::BlitDimCount>::getIndexCoord<float32_t>(c, l), 0);
70-
}
71-
};
71+
*/
72+
7273
// Accessor that writes a float32_t4 texel into one of the output image arrays
// (`outAs1DArray`, `outAs2DArray`, `outAs3D`), indexed by `descIx`.
struct OutImgAccessor
7374
{
74-
void set(int32_t3 c, uint32_t l, float32_t4 v)
75+
// Public entry point: only enabled when T is `float` (see NBL_FUNC_REQUIRES).
// `uv` is the texel coordinate with `Dims` components, `layer` the array layer,
// `data` the 4-component value to store. Forwards to the per-dimension `__set_impl`.
template<typename T, int32_t Dims NBL_FUNC_REQUIRES(is_same_v<T,float>)
76+
void set(const vector<uint16_t,Dims> uv, uint16_t layer, const vector<T,4> data)
7577
{
76-
outImg[blit::impl::dim_to_image_properties<ConstevalParameters::BlitDimCount>::getIndexCoord<int32_t>(c, l)] = v;
78+
return __set_impl<Dims>(uv,layer,data);
7779
}
80+
81+
// Declared here, defined only through the explicit specializations for Dims = 1, 2 and 3
// that follow the struct.
template<int32_t Dims>
82+
void __set_impl(const vector<uint16_t,Dims> uv, uint16_t layer, const float32_t4 data);
83+
84+
// Index into the output image descriptor arrays used by the `__set_impl` specializations.
uint32_t descIx;
7885
};
79-
*/
86+
// 1D specialization: stores `data` into `outAs1DArray[descIx]` at the 2-component
// coordinate (uv, layer), i.e. the layer is the second coordinate component.
template<>
87+
void OutImgAccessor::__set_impl<1>(const uint16_t1 uv, uint16_t layer, const float32_t4 data)
88+
{
89+
outAs1DArray[descIx][uint32_t2(uv,layer)] = data;
90+
}
91+
// 2D specialization: stores `data` into `outAs2DArray[descIx]` at the 3-component
// coordinate (uv.x, uv.y, layer), i.e. the layer is the third coordinate component.
template<>
92+
void OutImgAccessor::__set_impl<2>(const uint16_t2 uv, uint16_t layer, const float32_t4 data)
93+
{
94+
outAs2DArray[descIx][uint32_t3(uv,layer)] = data;
95+
}
96+
// 3D specialization: stores `data` into `outAs3D[descIx]` at `uv`.
// The `layer` argument is not used by this overload.
template<>
97+
void OutImgAccessor::__set_impl<3>(const uint16_t3 uv, uint16_t layer, const float32_t4 data)
98+
{
99+
outAs3D[descIx][uv] = data;
100+
}
80101
#endif

include/nbl/builtin/hlsl/blit/compute_blit.hlsl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ struct compute_blit_t
7777
uint16_t localInvocationIndex)
7878
{
7979
const float3 halfScale = scale * float3(0.5f, 0.5f, 0.5f);
80+
// bottom of the input tile
8081
const uint32_t3 minOutputPixel = workGroupID * outputTexelsPerWG;
8182
const float3 minOutputPixelCenterOfWG = float3(minOutputPixel)*scale + halfScale;
8283
// this can be negative, in which case HW sampler takes care of wrapping for us

include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,21 +21,38 @@ struct KernelWeightsAccessor
2121
return kernelWeights[idx];
2222
}
2323
};
24-
struct InCSAccessor
24+
25+
inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties<ConstevalParameters::BlitDimCount>::getIndexCoord<float32_t>(c, l), 0);
26+
*/
27+
// Accessor that samples one of the input image arrays (`inAs1DArray`, `inAs2DArray`, `inAs3D`),
// indexed by `descIx`, using the sampler `inSamp[samplerIx]`.
struct InImgAccessor
2528
{
26-
float32_t4 get(float32_t3 c, uint32_t l)
29+
// Public entry point: only enabled when T is `float` (see NBL_FUNC_REQUIRES).
// Converts `layer` and `level` to float and forwards to the per-dimension `__get_impl`.
// NOTE(review): `uv` is a uint16_t vector here but `__get_impl` is declared with a float32_t
// vector, so this call relies on an implicit conversion - confirm that is intended.
template<typename T, int32_t Dims NBL_FUNC_REQUIRES(is_same_v<T,float>)
30+
vector<T,4> get(const vector<uint16_t,Dims> uv, uint16_t layer, uint16_t level)
2731
{
28-
return inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties<ConstevalParameters::BlitDimCount>::getIndexCoord<float32_t>(c, l), 0);
32+
return __get_impl<Dims>(uv,_static_cast<float>(layer),_static_cast<float>(level));
2933
}
34+
35+
// Declared here, defined only through the explicit specializations for Dims = 1, 2 and 3
// that follow the struct.
template<int32_t Dims>
36+
float32_t4 __get_impl(const vector<float32_t,Dims> uv, float layer, float level);
37+
38+
// 20-bit index into the input image descriptor arrays.
uint32_t descIx : 20;
39+
// 12-bit index into the `inSamp` sampler array; together with `descIx` the widths sum to 32 bits.
uint32_t samplerIx : 12;
3040
};
31-
struct OutImgAccessor
41+
// 1D specialization: samples `inAs1DArray[descIx]` with `inSamp[samplerIx]` at the
// 2-component coordinate (uv, layer) and the explicit mip `level`.
template<>
42+
float32_t4 InImgAccessor::__get_impl<1>(const float32_t1 uv, float layer, float level)
3243
{
33-
void set(int32_t3 c, uint32_t l, float32_t4 v)
34-
{
35-
outImg[blit::impl::dim_to_image_properties<ConstevalParameters::BlitDimCount>::getIndexCoord<int32_t>(c, l)] = v;
36-
}
37-
};
38-
*/
44+
return inAs1DArray[descIx].SampleLevel(inSamp[samplerIx],float32_t2(uv,layer),level);
45+
}
46+
// 2D specialization: samples `inAs2DArray[descIx]` with `inSamp[samplerIx]` at the
// 3-component coordinate (uv.x, uv.y, layer) and the explicit mip `level`.
template<>
47+
float32_t4 InImgAccessor::__get_impl<2>(const float32_t2 uv, float layer, float level)
48+
{
49+
return inAs2DArray[descIx].SampleLevel(inSamp[samplerIx],float32_t3(uv,layer),level);
50+
}
51+
// 3D specialization: samples `inAs3D[descIx]` with `inSamp[samplerIx]` at `uv` and the
// explicit mip `level`. The `layer` argument is not used by this overload.
template<>
52+
float32_t4 InImgAccessor::__get_impl<3>(const float32_t3 uv, float layer, float level)
53+
{
54+
return inAs3D[descIx].SampleLevel(inSamp[samplerIx],uv,level);
55+
}
3956

4057
using namespace nbl::hlsl::blit;
4158

@@ -44,6 +61,26 @@ using namespace nbl::hlsl::blit;
4461
[numthreads(ConstevalParameters::WorkGroupSize,1,1)]
4562
void main()
4663
{
64+
InImgAccessor inImgA;
65+
66+
OutImgAccessor outImgA;
67+
outImgA.descIx = pc.outputDescIx;
68+
69+
const uint16_t3 wgID = _static_cast<uint16_t3>(glsl::gl_WorkGroupID());
70+
const uint16_t3 baseCoord = pc.perWG.getOutputBaseCoord(wgID);
71+
// TODO: If and when someone can be bothered, change the blit api to compile a pipeline per image dimension, maybe it will be faster
72+
switch (pc.perWG.imageDim)
73+
{
74+
case 1:
75+
outImgA.set(uint16_t1(baseCoord.x),wgID.z,float32_t4(1,0,1,1));
76+
break;
77+
case 2:
78+
outImgA.set(baseCoord.xy,wgID.z,float32_t4(1,0,1,1));
79+
break;
80+
case 3:
81+
outImgA.set(baseCoord,0xdeadu,float32_t4(1,0,1,1));
82+
break;
83+
}
4784
/*
4885
blit::compute_blit_t<ConstevalParameters> blit = blit::compute_blit_t<ConstevalParameters>::create(params);
4986
InCSAccessor inCSA;

include/nbl/builtin/hlsl/blit/parameters.hlsl

Lines changed: 79 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ namespace blit
1414

1515
struct parameters_t
1616
{
17-
float32_t3 fScale;
17+
float32_t3 fScale; //
1818
float32_t3 negativeSupport;
1919
float32_t referenceAlpha;
2020
uint32_t kernelWeightsOffsetY;
@@ -24,17 +24,15 @@ struct parameters_t
2424

2525
uint16_t3 inputDims;
2626
uint16_t3 outputDims;
27-
uint16_t3 windowDims;
27+
uint16_t3 windowDims; //
2828
uint16_t3 phaseCount;
29-
uint16_t3 preloadRegion;
29+
uint16_t3 preloadRegion; //
3030
uint16_t3 iterationRegionXPrefixProducts;
3131
uint16_t3 iterationRegionYPrefixProducts;
3232
uint16_t3 iterationRegionZPrefixProducts;
3333

34-
//! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
35-
//! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
36-
uint16_t secondScratchOffset;
37-
uint16_t outputTexelsPerWGZ;
34+
uint16_t secondScratchOffset; //
35+
uint16_t outputTexelsPerWGZ; //
3836

3937
uint32_t3 getOutputTexelsPerWG()
4038
{
@@ -44,36 +42,88 @@ struct parameters_t
4442
}
4543
};
4644

47-
struct parameters2_t
45+
// We do some dumb things with bitfields here like not using `vector<uint16_t,N>`, because AMD doesn't support them in push constants
46+
struct SPerWorkgroup
4847
{
49-
float32_t3 fScale;
50-
float32_t3 negativeSupportMinusHalf;
51-
float32_t referenceAlpha;
52-
uint32_t kernelWeightsOffsetY;
53-
uint32_t kernelWeightsOffsetZ;
54-
uint32_t inPixelCount;
55-
uint32_t outPixelCount;
48+
// Factory that fills every non-padding field of SPerWorkgroup from its arguments:
//  - `_scale`              -> `scale`
//  - `_imageDim`           -> `imageDim` (stored in a 2-bit field)
//  - `output[0..2]`        -> `outputWidth`, `outputHeight`, `outputDepth`
//  - `preload[0..2]`       -> `preloadWidth`, `preloadHeight`, `preloadDepth`
//  - `_otherPreloadOffset` -> `otherPreloadOffset`
// `unused0` is not written here.
static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t _imageDim, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset)
49+
{
50+
SPerWorkgroup retval;
51+
retval.scale = _scale;
52+
retval.imageDim = _imageDim;
53+
retval.preloadWidth = preload[0];
54+
retval.preloadHeight = preload[1];
55+
retval.preloadDepth = preload[2];
56+
retval.outputWidth = output[0];
57+
retval.outputHeight = output[1];
58+
retval.outputDepth = output[2];
59+
retval.otherPreloadOffset = _otherPreloadOffset;
60+
return retval;
61+
}
5662

57-
uint16_t3 inputDims;
58-
uint16_t3 outputDims;
59-
uint16_t3 windowDims;
60-
uint16_t3 phaseCount;
61-
uint16_t3 preloadRegion;
62-
uint16_t3 iterationRegionXPrefixProducts;
63-
uint16_t3 iterationRegionYPrefixProducts;
64-
uint16_t3 iterationRegionZPrefixProducts;
63+
// Returns the component-wise product of `workgroup` and
// (outputWidth, outputHeight, outputDepth).
// Passing (1,1,1) therefore yields the per-workgroup output size itself.
inline uint16_t3 getOutputBaseCoord(const uint16_t3 workgroup) NBL_CONST_MEMBER_FUNC
64+
{
65+
return workgroup*uint16_t3(outputWidth,outputHeight,outputDepth);
66+
}
67+
68+
// Returns, per component, 1 + (outExtent - 1) / perWorkgroupOutputSize, i.e. a rounded-up
// division of `outExtent` by (outputWidth, outputHeight, outputDepth).
// When `layersToBlit` is non-zero, the Z component of the result is replaced by it.
// NOTE(review): a zero component in `outExtent` would wrap in `outExtent-unit`, and a zero
// per-workgroup output size would divide by zero - presumably callers guarantee both are >= 1; confirm.
inline uint16_t3 getWorkgroupCount(const uint16_t3 outExtent, const uint16_t layersToBlit=0) NBL_CONST_MEMBER_FUNC
69+
{
70+
const uint16_t3 unit = uint16_t3(1,1,1);
71+
uint16_t3 retval = unit;
72+
// getOutputBaseCoord(unit) is the per-workgroup output size
retval += (outExtent-unit)/getOutputBaseCoord(unit);
73+
if (layersToBlit)
74+
retval[2] = layersToBlit;
75+
return retval;
76+
}
6577

78+
#ifndef __HLSL_VERSION
79+
// Validity check, only compiled when `__HLSL_VERSION` is not defined (see the surrounding #ifndef).
// True only when all three output dimensions and all three preload dimensions are non-zero.
explicit inline operator bool() const
80+
{
81+
return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadDepth;
82+
}
83+
#endif
84+
85+
// ratio of input pixels to output
86+
float32_t3 scale;
87+
// whether it's an image1D, image2D or image3D
88+
uint32_t imageDim : 2;
89+
uint32_t unused0 : 14; // channel, iterationRegionPrefixSums ?
90+
// 16bit in each dimension because some GPUs actually have enough shared memory for 32k pixels
91+
uint32_t outputWidth : 16;
92+
uint32_t outputHeight : 16;
93+
uint32_t outputDepth : 16;
94+
uint32_t preloadWidth : 16;
95+
uint32_t preloadHeight : 16;
96+
uint32_t preloadDepth : 16;
6697
//! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
6798
//! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
68-
uint16_t secondScratchOffset;
69-
uint16_t outputTexelsPerWGZ;
99+
uint32_t otherPreloadOffset : 16;
100+
};
70101

71-
uint32_t3 getOutputTexelsPerWG()
102+
// Blit parameters; common.hlsl declares an instance of this struct as the push constant block `pc`.
// The bitfields below are laid out in three groups whose widths each sum to 32 bits.
struct Parameters
103+
{
104+
#ifndef __HLSL_VERSION
105+
// Non-HLSL only: the parameters are considered valid exactly when `perWG` is valid.
explicit inline operator bool() const
72106
{
73-
//! `outputTexelsPerWG.xy` just happens to be in the first components of `iterationRegionsXPrefixProducts` and `iterationRegionYPrefixProducts` --this is
74-
//! the result of how we choose to iterate, i.e. if, in the future, we decide to iterate differently, this needs to change.
75-
return uint32_t3(iterationRegionXPrefixProducts.x, iterationRegionYPrefixProducts.x, outputTexelsPerWGZ);
107+
return bool(perWG);
76108
}
109+
#endif
110+
111+
SPerWorkgroup perWG; // rename to perBlitWG?
112+
//! general settings
113+
// first group: 19 + 11 + 2 = 32 bits
// NOTE(review): InImgAccessor stores its image/sampler indices in 20/12-bit fields, while these are 19/11 bits - confirm the widths are meant to differ.
uint32_t inputDescIx : 19;
114+
uint32_t samplerDescIx : 11;
115+
uint32_t unused0 : 2;
116+
// second group: 19 + 3 + 10 = 32 bits
117+
uint32_t outputDescIx : 19;
118+
uint32_t channelCount : 3;
119+
uint32_t unused1 : 10;
120+
// third group: 12 + 19 + 1 = 32 bits
121+
uint32_t unused2 : 12;
122+
//! coverage settings
123+
uint32_t intermAlphaDescIx : 19;
124+
uint32_t coverage : 1;
125+
// required to compare the atomic count of passing pixels against, so we can get original coverage
126+
uint32_t inPixelCount;
77127
};
78128

79129

include/nbl/builtin/hlsl/concepts.hlsl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ namespace hlsl
1717
namespace concepts
1818
{
1919
// common implementation juice
20+
#define NBL_CONCEPT_MAX_PARAM_COUNT 32
2021
#include <boost/preprocessor/seq/elem.hpp>
2122
#define NBL_IMPL_CONCEPT_FULL_TPLT(z, n, unused) BOOST_PP_SEQ_ELEM(n,NBL_CONCEPT_TPLT_PRM_KINDS) BOOST_PP_SEQ_ELEM(n,NBL_CONCEPT_TPLT_PRM_NAMES)
2223
#include <boost/preprocessor/repetition/enum.hpp>
@@ -34,7 +35,7 @@ namespace concepts
3435

3536

3637
//! Now diverge
37-
#ifndef __cpp_concepts
38+
#ifdef __cpp_concepts
3839

3940

4041
// to define a concept using `concept Name = SomeConstexprBoolCondition<T>;`
@@ -138,10 +139,10 @@ concept matricial = is_matrix<T>::value;
138139
// put just after the closing `>` on the partial template specialization `template` declaration e.g. `template<typename U, typename V, typename T> NBL_PARTIAL_REQ_TOP(SomeCond<U>)`
139140
#define NBL_PARTIAL_REQ_TOP(...)
140141
// put just before closing `>` on the partial template specialization Type args, e.g. `MyStruct<U,V,T NBL_PARTIAL_REQ_BOT(SomeCond<U>)>`
141-
#define NBL_PARTIAL_REQ_BOT(...) ,std::enable_if_t<(__VA_ARGS__),void>
142+
#define NBL_PARTIAL_REQ_BOT(...) ,::nbl::hlsl::enable_if_t<(__VA_ARGS__),void>
142143

143144
// condition, use instead of the closing `>` of a function template
144-
#define NBL_FUNC_REQUIRES(...) ,std::enable_if_t<(__VA_ARGS__),bool> = true>
145+
#define NBL_FUNC_REQUIRES(...) ,::nbl::hlsl::enable_if_t<(__VA_ARGS__),bool> = true>
145146

146147

147148
//
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
2+
// This file is part of the "Nabla Engine".
3+
// For conditions of distribution and use, see copyright notice in nabla.h
4+
#undef NBL_CONCEPT_PARAM_31
5+
#undef NBL_CONCEPT_PARAM_30
6+
#undef NBL_CONCEPT_PARAM_29
7+
#undef NBL_CONCEPT_PARAM_28
8+
#undef NBL_CONCEPT_PARAM_27
9+
#undef NBL_CONCEPT_PARAM_26
10+
#undef NBL_CONCEPT_PARAM_25
11+
#undef NBL_CONCEPT_PARAM_24
12+
#undef NBL_CONCEPT_PARAM_23
13+
#undef NBL_CONCEPT_PARAM_22
14+
#undef NBL_CONCEPT_PARAM_21
15+
#undef NBL_CONCEPT_PARAM_20
16+
#undef NBL_CONCEPT_PARAM_19
17+
#undef NBL_CONCEPT_PARAM_18
18+
#undef NBL_CONCEPT_PARAM_17
19+
#undef NBL_CONCEPT_PARAM_16
20+
#undef NBL_CONCEPT_PARAM_15
21+
#undef NBL_CONCEPT_PARAM_14
22+
#undef NBL_CONCEPT_PARAM_13
23+
#undef NBL_CONCEPT_PARAM_12
24+
#undef NBL_CONCEPT_PARAM_11
25+
#undef NBL_CONCEPT_PARAM_10
26+
#undef NBL_CONCEPT_PARAM_9
27+
#undef NBL_CONCEPT_PARAM_8
28+
#undef NBL_CONCEPT_PARAM_7
29+
#undef NBL_CONCEPT_PARAM_6
30+
#undef NBL_CONCEPT_PARAM_5
31+
#undef NBL_CONCEPT_PARAM_4
32+
#undef NBL_CONCEPT_PARAM_3
33+
#undef NBL_CONCEPT_PARAM_2
34+
#undef NBL_CONCEPT_PARAM_1
35+
#undef NBL_CONCEPT_PARAM_0
36+
#undef NBL_CONCEPT_TPLT_PRM_NAMES
37+
#undef NBL_CONCEPT_TPLT_PRM_KINDS
38+
#undef NBL_CONCEPT_NAME

0 commit comments

Comments
 (0)